From 5244aafc2d7945c11c96398b8d5b752b45fd148c Mon Sep 17 00:00:00 2001 From: Xianjin YE Date: Tue, 2 Jan 2018 23:30:38 +0800 Subject: [PATCH 001/566] [SPARK-22897][CORE] Expose stageAttemptId in TaskContext ## What changes were proposed in this pull request? stageAttemptId added in TaskContext and corresponding construction modification ## How was this patch tested? Added a new test in TaskContextSuite, two cases are tested: 1. Normal case without failure 2. Exception case with resubmitted stages Link to [SPARK-22897](https://issues.apache.org/jira/browse/SPARK-22897) Author: Xianjin YE Closes #20082 from advancedxy/SPARK-22897. (cherry picked from commit a6fc300e91273230e7134ac6db95ccb4436c6f8f) Signed-off-by: Wenchen Fan --- .../scala/org/apache/spark/TaskContext.scala | 9 +++++- .../org/apache/spark/TaskContextImpl.scala | 5 ++-- .../org/apache/spark/scheduler/Task.scala | 1 + .../spark/JavaTaskContextCompileCheck.java | 2 ++ .../scala/org/apache/spark/ShuffleSuite.scala | 6 ++-- .../spark/memory/MemoryTestingUtils.scala | 1 + .../spark/scheduler/TaskContextSuite.scala | 29 +++++++++++++++++-- .../spark/storage/BlockInfoManagerSuite.scala | 2 +- project/MimaExcludes.scala | 3 ++ .../UnsafeFixedWidthAggregationMapSuite.scala | 1 + .../UnsafeKVExternalSorterSuite.scala | 1 + .../execution/UnsafeRowSerializerSuite.scala | 2 +- .../SortBasedAggregationStoreSuite.scala | 3 +- 13 files changed, 54 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/TaskContext.scala b/core/src/main/scala/org/apache/spark/TaskContext.scala index 0b87cd503d4fa..69739745aa6cf 100644 --- a/core/src/main/scala/org/apache/spark/TaskContext.scala +++ b/core/src/main/scala/org/apache/spark/TaskContext.scala @@ -66,7 +66,7 @@ object TaskContext { * An empty task context that does not represent an actual task. This is only used in tests. */ private[spark] def empty(): TaskContextImpl = { - new TaskContextImpl(0, 0, 0, 0, null, new Properties, null) + new TaskContextImpl(0, 0, 0, 0, 0, null, new Properties, null) } } @@ -150,6 +150,13 @@ abstract class TaskContext extends Serializable { */ def stageId(): Int + /** + * How many times the stage that this task belongs to has been attempted. The first stage attempt + * will be assigned stageAttemptNumber = 0, and subsequent attempts will have increasing attempt + * numbers. + */ + def stageAttemptNumber(): Int + /** * The ID of the RDD partition that is computed by this task. */ diff --git a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala index 01d8973e1bb06..cccd3ea457ba4 100644 --- a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala +++ b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala @@ -41,8 +41,9 @@ import org.apache.spark.util._ * `TaskMetrics` & `MetricsSystem` objects are not thread safe. 
*/ private[spark] class TaskContextImpl( - val stageId: Int, - val partitionId: Int, + override val stageId: Int, + override val stageAttemptNumber: Int, + override val partitionId: Int, override val taskAttemptId: Long, override val attemptNumber: Int, override val taskMemoryManager: TaskMemoryManager, diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala index 7767ef1803a06..f536fc2a5f0a1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala @@ -79,6 +79,7 @@ private[spark] abstract class Task[T]( SparkEnv.get.blockManager.registerTask(taskAttemptId) context = new TaskContextImpl( stageId, + stageAttemptId, // stageAttemptId and stageAttemptNumber are semantically equal partitionId, taskAttemptId, attemptNumber, diff --git a/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java b/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java index 94f5805853e1e..f8e233a05a447 100644 --- a/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java +++ b/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java @@ -38,6 +38,7 @@ public static void test() { tc.attemptNumber(); tc.partitionId(); tc.stageId(); + tc.stageAttemptNumber(); tc.taskAttemptId(); } @@ -51,6 +52,7 @@ public void onTaskCompletion(TaskContext context) { context.isCompleted(); context.isInterrupted(); context.stageId(); + context.stageAttemptNumber(); context.partitionId(); context.addTaskCompletionListener(this); } diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala index 3931d53b4ae0a..ced5a06516f75 100644 --- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala +++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala @@ -363,14 +363,14 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC // first attempt -- its successful val writer1 = manager.getWriter[Int, Int](shuffleHandle, 0, - new TaskContextImpl(0, 0, 0L, 0, taskMemoryManager, new Properties, metricsSystem)) + new TaskContextImpl(0, 0, 0, 0L, 0, taskMemoryManager, new Properties, metricsSystem)) val data1 = (1 to 10).map { x => x -> x} // second attempt -- also successful. We'll write out different data, // just to simulate the fact that the records may get written differently // depending on what gets spilled, what gets combined, etc. 
val writer2 = manager.getWriter[Int, Int](shuffleHandle, 0, - new TaskContextImpl(0, 0, 1L, 0, taskMemoryManager, new Properties, metricsSystem)) + new TaskContextImpl(0, 0, 0, 1L, 0, taskMemoryManager, new Properties, metricsSystem)) val data2 = (11 to 20).map { x => x -> x} // interleave writes of both attempts -- we want to test that both attempts can occur @@ -398,7 +398,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC } val reader = manager.getReader[Int, Int](shuffleHandle, 0, 1, - new TaskContextImpl(1, 0, 2L, 0, taskMemoryManager, new Properties, metricsSystem)) + new TaskContextImpl(1, 0, 0, 2L, 0, taskMemoryManager, new Properties, metricsSystem)) val readData = reader.read().toIndexedSeq assert(readData === data1.toIndexedSeq || readData === data2.toIndexedSeq) diff --git a/core/src/test/scala/org/apache/spark/memory/MemoryTestingUtils.scala b/core/src/test/scala/org/apache/spark/memory/MemoryTestingUtils.scala index 362cd861cc248..dcf89e4f75acf 100644 --- a/core/src/test/scala/org/apache/spark/memory/MemoryTestingUtils.scala +++ b/core/src/test/scala/org/apache/spark/memory/MemoryTestingUtils.scala @@ -29,6 +29,7 @@ object MemoryTestingUtils { val taskMemoryManager = new TaskMemoryManager(env.memoryManager, 0) new TaskContextImpl( stageId = 0, + stageAttemptNumber = 0, partitionId = 0, taskAttemptId = 0, attemptNumber = 0, diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala index a1d9085fa085d..aa9c36c0aaacb 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala @@ -29,6 +29,7 @@ import org.apache.spark.memory.TaskMemoryManager import org.apache.spark.metrics.source.JvmSource import org.apache.spark.network.util.JavaUtils import org.apache.spark.rdd.RDD +import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.util._ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSparkContext { @@ -158,6 +159,30 @@ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSpark assert(attemptIdsWithFailedTask.toSet === Set(0, 1)) } + test("TaskContext.stageAttemptNumber getter") { + sc = new SparkContext("local[1,2]", "test") + + // Check stageAttemptNumbers are 0 for initial stage + val stageAttemptNumbers = sc.parallelize(Seq(1, 2), 2).mapPartitions { _ => + Seq(TaskContext.get().stageAttemptNumber()).iterator + }.collect() + assert(stageAttemptNumbers.toSet === Set(0)) + + // Check stageAttemptNumbers that are resubmitted when tasks have FetchFailedException + val stageAttemptNumbersWithFailedStage = + sc.parallelize(Seq(1, 2, 3, 4), 4).repartition(1).mapPartitions { _ => + val stageAttemptNumber = TaskContext.get().stageAttemptNumber() + if (stageAttemptNumber < 2) { + // Throw FetchFailedException to explicitly trigger stage resubmission. A normal exception + // will only trigger task resubmission in the same stage. + throw new FetchFailedException(null, 0, 0, 0, "Fake") + } + Seq(stageAttemptNumber).iterator + }.collect() + + assert(stageAttemptNumbersWithFailedStage.toSet === Set(2)) + } + test("accumulators are updated on exception failures") { // This means use 1 core and 4 max task failures sc = new SparkContext("local[1,4]", "test") @@ -190,7 +215,7 @@ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSpark // accumulator updates from it. 
val taskMetrics = TaskMetrics.empty val task = new Task[Int](0, 0, 0) { - context = new TaskContextImpl(0, 0, 0L, 0, + context = new TaskContextImpl(0, 0, 0, 0L, 0, new TaskMemoryManager(SparkEnv.get.memoryManager, 0L), new Properties, SparkEnv.get.metricsSystem, @@ -213,7 +238,7 @@ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSpark // accumulator updates from it. val taskMetrics = TaskMetrics.registered val task = new Task[Int](0, 0, 0) { - context = new TaskContextImpl(0, 0, 0L, 0, + context = new TaskContextImpl(0, 0, 0, 0L, 0, new TaskMemoryManager(SparkEnv.get.memoryManager, 0L), new Properties, SparkEnv.get.metricsSystem, diff --git a/core/src/test/scala/org/apache/spark/storage/BlockInfoManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockInfoManagerSuite.scala index 917db766f7f11..9c0699bc981f8 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockInfoManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockInfoManagerSuite.scala @@ -62,7 +62,7 @@ class BlockInfoManagerSuite extends SparkFunSuite with BeforeAndAfterEach { private def withTaskId[T](taskAttemptId: Long)(block: => T): T = { try { TaskContext.setTaskContext( - new TaskContextImpl(0, 0, taskAttemptId, 0, null, new Properties, null)) + new TaskContextImpl(0, 0, 0, taskAttemptId, 0, null, new Properties, null)) block } finally { TaskContext.unset() diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 81584af6813ea..3b452f35c5ec1 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -36,6 +36,9 @@ object MimaExcludes { // Exclude rules for 2.3.x lazy val v23excludes = v22excludes ++ Seq( + // [SPARK-22897] Expose stageAttemptId in TaskContext + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.TaskContext.stageAttemptNumber"), + // SPARK-22789: Map-only continuous processing execution ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.sql.streaming.StreamingQueryManager.startQuery$default$8"), ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryManager.startQuery$default$6"), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala index 232c1beae7998..3e31d22e15c0e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala @@ -70,6 +70,7 @@ class UnsafeFixedWidthAggregationMapSuite TaskContext.setTaskContext(new TaskContextImpl( stageId = 0, + stageAttemptNumber = 0, partitionId = 0, taskAttemptId = Random.nextInt(10000), attemptNumber = 0, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala index 604502f2a57d0..6af9f8b77f8d3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala @@ -116,6 +116,7 @@ class UnsafeKVExternalSorterSuite extends SparkFunSuite with SharedSQLContext { val taskMemMgr = new TaskMemoryManager(memoryManager, 0) TaskContext.setTaskContext(new TaskContextImpl( stageId = 0, + stageAttemptNumber = 0, partitionId = 0, 
taskAttemptId = 98456, attemptNumber = 0, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala index dff88ce7f1b9a..a3ae93810aa3c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala @@ -114,7 +114,7 @@ class UnsafeRowSerializerSuite extends SparkFunSuite with LocalSparkContext { (i, converter(Row(i))) } val taskMemoryManager = new TaskMemoryManager(sc.env.memoryManager, 0) - val taskContext = new TaskContextImpl(0, 0, 0, 0, taskMemoryManager, new Properties, null) + val taskContext = new TaskContextImpl(0, 0, 0, 0, 0, taskMemoryManager, new Properties, null) val sorter = new ExternalSorter[Int, UnsafeRow, UnsafeRow]( taskContext, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala index 10f1ee279bedf..3fad7dfddadcc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala @@ -35,7 +35,8 @@ class SortBasedAggregationStoreSuite extends SparkFunSuite with LocalSparkConte val conf = new SparkConf() sc = new SparkContext("local[2, 4]", "test", conf) val taskManager = new TaskMemoryManager(new TestMemoryManager(conf), 0) - TaskContext.setTaskContext(new TaskContextImpl(0, 0, 0, 0, taskManager, new Properties, null)) + TaskContext.setTaskContext( + new TaskContextImpl(0, 0, 0, 0, 0, taskManager, new Properties, null)) } override def afterAll(): Unit = TaskContext.unset() From b96a2132413937c013e1099be3ec4bc420c947fd Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Wed, 3 Jan 2018 21:40:51 +0800 Subject: [PATCH 002/566] [SPARK-22938] Assert that SQLConf.get is accessed only on the driver. ## What changes were proposed in this pull request? Assert if code tries to access SQLConf.get on executor. This can lead to hard to detect bugs, where the executor will read fallbackConf, falling back to default config values, ignoring potentially changed non-default configs. If a config is to be passed to executor code, it needs to be read on the driver, and passed explicitly. ## How was this patch tested? Check in existing tests. Author: Juliusz Sompolski Closes #20136 from juliuszsompolski/SPARK-22938. 
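
As an illustrative sketch only (not part of this patch), the driver-read / closure-capture pattern that the new assertion encourages might look like the following. The object name `DriverSideConfExample` and the choice of `spark.sql.caseSensitive` as the config key are arbitrary assumptions for the example:

```scala
// Minimal sketch of reading a SQL conf on the driver and passing it to executors,
// instead of calling SQLConf.get inside executor-side code.
import org.apache.spark.sql.SparkSession

object DriverSideConfExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("conf-capture").getOrCreate()

    // Driver side: resolve the session conf value once, here.
    val caseSensitive = spark.conf.get("spark.sql.caseSensitive", "false").toBoolean

    // Executor side: use the value captured in the closure. Calling SQLConf.get in
    // this map function would read the fallback conf (default values), which this
    // patch turns into an assertion failure when running in test mode.
    val normalized = spark.sparkContext
      .parallelize(Seq("Spark", "spark"))
      .map(s => if (caseSensitive) s else s.toLowerCase)
      .collect()

    normalized.foreach(println)
    spark.stop()
  }
}
```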
(cherry picked from commit 247a08939d58405aef39b2a4e7773aa45474ad12) Signed-off-by: Wenchen Fan --- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 4f77c54a7af57..80cdc61484c0f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -27,11 +27,13 @@ import scala.util.matching.Regex import org.apache.hadoop.fs.Path +import org.apache.spark.{SparkContext, SparkEnv} import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.network.util.ByteUnit import org.apache.spark.sql.catalyst.analysis.Resolver import org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator +import org.apache.spark.util.Utils //////////////////////////////////////////////////////////////////////////////////////////////////// // This file defines the configuration options for Spark SQL. @@ -70,7 +72,7 @@ object SQLConf { * Default config. Only used when there is no active SparkSession for the thread. * See [[get]] for more information. */ - private val fallbackConf = new ThreadLocal[SQLConf] { + private lazy val fallbackConf = new ThreadLocal[SQLConf] { override def initialValue: SQLConf = new SQLConf } @@ -1087,6 +1089,12 @@ object SQLConf { class SQLConf extends Serializable with Logging { import SQLConf._ + if (Utils.isTesting && SparkEnv.get != null) { + // assert that we're only accessing it on the driver. + assert(SparkEnv.get.executorId == SparkContext.DRIVER_IDENTIFIER, + "SQLConf should only be created and accessed on the driver.") + } + /** Only low degree of contention is expected for conf, thus NOT using ConcurrentHashMap. */ @transient protected[spark] val settings = java.util.Collections.synchronizedMap( new java.util.HashMap[String, String]()) From a05e85ecb76091567a26a3a14ad0879b4728addc Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 3 Jan 2018 22:09:30 +0800 Subject: [PATCH 003/566] [SPARK-22934][SQL] Make optional clauses order insensitive for CREATE TABLE SQL statement ## What changes were proposed in this pull request? Currently, our CREATE TABLE syntax require the EXACT order of clauses. It is pretty hard to remember the exact order. Thus, this PR is to make optional clauses order insensitive for `CREATE TABLE` SQL statement. ``` CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db_name.]table_name [(col_name1 col_type1 [COMMENT col_comment1], ...)] USING datasource [OPTIONS (key1=val1, key2=val2, ...)] [PARTITIONED BY (col_name1, col_name2, ...)] [CLUSTERED BY (col_name3, col_name4, ...) INTO num_buckets BUCKETS] [LOCATION path] [COMMENT table_comment] [TBLPROPERTIES (key1=val1, key2=val2, ...)] [AS select_statement] ``` The proposal is to make the following clauses order insensitive. ``` [OPTIONS (key1=val1, key2=val2, ...)] [PARTITIONED BY (col_name1, col_name2, ...)] [CLUSTERED BY (col_name3, col_name4, ...) INTO num_buckets BUCKETS] [LOCATION path] [COMMENT table_comment] [TBLPROPERTIES (key1=val1, key2=val2, ...)] ``` The same idea is also applicable to Create Hive Table. 
``` CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name [(col_name1[:] col_type1 [COMMENT col_comment1], ...)] [COMMENT table_comment] [PARTITIONED BY (col_name2[:] col_type2 [COMMENT col_comment2], ...)] [ROW FORMAT row_format] [STORED AS file_format] [LOCATION path] [TBLPROPERTIES (key1=val1, key2=val2, ...)] [AS select_statement] ``` The proposal is to make the following clauses order insensitive. ``` [COMMENT table_comment] [PARTITIONED BY (col_name2[:] col_type2 [COMMENT col_comment2], ...)] [ROW FORMAT row_format] [STORED AS file_format] [LOCATION path] [TBLPROPERTIES (key1=val1, key2=val2, ...)] ``` ## How was this patch tested? Added test cases Author: gatorsmile Closes #20133 from gatorsmile/createDataSourceTableDDL. (cherry picked from commit 1a87a1609c4d2c9027a2cf669ea3337b89f61fb6) Signed-off-by: gatorsmile --- .../spark/sql/catalyst/parser/SqlBase.g4 | 24 +- .../sql/catalyst/parser/ParserUtils.scala | 9 + .../spark/sql/execution/SparkSqlParser.scala | 81 +++++-- .../execution/command/DDLParserSuite.scala | 220 ++++++++++++++---- .../sql/execution/command/DDLSuite.scala | 2 +- .../sql/hive/execution/HiveDDLSuite.scala | 13 +- .../sql/hive/execution/SQLQuerySuite.scala | 124 +++++----- 7 files changed, 335 insertions(+), 138 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 6fe995f650d55..6daf01d98426c 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -73,18 +73,22 @@ statement | ALTER DATABASE identifier SET DBPROPERTIES tablePropertyList #setDatabaseProperties | DROP DATABASE (IF EXISTS)? identifier (RESTRICT | CASCADE)? #dropDatabase | createTableHeader ('(' colTypeList ')')? tableProvider - (OPTIONS options=tablePropertyList)? - (PARTITIONED BY partitionColumnNames=identifierList)? - bucketSpec? locationSpec? - (COMMENT comment=STRING)? - (TBLPROPERTIES tableProps=tablePropertyList)? + ((OPTIONS options=tablePropertyList) | + (PARTITIONED BY partitionColumnNames=identifierList) | + bucketSpec | + locationSpec | + (COMMENT comment=STRING) | + (TBLPROPERTIES tableProps=tablePropertyList))* (AS? query)? #createTable | createTableHeader ('(' columns=colTypeList ')')? - (COMMENT comment=STRING)? - (PARTITIONED BY '(' partitionColumns=colTypeList ')')? - bucketSpec? skewSpec? - rowFormat? createFileFormat? locationSpec? - (TBLPROPERTIES tablePropertyList)? + ((COMMENT comment=STRING) | + (PARTITIONED BY '(' partitionColumns=colTypeList ')') | + bucketSpec | + skewSpec | + rowFormat | + createFileFormat | + locationSpec | + (TBLPROPERTIES tableProps=tablePropertyList))* (AS? query)? #createHiveTable | CREATE TABLE (IF NOT EXISTS)? target=tableIdentifier LIKE source=tableIdentifier locationSpec? 
#createTableLike diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala index 9b127f91648e6..89347f4b1f7bf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala @@ -16,6 +16,8 @@ */ package org.apache.spark.sql.catalyst.parser +import java.util + import scala.collection.mutable.StringBuilder import org.antlr.v4.runtime.{ParserRuleContext, Token} @@ -39,6 +41,13 @@ object ParserUtils { throw new ParseException(s"Operation not allowed: $message", ctx) } + def checkDuplicateClauses[T]( + nodes: util.List[T], clauseName: String, ctx: ParserRuleContext): Unit = { + if (nodes.size() > 1) { + throw new ParseException(s"Found duplicate clauses: $clauseName", ctx) + } + } + /** Check if duplicate keys exist in a set of key-value pairs. */ def checkDuplicateKeys[T](keyPairs: Seq[(String, T)], ctx: ParserRuleContext): Unit = { keyPairs.groupBy(_._1).filter(_._2.size > 1).foreach { case (key, _) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 29b584b55972c..d3cfd2a1ffbf2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -383,16 +383,19 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { * {{{ * CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db_name.]table_name * USING table_provider - * [OPTIONS table_property_list] - * [PARTITIONED BY (col_name, col_name, ...)] - * [CLUSTERED BY (col_name, col_name, ...) - * [SORTED BY (col_name [ASC|DESC], ...)] - * INTO num_buckets BUCKETS - * ] - * [LOCATION path] - * [COMMENT table_comment] - * [TBLPROPERTIES (property_name=property_value, ...)] + * create_table_clauses * [[AS] select_statement]; + * + * create_table_clauses (order insensitive): + * [OPTIONS table_property_list] + * [PARTITIONED BY (col_name, col_name, ...)] + * [CLUSTERED BY (col_name, col_name, ...) + * [SORTED BY (col_name [ASC|DESC], ...)] + * INTO num_buckets BUCKETS + * ] + * [LOCATION path] + * [COMMENT table_comment] + * [TBLPROPERTIES (property_name=property_value, ...)] * }}} */ override def visitCreateTable(ctx: CreateTableContext): LogicalPlan = withOrigin(ctx) { @@ -400,6 +403,14 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { if (external) { operationNotAllowed("CREATE EXTERNAL TABLE ... 
USING", ctx) } + + checkDuplicateClauses(ctx.TBLPROPERTIES, "TBLPROPERTIES", ctx) + checkDuplicateClauses(ctx.OPTIONS, "OPTIONS", ctx) + checkDuplicateClauses(ctx.PARTITIONED, "PARTITIONED BY", ctx) + checkDuplicateClauses(ctx.COMMENT, "COMMENT", ctx) + checkDuplicateClauses(ctx.bucketSpec(), "CLUSTERED BY", ctx) + checkDuplicateClauses(ctx.locationSpec, "LOCATION", ctx) + val options = Option(ctx.options).map(visitPropertyKeyValues).getOrElse(Map.empty) val provider = ctx.tableProvider.qualifiedName.getText val schema = Option(ctx.colTypeList()).map(createSchema) @@ -408,9 +419,9 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { .map(visitIdentifierList(_).toArray) .getOrElse(Array.empty[String]) val properties = Option(ctx.tableProps).map(visitPropertyKeyValues).getOrElse(Map.empty) - val bucketSpec = Option(ctx.bucketSpec()).map(visitBucketSpec) + val bucketSpec = ctx.bucketSpec().asScala.headOption.map(visitBucketSpec) - val location = Option(ctx.locationSpec).map(visitLocationSpec) + val location = ctx.locationSpec.asScala.headOption.map(visitLocationSpec) val storage = DataSource.buildStorageFormatFromOptions(options) if (location.isDefined && storage.locationUri.isDefined) { @@ -1087,13 +1098,16 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { * {{{ * CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name * [(col1[:] data_type [COMMENT col_comment], ...)] - * [COMMENT table_comment] - * [PARTITIONED BY (col2[:] data_type [COMMENT col_comment], ...)] - * [ROW FORMAT row_format] - * [STORED AS file_format] - * [LOCATION path] - * [TBLPROPERTIES (property_name=property_value, ...)] + * create_table_clauses * [AS select_statement]; + * + * create_table_clauses (order insensitive): + * [COMMENT table_comment] + * [PARTITIONED BY (col2[:] data_type [COMMENT col_comment], ...)] + * [ROW FORMAT row_format] + * [STORED AS file_format] + * [LOCATION path] + * [TBLPROPERTIES (property_name=property_value, ...)] * }}} */ override def visitCreateHiveTable(ctx: CreateHiveTableContext): LogicalPlan = withOrigin(ctx) { @@ -1104,15 +1118,23 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { "CREATE TEMPORARY TABLE is not supported yet. " + "Please use CREATE TEMPORARY VIEW as an alternative.", ctx) } - if (ctx.skewSpec != null) { + if (ctx.skewSpec.size > 0) { operationNotAllowed("CREATE TABLE ... 
SKEWED BY", ctx) } + checkDuplicateClauses(ctx.TBLPROPERTIES, "TBLPROPERTIES", ctx) + checkDuplicateClauses(ctx.PARTITIONED, "PARTITIONED BY", ctx) + checkDuplicateClauses(ctx.COMMENT, "COMMENT", ctx) + checkDuplicateClauses(ctx.bucketSpec(), "CLUSTERED BY", ctx) + checkDuplicateClauses(ctx.createFileFormat, "STORED AS/BY", ctx) + checkDuplicateClauses(ctx.rowFormat, "ROW FORMAT", ctx) + checkDuplicateClauses(ctx.locationSpec, "LOCATION", ctx) + val dataCols = Option(ctx.columns).map(visitColTypeList).getOrElse(Nil) val partitionCols = Option(ctx.partitionColumns).map(visitColTypeList).getOrElse(Nil) - val properties = Option(ctx.tablePropertyList).map(visitPropertyKeyValues).getOrElse(Map.empty) + val properties = Option(ctx.tableProps).map(visitPropertyKeyValues).getOrElse(Map.empty) val selectQuery = Option(ctx.query).map(plan) - val bucketSpec = Option(ctx.bucketSpec()).map(visitBucketSpec) + val bucketSpec = ctx.bucketSpec().asScala.headOption.map(visitBucketSpec) // Note: Hive requires partition columns to be distinct from the schema, so we need // to include the partition columns here explicitly @@ -1120,12 +1142,12 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { // Storage format val defaultStorage = HiveSerDe.getDefaultStorage(conf) - validateRowFormatFileFormat(ctx.rowFormat, ctx.createFileFormat, ctx) - val fileStorage = Option(ctx.createFileFormat).map(visitCreateFileFormat) + validateRowFormatFileFormat(ctx.rowFormat.asScala, ctx.createFileFormat.asScala, ctx) + val fileStorage = ctx.createFileFormat.asScala.headOption.map(visitCreateFileFormat) .getOrElse(CatalogStorageFormat.empty) - val rowStorage = Option(ctx.rowFormat).map(visitRowFormat) + val rowStorage = ctx.rowFormat.asScala.headOption.map(visitRowFormat) .getOrElse(CatalogStorageFormat.empty) - val location = Option(ctx.locationSpec).map(visitLocationSpec) + val location = ctx.locationSpec.asScala.headOption.map(visitLocationSpec) // If we are creating an EXTERNAL table, then the LOCATION field is required if (external && location.isEmpty) { operationNotAllowed("CREATE EXTERNAL TABLE must be accompanied by LOCATION", ctx) @@ -1180,7 +1202,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { ctx) } - val hasStorageProperties = (ctx.createFileFormat != null) || (ctx.rowFormat != null) + val hasStorageProperties = (ctx.createFileFormat.size != 0) || (ctx.rowFormat.size != 0) if (conf.convertCTAS && !hasStorageProperties) { // At here, both rowStorage.serdeProperties and fileStorage.serdeProperties // are empty Maps. @@ -1366,6 +1388,15 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { } } + private def validateRowFormatFileFormat( + rowFormatCtx: Seq[RowFormatContext], + createFileFormatCtx: Seq[CreateFileFormatContext], + parentCtx: ParserRuleContext): Unit = { + if (rowFormatCtx.size == 1 && createFileFormatCtx.size == 1) { + validateRowFormatFileFormat(rowFormatCtx.head, createFileFormatCtx.head, parentCtx) + } + } + /** * Create or replace a view. This creates a [[CreateViewCommand]] command. 
* diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala index eb7c33590b602..2b1aea08b1223 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala @@ -54,6 +54,13 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { } } + private def intercept(sqlCommand: String, messages: String*): Unit = { + val e = intercept[ParseException](parser.parsePlan(sqlCommand)).getMessage + messages.foreach { message => + assert(e.contains(message)) + } + } + private def parseAs[T: ClassTag](query: String): T = { parser.parsePlan(query) match { case t: T => t @@ -494,6 +501,37 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { } } + test("Duplicate clauses - create table") { + def createTableHeader(duplicateClause: String, isNative: Boolean): String = { + val fileFormat = if (isNative) "USING parquet" else "STORED AS parquet" + s"CREATE TABLE my_tab(a INT, b STRING) $fileFormat $duplicateClause $duplicateClause" + } + + Seq(true, false).foreach { isNative => + intercept(createTableHeader("TBLPROPERTIES('test' = 'test2')", isNative), + "Found duplicate clauses: TBLPROPERTIES") + intercept(createTableHeader("LOCATION '/tmp/file'", isNative), + "Found duplicate clauses: LOCATION") + intercept(createTableHeader("COMMENT 'a table'", isNative), + "Found duplicate clauses: COMMENT") + intercept(createTableHeader("CLUSTERED BY(b) INTO 256 BUCKETS", isNative), + "Found duplicate clauses: CLUSTERED BY") + } + + // Only for native data source tables + intercept(createTableHeader("PARTITIONED BY (b)", isNative = true), + "Found duplicate clauses: PARTITIONED BY") + + // Only for Hive serde tables + intercept(createTableHeader("PARTITIONED BY (k int)", isNative = false), + "Found duplicate clauses: PARTITIONED BY") + intercept(createTableHeader("STORED AS parquet", isNative = false), + "Found duplicate clauses: STORED AS/BY") + intercept( + createTableHeader("ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe'", isNative = false), + "Found duplicate clauses: ROW FORMAT") + } + test("create table - with location") { val v1 = "CREATE TABLE my_tab(a INT, b STRING) USING parquet LOCATION '/tmp/file'" @@ -1153,38 +1191,119 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { } } + test("Test CTAS against data source tables") { + val s1 = + """ + |CREATE TABLE IF NOT EXISTS mydb.page_view + |USING parquet + |COMMENT 'This is the staging page view table' + |LOCATION '/user/external/page_view' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |AS SELECT * FROM src + """.stripMargin + + val s2 = + """ + |CREATE TABLE IF NOT EXISTS mydb.page_view + |USING parquet + |LOCATION '/user/external/page_view' + |COMMENT 'This is the staging page view table' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |AS SELECT * FROM src + """.stripMargin + + val s3 = + """ + |CREATE TABLE IF NOT EXISTS mydb.page_view + |USING parquet + |COMMENT 'This is the staging page view table' + |LOCATION '/user/external/page_view' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |AS SELECT * FROM src + """.stripMargin + + checkParsing(s1) + checkParsing(s2) + checkParsing(s3) + + def checkParsing(sql: String): Unit = { + val (desc, exists) = extractTableDesc(sql) + assert(exists) + assert(desc.identifier.database == Some("mydb")) + assert(desc.identifier.table == "page_view") + 
assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) + assert(desc.schema.isEmpty) // will be populated later when the table is actually created + assert(desc.comment == Some("This is the staging page view table")) + assert(desc.viewText.isEmpty) + assert(desc.viewDefaultDatabase.isEmpty) + assert(desc.viewQueryColumnNames.isEmpty) + assert(desc.partitionColumnNames.isEmpty) + assert(desc.provider == Some("parquet")) + assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) + } + } + test("Test CTAS #1") { val s1 = - """CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view |COMMENT 'This is the staging page view table' |STORED AS RCFILE |LOCATION '/user/external/page_view' |TBLPROPERTIES ('p1'='v1', 'p2'='v2') - |AS SELECT * FROM src""".stripMargin + |AS SELECT * FROM src + """.stripMargin - val (desc, exists) = extractTableDesc(s1) - assert(exists) - assert(desc.identifier.database == Some("mydb")) - assert(desc.identifier.table == "page_view") - assert(desc.tableType == CatalogTableType.EXTERNAL) - assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) - assert(desc.schema.isEmpty) // will be populated later when the table is actually created - assert(desc.comment == Some("This is the staging page view table")) - // TODO will be SQLText - assert(desc.viewText.isEmpty) - assert(desc.viewDefaultDatabase.isEmpty) - assert(desc.viewQueryColumnNames.isEmpty) - assert(desc.partitionColumnNames.isEmpty) - assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileInputFormat")) - assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileOutputFormat")) - assert(desc.storage.serde == - Some("org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe")) - assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) + val s2 = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + |STORED AS RCFILE + |COMMENT 'This is the staging page view table' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |LOCATION '/user/external/page_view' + |AS SELECT * FROM src + """.stripMargin + + val s3 = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |LOCATION '/user/external/page_view' + |STORED AS RCFILE + |COMMENT 'This is the staging page view table' + |AS SELECT * FROM src + """.stripMargin + + checkParsing(s1) + checkParsing(s2) + checkParsing(s3) + + def checkParsing(sql: String): Unit = { + val (desc, exists) = extractTableDesc(sql) + assert(exists) + assert(desc.identifier.database == Some("mydb")) + assert(desc.identifier.table == "page_view") + assert(desc.tableType == CatalogTableType.EXTERNAL) + assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) + assert(desc.schema.isEmpty) // will be populated later when the table is actually created + assert(desc.comment == Some("This is the staging page view table")) + // TODO will be SQLText + assert(desc.viewText.isEmpty) + assert(desc.viewDefaultDatabase.isEmpty) + assert(desc.viewQueryColumnNames.isEmpty) + assert(desc.partitionColumnNames.isEmpty) + assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileInputFormat")) + assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileOutputFormat")) + assert(desc.storage.serde == + Some("org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe")) + assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) + } } test("Test CTAS #2") { - val s2 = - 
"""CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + val s1 = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view |COMMENT 'This is the staging page view table' |ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' | STORED AS @@ -1192,26 +1311,45 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { | OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat' |LOCATION '/user/external/page_view' |TBLPROPERTIES ('p1'='v1', 'p2'='v2') - |AS SELECT * FROM src""".stripMargin + |AS SELECT * FROM src + """.stripMargin - val (desc, exists) = extractTableDesc(s2) - assert(exists) - assert(desc.identifier.database == Some("mydb")) - assert(desc.identifier.table == "page_view") - assert(desc.tableType == CatalogTableType.EXTERNAL) - assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) - assert(desc.schema.isEmpty) // will be populated later when the table is actually created - // TODO will be SQLText - assert(desc.comment == Some("This is the staging page view table")) - assert(desc.viewText.isEmpty) - assert(desc.viewDefaultDatabase.isEmpty) - assert(desc.viewQueryColumnNames.isEmpty) - assert(desc.partitionColumnNames.isEmpty) - assert(desc.storage.properties == Map()) - assert(desc.storage.inputFormat == Some("parquet.hive.DeprecatedParquetInputFormat")) - assert(desc.storage.outputFormat == Some("parquet.hive.DeprecatedParquetOutputFormat")) - assert(desc.storage.serde == Some("parquet.hive.serde.ParquetHiveSerDe")) - assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) + val s2 = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + |LOCATION '/user/external/page_view' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' + | STORED AS + | INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat' + | OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat' + |COMMENT 'This is the staging page view table' + |AS SELECT * FROM src + """.stripMargin + + checkParsing(s1) + checkParsing(s2) + + def checkParsing(sql: String): Unit = { + val (desc, exists) = extractTableDesc(sql) + assert(exists) + assert(desc.identifier.database == Some("mydb")) + assert(desc.identifier.table == "page_view") + assert(desc.tableType == CatalogTableType.EXTERNAL) + assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) + assert(desc.schema.isEmpty) // will be populated later when the table is actually created + // TODO will be SQLText + assert(desc.comment == Some("This is the staging page view table")) + assert(desc.viewText.isEmpty) + assert(desc.viewDefaultDatabase.isEmpty) + assert(desc.viewQueryColumnNames.isEmpty) + assert(desc.partitionColumnNames.isEmpty) + assert(desc.storage.properties == Map()) + assert(desc.storage.inputFormat == Some("parquet.hive.DeprecatedParquetInputFormat")) + assert(desc.storage.outputFormat == Some("parquet.hive.DeprecatedParquetOutputFormat")) + assert(desc.storage.serde == Some("parquet.hive.serde.ParquetHiveSerDe")) + assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) + } } test("Test CTAS #3") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index fdb9b2f51f9cb..591510c1d8283 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -1971,8 +1971,8 @@ abstract class DDLSuite extends QueryTest with 
SQLTestUtils { s""" |CREATE TABLE t(a int, b int, c int, d int) |USING parquet - |PARTITIONED BY(a, b) |LOCATION "${dir.toURI}" + |PARTITIONED BY(a, b) """.stripMargin) spark.sql("INSERT INTO TABLE t PARTITION(a=1, b=2) SELECT 3, 4") checkAnswer(spark.table("t"), Row(3, 4, 1, 2) :: Nil) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index f2e0c695ca38b..65be244418670 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -875,12 +875,13 @@ class HiveDDLSuite test("desc table for Hive table - bucketed + sorted table") { withTable("tbl") { - sql(s""" - CREATE TABLE tbl (id int, name string) - PARTITIONED BY (ds string) - CLUSTERED BY(id) - SORTED BY(id, name) INTO 1024 BUCKETS - """) + sql( + s""" + |CREATE TABLE tbl (id int, name string) + |CLUSTERED BY(id) + |SORTED BY(id, name) INTO 1024 BUCKETS + |PARTITIONED BY (ds string) + """.stripMargin) val x = sql("DESC FORMATTED tbl").collect() assert(x.containsSlice( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 07ae3ae945848..47adc77a52d51 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -461,51 +461,55 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("CTAS without serde without location") { - val originalConf = sessionState.conf.convertCTAS - - setConf(SQLConf.CONVERT_CTAS, true) - - val defaultDataSource = sessionState.conf.defaultDataSourceName - try { - sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value") - sql("CREATE TABLE IF NOT EXISTS ctas1 AS SELECT key k, value FROM src ORDER BY k, value") - val message = intercept[AnalysisException] { + withSQLConf(SQLConf.CONVERT_CTAS.key -> "true") { + val defaultDataSource = sessionState.conf.defaultDataSourceName + withTable("ctas1") { sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value") - }.getMessage - assert(message.contains("already exists")) - checkRelation("ctas1", true, defaultDataSource) - sql("DROP TABLE ctas1") + sql("CREATE TABLE IF NOT EXISTS ctas1 AS SELECT key k, value FROM src ORDER BY k, value") + val message = intercept[AnalysisException] { + sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value") + }.getMessage + assert(message.contains("already exists")) + checkRelation("ctas1", isDataSourceTable = true, defaultDataSource) + } // Specifying database name for query can be converted to data source write path // is not allowed right now. 
- sql("CREATE TABLE default.ctas1 AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", true, defaultDataSource) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql("CREATE TABLE default.ctas1 AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", isDataSourceTable = true, defaultDataSource) + } - sql("CREATE TABLE ctas1 stored as textfile" + + withTable("ctas1") { + sql("CREATE TABLE ctas1 stored as textfile" + " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "text") - sql("DROP TABLE ctas1") + checkRelation("ctas1", isDataSourceTable = false, "text") + } - sql("CREATE TABLE ctas1 stored as sequencefile" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "sequence") - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql("CREATE TABLE ctas1 stored as sequencefile" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", isDataSourceTable = false, "sequence") + } - sql("CREATE TABLE ctas1 stored as rcfile AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "rcfile") - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql("CREATE TABLE ctas1 stored as rcfile AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", isDataSourceTable = false, "rcfile") + } - sql("CREATE TABLE ctas1 stored as orc AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "orc") - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql("CREATE TABLE ctas1 stored as orc AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", isDataSourceTable = false, "orc") + } - sql("CREATE TABLE ctas1 stored as parquet AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "parquet") - sql("DROP TABLE ctas1") - } finally { - setConf(SQLConf.CONVERT_CTAS, originalConf) - sql("DROP TABLE IF EXISTS ctas1") + withTable("ctas1") { + sql( + """ + |CREATE TABLE ctas1 stored as parquet + |AS SELECT key k, value FROM src ORDER BY k, value + """.stripMargin) + checkRelation("ctas1", isDataSourceTable = false, "parquet") + } } } @@ -539,30 +543,40 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { val defaultDataSource = sessionState.conf.defaultDataSourceName val tempLocation = dir.toURI.getPath.stripSuffix("/") - sql(s"CREATE TABLE ctas1 LOCATION 'file:$tempLocation/c1'" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", true, defaultDataSource, Some(s"file:$tempLocation/c1")) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql(s"CREATE TABLE ctas1 LOCATION 'file:$tempLocation/c1'" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation( + "ctas1", isDataSourceTable = true, defaultDataSource, Some(s"file:$tempLocation/c1")) + } - sql(s"CREATE TABLE ctas1 LOCATION 'file:$tempLocation/c2'" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", true, defaultDataSource, Some(s"file:$tempLocation/c2")) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql(s"CREATE TABLE ctas1 LOCATION 'file:$tempLocation/c2'" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation( + "ctas1", isDataSourceTable = true, defaultDataSource, Some(s"file:$tempLocation/c2")) + } - sql(s"CREATE TABLE ctas1 stored as textfile LOCATION 'file:$tempLocation/c3'" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "text", 
Some(s"file:$tempLocation/c3")) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql(s"CREATE TABLE ctas1 stored as textfile LOCATION 'file:$tempLocation/c3'" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation( + "ctas1", isDataSourceTable = false, "text", Some(s"file:$tempLocation/c3")) + } - sql(s"CREATE TABLE ctas1 stored as sequenceFile LOCATION 'file:$tempLocation/c4'" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "sequence", Some(s"file:$tempLocation/c4")) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql(s"CREATE TABLE ctas1 stored as sequenceFile LOCATION 'file:$tempLocation/c4'" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation( + "ctas1", isDataSourceTable = false, "sequence", Some(s"file:$tempLocation/c4")) + } - sql(s"CREATE TABLE ctas1 stored as rcfile LOCATION 'file:$tempLocation/c5'" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "rcfile", Some(s"file:$tempLocation/c5")) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql(s"CREATE TABLE ctas1 stored as rcfile LOCATION 'file:$tempLocation/c5'" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation( + "ctas1", isDataSourceTable = false, "rcfile", Some(s"file:$tempLocation/c5")) + } } } } From b96248862589bae1ddcdb14ce4c802789a001306 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 3 Jan 2018 22:18:13 +0800 Subject: [PATCH 004/566] [SPARK-20236][SQL] dynamic partition overwrite ## What changes were proposed in this pull request? When overwriting a partitioned table with dynamic partition columns, the behavior is different between data source and hive tables. data source table: delete all partition directories that match the static partition values provided in the insert statement. hive table: only delete partition directories which have data written into it This PR adds a new config to make users be able to choose hive's behavior. ## How was this patch tested? new tests Author: Wenchen Fan Closes #18714 from cloud-fan/overwrite-partition. (cherry picked from commit a66fe36cee9363b01ee70e469f1c968f633c5713) Signed-off-by: gatorsmile --- .../internal/io/FileCommitProtocol.scala | 25 ++++-- .../io/HadoopMapReduceCommitProtocol.scala | 75 ++++++++++++++---- .../apache/spark/sql/internal/SQLConf.scala | 21 +++++ .../InsertIntoHadoopFsRelationCommand.scala | 20 ++++- .../SQLHadoopMapReduceCommitProtocol.scala | 10 ++- .../spark/sql/sources/InsertSuite.scala | 78 +++++++++++++++++++ 6 files changed, 200 insertions(+), 29 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 50f51e1af4530..6d0059b6a0272 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -28,8 +28,9 @@ import org.apache.spark.util.Utils * * 1. Implementations must be serializable, as the committer instance instantiated on the driver * will be used for tasks on executors. - * 2. Implementations should have a constructor with 2 arguments: - * (jobId: String, path: String) + * 2. Implementations should have a constructor with 2 or 3 arguments: + * (jobId: String, path: String) or + * (jobId: String, path: String, dynamicPartitionOverwrite: Boolean) * 3. A committer should not be reused across multiple Spark jobs. 
* * The proper call sequence is: @@ -139,10 +140,22 @@ object FileCommitProtocol { /** * Instantiates a FileCommitProtocol using the given className. */ - def instantiate(className: String, jobId: String, outputPath: String) - : FileCommitProtocol = { + def instantiate( + className: String, + jobId: String, + outputPath: String, + dynamicPartitionOverwrite: Boolean = false): FileCommitProtocol = { val clazz = Utils.classForName(className).asInstanceOf[Class[FileCommitProtocol]] - val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String]) - ctor.newInstance(jobId, outputPath) + // First try the constructor with arguments (jobId: String, outputPath: String, + // dynamicPartitionOverwrite: Boolean). + // If that doesn't exist, try the one with (jobId: string, outputPath: String). + try { + val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String], classOf[Boolean]) + ctor.newInstance(jobId, outputPath, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) + } catch { + case _: NoSuchMethodException => + val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String]) + ctor.newInstance(jobId, outputPath) + } } } diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 95c99d29c3a9c..6d20ef1f98a3c 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -39,8 +39,19 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil * * @param jobId the job's or stage's id * @param path the job's output path, or null if committer acts as a noop + * @param dynamicPartitionOverwrite If true, Spark will overwrite partition directories at runtime + * dynamically, i.e., we first write files under a staging + * directory with partition path, e.g. + * /path/to/staging/a=1/b=1/xxx.parquet. When committing the job, + * we first clean up the corresponding partition directories at + * destination path, e.g. /path/to/destination/a=1/b=1, and move + * files from staging directory to the corresponding partition + * directories under destination path. */ -class HadoopMapReduceCommitProtocol(jobId: String, path: String) +class HadoopMapReduceCommitProtocol( + jobId: String, + path: String, + dynamicPartitionOverwrite: Boolean = false) extends FileCommitProtocol with Serializable with Logging { import FileCommitProtocol._ @@ -67,9 +78,17 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) @transient private var addedAbsPathFiles: mutable.Map[String, String] = null /** - * The staging directory for all files committed with absolute output paths. + * Tracks partitions with default path that have new files written into them by this task, + * e.g. a=1/b=2. Files under these partitions will be saved into staging directory and moved to + * destination directory at the end, if `dynamicPartitionOverwrite` is true. */ - private def absPathStagingDir: Path = new Path(path, "_temporary-" + jobId) + @transient private var partitionPaths: mutable.Set[String] = null + + /** + * The staging directory of this write job. Spark uses it to deal with files with absolute output + * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true. 
+ */ + private def stagingDir = new Path(path, ".spark-staging-" + jobId) protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.newInstance() @@ -85,11 +104,16 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { val filename = getFilename(taskContext, ext) - val stagingDir: String = committer match { + val stagingDir: Path = committer match { + case _ if dynamicPartitionOverwrite => + assert(dir.isDefined, + "The dataset to be written must be partitioned when dynamicPartitionOverwrite is true.") + partitionPaths += dir.get + this.stagingDir // For FileOutputCommitter it has its own staging path called "work path". case f: FileOutputCommitter => - Option(f.getWorkPath).map(_.toString).getOrElse(path) - case _ => path + new Path(Option(f.getWorkPath).map(_.toString).getOrElse(path)) + case _ => new Path(path) } dir.map { d => @@ -106,8 +130,7 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) // Include a UUID here to prevent file collisions for one task writing to different dirs. // In principle we could include hash(absoluteDir) instead but this is simpler. - val tmpOutputPath = new Path( - absPathStagingDir, UUID.randomUUID().toString() + "-" + filename).toString + val tmpOutputPath = new Path(stagingDir, UUID.randomUUID().toString() + "-" + filename).toString addedAbsPathFiles(tmpOutputPath) = absOutputPath tmpOutputPath @@ -141,23 +164,42 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { committer.commitJob(jobContext) - val filesToMove = taskCommits.map(_.obj.asInstanceOf[Map[String, String]]) - .foldLeft(Map[String, String]())(_ ++ _) - logDebug(s"Committing files staged for absolute locations $filesToMove") + if (hasValidPath) { - val fs = absPathStagingDir.getFileSystem(jobContext.getConfiguration) + val (allAbsPathFiles, allPartitionPaths) = + taskCommits.map(_.obj.asInstanceOf[(Map[String, String], Set[String])]).unzip + val fs = stagingDir.getFileSystem(jobContext.getConfiguration) + + val filesToMove = allAbsPathFiles.foldLeft(Map[String, String]())(_ ++ _) + logDebug(s"Committing files staged for absolute locations $filesToMove") + if (dynamicPartitionOverwrite) { + val absPartitionPaths = filesToMove.values.map(new Path(_).getParent).toSet + logDebug(s"Clean up absolute partition directories for overwriting: $absPartitionPaths") + absPartitionPaths.foreach(fs.delete(_, true)) + } for ((src, dst) <- filesToMove) { fs.rename(new Path(src), new Path(dst)) } - fs.delete(absPathStagingDir, true) + + if (dynamicPartitionOverwrite) { + val partitionPaths = allPartitionPaths.foldLeft(Set[String]())(_ ++ _) + logDebug(s"Clean up default partition directories for overwriting: $partitionPaths") + for (part <- partitionPaths) { + val finalPartPath = new Path(path, part) + fs.delete(finalPartPath, true) + fs.rename(new Path(stagingDir, part), finalPartPath) + } + } + + fs.delete(stagingDir, true) } } override def abortJob(jobContext: JobContext): Unit = { committer.abortJob(jobContext, JobStatus.State.FAILED) if (hasValidPath) { - val fs = absPathStagingDir.getFileSystem(jobContext.getConfiguration) - fs.delete(absPathStagingDir, true) + val fs = stagingDir.getFileSystem(jobContext.getConfiguration) + fs.delete(stagingDir, true) } } @@ -165,13 +207,14 @@ class HadoopMapReduceCommitProtocol(jobId: String, 
path: String) committer = setupCommitter(taskContext) committer.setupTask(taskContext) addedAbsPathFiles = mutable.Map[String, String]() + partitionPaths = mutable.Set[String]() } override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = { val attemptId = taskContext.getTaskAttemptID SparkHadoopMapRedUtil.commitTask( committer, taskContext, attemptId.getJobID.getId, attemptId.getTaskID.getId) - new TaskCommitMessage(addedAbsPathFiles.toMap) + new TaskCommitMessage(addedAbsPathFiles.toMap -> partitionPaths.toSet) } override def abortTask(taskContext: TaskAttemptContext): Unit = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 80cdc61484c0f..5d6edf6b8abec 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1068,6 +1068,24 @@ object SQLConf { .timeConf(TimeUnit.MILLISECONDS) .createWithDefault(100) + object PartitionOverwriteMode extends Enumeration { + val STATIC, DYNAMIC = Value + } + + val PARTITION_OVERWRITE_MODE = + buildConf("spark.sql.sources.partitionOverwriteMode") + .doc("When INSERT OVERWRITE a partitioned data source table, we currently support 2 modes: " + + "static and dynamic. In static mode, Spark deletes all the partitions that match the " + + "partition specification(e.g. PARTITION(a=1,b)) in the INSERT statement, before " + + "overwriting. In dynamic mode, Spark doesn't delete partitions ahead, and only overwrite " + + "those partitions that have data written into it at runtime. By default we use static " + + "mode to keep the same behavior of Spark prior to 2.3. Note that this config doesn't " + + "affect Hive serde tables, as they are always overwritten with dynamic mode.") + .stringConf + .transform(_.toUpperCase(Locale.ROOT)) + .checkValues(PartitionOverwriteMode.values.map(_.toString)) + .createWithDefault(PartitionOverwriteMode.STATIC.toString) + object Deprecated { val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks" } @@ -1394,6 +1412,9 @@ class SQLConf extends Serializable with Logging { def concatBinaryAsString: Boolean = getConf(CONCAT_BINARY_AS_STRING) + def partitionOverwriteMode: PartitionOverwriteMode.Value = + PartitionOverwriteMode.withName(getConf(PARTITION_OVERWRITE_MODE)) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index ad24e280d942a..dd7ef0d15c140 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command._ +import org.apache.spark.sql.internal.SQLConf.PartitionOverwriteMode import org.apache.spark.sql.util.SchemaUtils /** @@ -89,13 +90,19 @@ case class InsertIntoHadoopFsRelationCommand( } val pathExists = fs.exists(qualifiedOutputPath) - // If we are appending data to an existing dir. 
- val isAppend = pathExists && (mode == SaveMode.Append) + + val enableDynamicOverwrite = + sparkSession.sessionState.conf.partitionOverwriteMode == PartitionOverwriteMode.DYNAMIC + // This config only makes sense when we are overwriting a partitioned dataset with dynamic + // partition columns. + val dynamicPartitionOverwrite = enableDynamicOverwrite && mode == SaveMode.Overwrite && + staticPartitions.size < partitionColumns.length val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = java.util.UUID.randomUUID().toString, - outputPath = outputPath.toString) + outputPath = outputPath.toString, + dynamicPartitionOverwrite = dynamicPartitionOverwrite) val doInsertion = (mode, pathExists) match { case (SaveMode.ErrorIfExists, true) => @@ -103,6 +110,9 @@ case class InsertIntoHadoopFsRelationCommand( case (SaveMode.Overwrite, true) => if (ifPartitionNotExists && matchingPartitions.nonEmpty) { false + } else if (dynamicPartitionOverwrite) { + // For dynamic partition overwrite, do not delete partition directories ahead. + true } else { deleteMatchingPartitions(fs, qualifiedOutputPath, customPartitionLocations, committer) true @@ -126,7 +136,9 @@ case class InsertIntoHadoopFsRelationCommand( catalogTable.get.identifier, newPartitions.toSeq.map(p => (p, None)), ifNotExists = true).run(sparkSession) } - if (mode == SaveMode.Overwrite) { + // For dynamic partition overwrite, we never remove partitions but only update existing + // ones. + if (mode == SaveMode.Overwrite && !dynamicPartitionOverwrite) { val deletedPartitions = initialMatchingPartitions.toSet -- updatedPartitions if (deletedPartitions.nonEmpty) { AlterTableDropPartitionCommand( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 40825a1f724b1..39c594a9bc618 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -29,11 +29,15 @@ import org.apache.spark.sql.internal.SQLConf * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual * Hadoop output committer using an option specified in SQLConf. 
*/ -class SQLHadoopMapReduceCommitProtocol(jobId: String, path: String) - extends HadoopMapReduceCommitProtocol(jobId, path) with Serializable with Logging { +class SQLHadoopMapReduceCommitProtocol( + jobId: String, + path: String, + dynamicPartitionOverwrite: Boolean = false) + extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) + with Serializable with Logging { override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { - var committer = context.getOutputFormatClass.newInstance().getOutputCommitter(context) + var committer = super.setupCommitter(context) val configuration = context.getConfiguration val clazz = diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index 8b7e2e5f45946..fef01c860db6e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -21,6 +21,8 @@ import java.io.File import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, Row} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.PartitionOverwriteMode import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.util.Utils @@ -442,4 +444,80 @@ class InsertSuite extends DataSourceTest with SharedSQLContext { assert(e.contains("Only Data Sources providing FileFormat are supported")) } } + + test("SPARK-20236: dynamic partition overwrite without catalog table") { + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> PartitionOverwriteMode.DYNAMIC.toString) { + withTempPath { path => + Seq((1, 1, 1)).toDF("i", "part1", "part2") + .write.partitionBy("part1", "part2").parquet(path.getAbsolutePath) + checkAnswer(spark.read.parquet(path.getAbsolutePath), Row(1, 1, 1)) + + Seq((2, 1, 1)).toDF("i", "part1", "part2") + .write.partitionBy("part1", "part2").mode("overwrite").parquet(path.getAbsolutePath) + checkAnswer(spark.read.parquet(path.getAbsolutePath), Row(2, 1, 1)) + + Seq((2, 2, 2)).toDF("i", "part1", "part2") + .write.partitionBy("part1", "part2").mode("overwrite").parquet(path.getAbsolutePath) + checkAnswer(spark.read.parquet(path.getAbsolutePath), Row(2, 1, 1) :: Row(2, 2, 2) :: Nil) + } + } + } + + test("SPARK-20236: dynamic partition overwrite") { + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> PartitionOverwriteMode.DYNAMIC.toString) { + withTable("t") { + sql( + """ + |create table t(i int, part1 int, part2 int) using parquet + |partitioned by (part1, part2) + """.stripMargin) + + sql("insert into t partition(part1=1, part2=1) select 1") + checkAnswer(spark.table("t"), Row(1, 1, 1)) + + sql("insert overwrite table t partition(part1=1, part2=1) select 2") + checkAnswer(spark.table("t"), Row(2, 1, 1)) + + sql("insert overwrite table t partition(part1=2, part2) select 2, 2") + checkAnswer(spark.table("t"), Row(2, 1, 1) :: Row(2, 2, 2) :: Nil) + + sql("insert overwrite table t partition(part1=1, part2=2) select 3") + checkAnswer(spark.table("t"), Row(2, 1, 1) :: Row(2, 2, 2) :: Row(3, 1, 2) :: Nil) + + sql("insert overwrite table t partition(part1=1, part2) select 4, 1") + checkAnswer(spark.table("t"), Row(4, 1, 1) :: Row(2, 2, 2) :: Row(3, 1, 2) :: Nil) + } + } + } + + test("SPARK-20236: dynamic partition overwrite with customer partition path") { + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> PartitionOverwriteMode.DYNAMIC.toString) { + withTable("t") { + sql( + 
""" + |create table t(i int, part1 int, part2 int) using parquet + |partitioned by (part1, part2) + """.stripMargin) + + val path1 = Utils.createTempDir() + sql(s"alter table t add partition(part1=1, part2=1) location '$path1'") + sql(s"insert into t partition(part1=1, part2=1) select 1") + checkAnswer(spark.table("t"), Row(1, 1, 1)) + + sql("insert overwrite table t partition(part1=1, part2=1) select 2") + checkAnswer(spark.table("t"), Row(2, 1, 1)) + + sql("insert overwrite table t partition(part1=2, part2) select 2, 2") + checkAnswer(spark.table("t"), Row(2, 1, 1) :: Row(2, 2, 2) :: Nil) + + val path2 = Utils.createTempDir() + sql(s"alter table t add partition(part1=1, part2=2) location '$path2'") + sql("insert overwrite table t partition(part1=1, part2=2) select 3") + checkAnswer(spark.table("t"), Row(2, 1, 1) :: Row(2, 2, 2) :: Row(3, 1, 2) :: Nil) + + sql("insert overwrite table t partition(part1=1, part2) select 4, 1") + checkAnswer(spark.table("t"), Row(4, 1, 1) :: Row(2, 2, 2) :: Row(3, 1, 2) :: Nil) + } + } + } } From 27c949d673e45fdbbae0f2c08969b9d51222dd8d Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 2 Jan 2018 09:19:18 +0800 Subject: [PATCH 005/566] [SPARK-22932][SQL] Refactor AnalysisContext ## What changes were proposed in this pull request? Add a `reset` function to ensure the state in `AnalysisContext ` is per-query. ## How was this patch tested? The existing test cases Author: gatorsmile Closes #20127 from gatorsmile/refactorAnalysisContext. --- .../sql/catalyst/analysis/Analyzer.scala | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 6d294d48c0ee7..35b35110e491f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -52,6 +52,7 @@ object SimpleAnalyzer extends Analyzer( /** * Provides a way to keep state during the analysis, this enables us to decouple the concerns * of analysis environment from the catalog. + * The state that is kept here is per-query. * * Note this is thread local. * @@ -70,6 +71,8 @@ object AnalysisContext { } def get: AnalysisContext = value.get() + def reset(): Unit = value.remove() + private def set(context: AnalysisContext): Unit = value.set(context) def withAnalysisContext[A](database: Option[String])(f: => A): A = { @@ -95,6 +98,17 @@ class Analyzer( this(catalog, conf, conf.optimizerMaxIterations) } + override def execute(plan: LogicalPlan): LogicalPlan = { + AnalysisContext.reset() + try { + executeSameContext(plan) + } finally { + AnalysisContext.reset() + } + } + + private def executeSameContext(plan: LogicalPlan): LogicalPlan = super.execute(plan) + def resolver: Resolver = conf.resolver protected val fixedPoint = FixedPoint(maxIterations) @@ -176,7 +190,7 @@ class Analyzer( case With(child, relations) => substituteCTE(child, relations.foldLeft(Seq.empty[(String, LogicalPlan)]) { case (resolved, (name, relation)) => - resolved :+ name -> execute(substituteCTE(relation, resolved)) + resolved :+ name -> executeSameContext(substituteCTE(relation, resolved)) }) case other => other } @@ -600,7 +614,7 @@ class Analyzer( "avoid errors. 
Increase the value of spark.sql.view.maxNestedViewDepth to work " + "aroud this.") } - execute(child) + executeSameContext(child) } view.copy(child = newChild) case p @ SubqueryAlias(_, view: View) => @@ -1269,7 +1283,7 @@ class Analyzer( do { // Try to resolve the subquery plan using the regular analyzer. previous = current - current = execute(current) + current = executeSameContext(current) // Use the outer references to resolve the subquery plan if it isn't resolved yet. val i = plans.iterator @@ -1392,7 +1406,7 @@ class Analyzer( grouping, Alias(cond, "havingCondition")() :: Nil, child) - val resolvedOperator = execute(aggregatedCondition) + val resolvedOperator = executeSameContext(aggregatedCondition) def resolvedAggregateFilter = resolvedOperator .asInstanceOf[Aggregate] @@ -1450,7 +1464,8 @@ class Analyzer( val aliasedOrdering = unresolvedSortOrders.map(o => Alias(o.child, "aggOrder")()) val aggregatedOrdering = aggregate.copy(aggregateExpressions = aliasedOrdering) - val resolvedAggregate: Aggregate = execute(aggregatedOrdering).asInstanceOf[Aggregate] + val resolvedAggregate: Aggregate = + executeSameContext(aggregatedOrdering).asInstanceOf[Aggregate] val resolvedAliasedOrdering: Seq[Alias] = resolvedAggregate.aggregateExpressions.asInstanceOf[Seq[Alias]] From 79f7263daa5f83e2026fda9a8bbb1090a1333f80 Mon Sep 17 00:00:00 2001 From: chetkhatri Date: Wed, 3 Jan 2018 11:31:32 -0600 Subject: [PATCH 006/566] [SPARK-22896] Improvement in String interpolation ## What changes were proposed in this pull request? * String interpolation in ml pipeline example has been corrected as per scala standard. ## How was this patch tested? * manually tested. Author: chetkhatri Closes #20070 from chetkhatri/mllib-chetan-contrib. (cherry picked from commit 9a2b65a3c0c36316aae0a53aa0f61c5044c2ceff) Signed-off-by: Sean Owen --- .../spark/examples/ml/JavaQuantileDiscretizerExample.java | 2 +- .../apache/spark/examples/SimpleSkewedGroupByTest.scala | 4 ---- .../org/apache/spark/examples/graphx/Analytics.scala | 6 ++++-- .../org/apache/spark/examples/graphx/SynthBenchmark.scala | 6 +++--- .../apache/spark/examples/ml/ChiSquareTestExample.scala | 6 +++--- .../org/apache/spark/examples/ml/CorrelationExample.scala | 4 ++-- .../org/apache/spark/examples/ml/DataFrameExample.scala | 4 ++-- .../examples/ml/DecisionTreeClassificationExample.scala | 4 ++-- .../spark/examples/ml/DecisionTreeRegressionExample.scala | 4 ++-- .../apache/spark/examples/ml/DeveloperApiExample.scala | 6 +++--- .../examples/ml/EstimatorTransformerParamExample.scala | 6 +++--- .../ml/GradientBoostedTreeClassifierExample.scala | 4 ++-- .../examples/ml/GradientBoostedTreeRegressorExample.scala | 4 ++-- ...ulticlassLogisticRegressionWithElasticNetExample.scala | 2 +- .../ml/MultilayerPerceptronClassifierExample.scala | 2 +- .../org/apache/spark/examples/ml/NaiveBayesExample.scala | 2 +- .../spark/examples/ml/QuantileDiscretizerExample.scala | 4 ++-- .../spark/examples/ml/RandomForestClassifierExample.scala | 4 ++-- .../spark/examples/ml/RandomForestRegressorExample.scala | 4 ++-- .../apache/spark/examples/ml/VectorIndexerExample.scala | 4 ++-- .../spark/examples/mllib/AssociationRulesExample.scala | 6 +++--- .../mllib/BinaryClassificationMetricsExample.scala | 4 ++-- .../mllib/DecisionTreeClassificationExample.scala | 4 ++-- .../examples/mllib/DecisionTreeRegressionExample.scala | 4 ++-- .../org/apache/spark/examples/mllib/FPGrowthExample.scala | 2 +- .../mllib/GradientBoostingClassificationExample.scala | 4 ++-- 
.../mllib/GradientBoostingRegressionExample.scala | 4 ++-- .../spark/examples/mllib/HypothesisTestingExample.scala | 2 +- .../spark/examples/mllib/IsotonicRegressionExample.scala | 2 +- .../org/apache/spark/examples/mllib/KMeansExample.scala | 2 +- .../org/apache/spark/examples/mllib/LBFGSExample.scala | 2 +- .../examples/mllib/LatentDirichletAllocationExample.scala | 8 +++++--- .../examples/mllib/LinearRegressionWithSGDExample.scala | 2 +- .../org/apache/spark/examples/mllib/PCAExample.scala | 4 ++-- .../spark/examples/mllib/PMMLModelExportExample.scala | 2 +- .../apache/spark/examples/mllib/PrefixSpanExample.scala | 4 ++-- .../mllib/RandomForestClassificationExample.scala | 4 ++-- .../examples/mllib/RandomForestRegressionExample.scala | 4 ++-- .../spark/examples/mllib/RecommendationExample.scala | 2 +- .../apache/spark/examples/mllib/SVMWithSGDExample.scala | 2 +- .../org/apache/spark/examples/mllib/SimpleFPGrowth.scala | 8 +++----- .../spark/examples/mllib/StratifiedSamplingExample.scala | 4 ++-- .../org/apache/spark/examples/mllib/TallSkinnyPCA.scala | 2 +- .../org/apache/spark/examples/mllib/TallSkinnySVD.scala | 2 +- .../apache/spark/examples/streaming/CustomReceiver.scala | 6 +++--- .../apache/spark/examples/streaming/RawNetworkGrep.scala | 2 +- .../examples/streaming/RecoverableNetworkWordCount.scala | 8 ++++---- .../streaming/clickstream/PageViewGenerator.scala | 4 ++-- .../examples/streaming/clickstream/PageViewStream.scala | 4 ++-- 49 files changed, 94 insertions(+), 96 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java index dd20cac621102..43cc30c1a899b 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java @@ -66,7 +66,7 @@ public static void main(String[] args) { .setNumBuckets(3); Dataset result = discretizer.fit(df).transform(df); - result.show(); + result.show(false); // $example off$ spark.stop(); } diff --git a/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala b/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala index e64dcbd182d94..2332a661f26a0 100644 --- a/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala @@ -60,10 +60,6 @@ object SimpleSkewedGroupByTest { pairs1.count println(s"RESULT: ${pairs1.groupByKey(numReducers).count}") - // Print how many keys each reducer got (for debugging) - // println("RESULT: " + pairs1.groupByKey(numReducers) - // .map{case (k,v) => (k, v.size)} - // .collectAsMap) spark.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala b/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala index 92936bd30dbc0..815404d1218b7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala @@ -145,9 +145,11 @@ object Analytics extends Logging { // TriangleCount requires the graph to be partitioned .partitionBy(partitionStrategy.getOrElse(RandomVertexCut)).cache() val triangles = TriangleCount.run(graph) - println("Triangles: " + triangles.vertices.map { + val triangleTypes = triangles.vertices.map { case (vid, data) => data.toLong - 
}.reduce(_ + _) / 3) + }.reduce(_ + _) / 3 + + println(s"Triangles: ${triangleTypes}") sc.stop() case _ => diff --git a/examples/src/main/scala/org/apache/spark/examples/graphx/SynthBenchmark.scala b/examples/src/main/scala/org/apache/spark/examples/graphx/SynthBenchmark.scala index 6d2228c8742aa..57b2edf992208 100644 --- a/examples/src/main/scala/org/apache/spark/examples/graphx/SynthBenchmark.scala +++ b/examples/src/main/scala/org/apache/spark/examples/graphx/SynthBenchmark.scala @@ -52,7 +52,7 @@ object SynthBenchmark { arg => arg.dropWhile(_ == '-').split('=') match { case Array(opt, v) => (opt -> v) - case _ => throw new IllegalArgumentException("Invalid argument: " + arg) + case _ => throw new IllegalArgumentException(s"Invalid argument: $arg") } } @@ -76,7 +76,7 @@ object SynthBenchmark { case ("sigma", v) => sigma = v.toDouble case ("degFile", v) => degFile = v case ("seed", v) => seed = v.toInt - case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt) + case (opt, _) => throw new IllegalArgumentException(s"Invalid option: $opt") } val conf = new SparkConf() @@ -86,7 +86,7 @@ object SynthBenchmark { val sc = new SparkContext(conf) // Create the graph - println(s"Creating graph...") + println("Creating graph...") val unpartitionedGraph = GraphGenerators.logNormalGraph(sc, numVertices, numEPart.getOrElse(sc.defaultParallelism), mu, sigma, seed) // Repartition the graph diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSquareTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSquareTestExample.scala index dcee1e427ce58..5146fd0316467 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSquareTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSquareTestExample.scala @@ -52,9 +52,9 @@ object ChiSquareTestExample { val df = data.toDF("label", "features") val chi = ChiSquareTest.test(df, "features", "label").head - println("pValues = " + chi.getAs[Vector](0)) - println("degreesOfFreedom = " + chi.getSeq[Int](1).mkString("[", ",", "]")) - println("statistics = " + chi.getAs[Vector](2)) + println(s"pValues = ${chi.getAs[Vector](0)}") + println(s"degreesOfFreedom ${chi.getSeq[Int](1).mkString("[", ",", "]")}") + println(s"statistics ${chi.getAs[Vector](2)}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationExample.scala index 3f57dc342eb00..d7f1fc8ed74d7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationExample.scala @@ -51,10 +51,10 @@ object CorrelationExample { val df = data.map(Tuple1.apply).toDF("features") val Row(coeff1: Matrix) = Correlation.corr(df, "features").head - println("Pearson correlation matrix:\n" + coeff1.toString) + println(s"Pearson correlation matrix:\n $coeff1") val Row(coeff2: Matrix) = Correlation.corr(df, "features", "spearman").head - println("Spearman correlation matrix:\n" + coeff2.toString) + println(s"Spearman correlation matrix:\n $coeff2") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala index 0658bddf16961..ee4469faab3a0 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala +++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala @@ -47,7 +47,7 @@ object DataFrameExample { val parser = new OptionParser[Params]("DataFrameExample") { head("DataFrameExample: an example app using DataFrame for ML.") opt[String]("input") - .text(s"input path to dataframe") + .text("input path to dataframe") .action((x, c) => c.copy(input = x)) checkConfig { params => success @@ -93,7 +93,7 @@ object DataFrameExample { // Load the records back. println(s"Loading Parquet file with UDT from $outputDir.") val newDF = spark.read.parquet(outputDir) - println(s"Schema from Parquet:") + println("Schema from Parquet:") newDF.printSchema() spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala index bc6d3275933ea..276cedab11abc 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala @@ -83,10 +83,10 @@ object DecisionTreeClassificationExample { .setPredictionCol("prediction") .setMetricName("accuracy") val accuracy = evaluator.evaluate(predictions) - println("Test Error = " + (1.0 - accuracy)) + println(s"Test Error = ${(1.0 - accuracy)}") val treeModel = model.stages(2).asInstanceOf[DecisionTreeClassificationModel] - println("Learned classification tree model:\n" + treeModel.toDebugString) + println(s"Learned classification tree model:\n ${treeModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala index ee61200ad1d0c..aaaecaea47081 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala @@ -73,10 +73,10 @@ object DecisionTreeRegressionExample { .setPredictionCol("prediction") .setMetricName("rmse") val rmse = evaluator.evaluate(predictions) - println("Root Mean Squared Error (RMSE) on test data = " + rmse) + println(s"Root Mean Squared Error (RMSE) on test data = $rmse") val treeModel = model.stages(1).asInstanceOf[DecisionTreeRegressionModel] - println("Learned regression tree model:\n" + treeModel.toDebugString) + println(s"Learned regression tree model:\n ${treeModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala index d94d837d10e96..2dc11b07d88ef 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala @@ -53,7 +53,7 @@ object DeveloperApiExample { // Create a LogisticRegression instance. This instance is an Estimator. val lr = new MyLogisticRegression() // Print out the parameters, documentation, and any default values. - println("MyLogisticRegression parameters:\n" + lr.explainParams() + "\n") + println(s"MyLogisticRegression parameters:\n ${lr.explainParams()}") // We may set parameters using setter methods. 
lr.setMaxIter(10) @@ -169,10 +169,10 @@ private class MyLogisticRegressionModel( Vectors.dense(-margin, margin) } - /** Number of classes the label can take. 2 indicates binary classification. */ + // Number of classes the label can take. 2 indicates binary classification. override val numClasses: Int = 2 - /** Number of features the model was trained on. */ + // Number of features the model was trained on. override val numFeatures: Int = coefficients.size /** diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala index f18d86e1a6921..e5d91f132a3f2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala @@ -46,7 +46,7 @@ object EstimatorTransformerParamExample { // Create a LogisticRegression instance. This instance is an Estimator. val lr = new LogisticRegression() // Print out the parameters, documentation, and any default values. - println("LogisticRegression parameters:\n" + lr.explainParams() + "\n") + println(s"LogisticRegression parameters:\n ${lr.explainParams()}\n") // We may set parameters using setter methods. lr.setMaxIter(10) @@ -58,7 +58,7 @@ object EstimatorTransformerParamExample { // we can view the parameters it used during fit(). // This prints the parameter (name: value) pairs, where names are unique IDs for this // LogisticRegression instance. - println("Model 1 was fit using parameters: " + model1.parent.extractParamMap) + println(s"Model 1 was fit using parameters: ${model1.parent.extractParamMap}") // We may alternatively specify parameters using a ParamMap, // which supports several methods for specifying parameters. @@ -73,7 +73,7 @@ object EstimatorTransformerParamExample { // Now learn a new model using the paramMapCombined parameters. // paramMapCombined overrides all parameters set earlier via lr.set* methods. val model2 = lr.fit(training, paramMapCombined) - println("Model 2 was fit using parameters: " + model2.parent.extractParamMap) + println(s"Model 2 was fit using parameters: ${model2.parent.extractParamMap}") // Prepare test data. 
val test = spark.createDataFrame(Seq( diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala index 3656773c8b817..ef78c0a1145ef 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala @@ -86,10 +86,10 @@ object GradientBoostedTreeClassifierExample { .setPredictionCol("prediction") .setMetricName("accuracy") val accuracy = evaluator.evaluate(predictions) - println("Test Error = " + (1.0 - accuracy)) + println(s"Test Error = ${1.0 - accuracy}") val gbtModel = model.stages(2).asInstanceOf[GBTClassificationModel] - println("Learned classification GBT model:\n" + gbtModel.toDebugString) + println(s"Learned classification GBT model:\n ${gbtModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala index e53aab7f326d3..3feb2343f6a85 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala @@ -73,10 +73,10 @@ object GradientBoostedTreeRegressorExample { .setPredictionCol("prediction") .setMetricName("rmse") val rmse = evaluator.evaluate(predictions) - println("Root Mean Squared Error (RMSE) on test data = " + rmse) + println(s"Root Mean Squared Error (RMSE) on test data = $rmse") val gbtModel = model.stages(1).asInstanceOf[GBTRegressionModel] - println("Learned regression GBT model:\n" + gbtModel.toDebugString) + println(s"Learned regression GBT model:\n ${gbtModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MulticlassLogisticRegressionWithElasticNetExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MulticlassLogisticRegressionWithElasticNetExample.scala index 42f0ace7a353d..3e61dbe628c20 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MulticlassLogisticRegressionWithElasticNetExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MulticlassLogisticRegressionWithElasticNetExample.scala @@ -48,7 +48,7 @@ object MulticlassLogisticRegressionWithElasticNetExample { // Print the coefficients and intercept for multinomial logistic regression println(s"Coefficients: \n${lrModel.coefficientMatrix}") - println(s"Intercepts: ${lrModel.interceptVector}") + println(s"Intercepts: \n${lrModel.interceptVector}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala index 6fce82d294f8d..646f46a925062 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala @@ -66,7 +66,7 @@ object MultilayerPerceptronClassifierExample { val evaluator = new MulticlassClassificationEvaluator() .setMetricName("accuracy") - println("Test set accuracy = " + evaluator.evaluate(predictionAndLabels)) + println(s"Test set accuracy = 
${evaluator.evaluate(predictionAndLabels)}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala index bd9fcc420a66c..50c70c626b128 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala @@ -52,7 +52,7 @@ object NaiveBayesExample { .setPredictionCol("prediction") .setMetricName("accuracy") val accuracy = evaluator.evaluate(predictions) - println("Test set accuracy = " + accuracy) + println(s"Test set accuracy = $accuracy") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala index aedb9e7d3bb70..0fe16fb6dfa9f 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala @@ -36,7 +36,7 @@ object QuantileDiscretizerExample { // Output of QuantileDiscretizer for such small datasets can depend on the number of // partitions. Here we force a single partition to ensure consistent results. // Note this is not necessary for normal use cases - .repartition(1) + .repartition(1) // $example on$ val discretizer = new QuantileDiscretizer() @@ -45,7 +45,7 @@ object QuantileDiscretizerExample { .setNumBuckets(3) val result = discretizer.fit(df).transform(df) - result.show() + result.show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala index 5eafda8ce4285..6265f83902528 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala @@ -85,10 +85,10 @@ object RandomForestClassifierExample { .setPredictionCol("prediction") .setMetricName("accuracy") val accuracy = evaluator.evaluate(predictions) - println("Test Error = " + (1.0 - accuracy)) + println(s"Test Error = ${(1.0 - accuracy)}") val rfModel = model.stages(2).asInstanceOf[RandomForestClassificationModel] - println("Learned classification forest model:\n" + rfModel.toDebugString) + println(s"Learned classification forest model:\n ${rfModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala index 9a0a001c26ef5..2679fcb353a8a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala @@ -72,10 +72,10 @@ object RandomForestRegressorExample { .setPredictionCol("prediction") .setMetricName("rmse") val rmse = evaluator.evaluate(predictions) - println("Root Mean Squared Error (RMSE) on test data = " + rmse) + println(s"Root Mean Squared Error (RMSE) on test data = $rmse") val rfModel = model.stages(1).asInstanceOf[RandomForestRegressionModel] - println("Learned regression forest model:\n" + rfModel.toDebugString) + println(s"Learned regression forest model:\n 
${rfModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala index afa761aee0b98..96bb8ea2338af 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala @@ -41,8 +41,8 @@ object VectorIndexerExample { val indexerModel = indexer.fit(data) val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet - println(s"Chose ${categoricalFeatures.size} categorical features: " + - categoricalFeatures.mkString(", ")) + println(s"Chose ${categoricalFeatures.size} " + + s"categorical features: ${categoricalFeatures.mkString(", ")}") // Create new column "indexed" with categorical values transformed to indices val indexedData = indexerModel.transform(data) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/AssociationRulesExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/AssociationRulesExample.scala index ff44de56839e5..a07535bb5a38d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/AssociationRulesExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/AssociationRulesExample.scala @@ -42,9 +42,8 @@ object AssociationRulesExample { val results = ar.run(freqItemsets) results.collect().foreach { rule => - println("[" + rule.antecedent.mkString(",") - + "=>" - + rule.consequent.mkString(",") + "]," + rule.confidence) + println(s"[${rule.antecedent.mkString(",")}=>${rule.consequent.mkString(",")} ]" + + s" ${rule.confidence}") } // $example off$ @@ -53,3 +52,4 @@ object AssociationRulesExample { } // scalastyle:on println + diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala index b9263ac6fcff6..c6312d71cc912 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala @@ -86,7 +86,7 @@ object BinaryClassificationMetricsExample { // AUPRC val auPRC = metrics.areaUnderPR - println("Area under precision-recall curve = " + auPRC) + println(s"Area under precision-recall curve = $auPRC") // Compute thresholds used in ROC and PR curves val thresholds = precision.map(_._1) @@ -96,7 +96,7 @@ object BinaryClassificationMetricsExample { // AUROC val auROC = metrics.areaUnderROC - println("Area under ROC = " + auROC) + println(s"Area under ROC = $auROC") // $example off$ sc.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeClassificationExample.scala index b50b4592777ce..c2f89b72c9a2e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeClassificationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeClassificationExample.scala @@ -55,8 +55,8 @@ object DecisionTreeClassificationExample { (point.label, prediction) } val testErr = labelAndPreds.filter(r => r._1 != r._2).count().toDouble / testData.count() - println("Test Error = " + testErr) - println("Learned classification tree model:\n" + model.toDebugString) + println(s"Test Error = 
$testErr") + println(s"Learned classification tree model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myDecisionTreeClassificationModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRegressionExample.scala index 2af45afae3d5b..1ecf6426e1f95 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRegressionExample.scala @@ -54,8 +54,8 @@ object DecisionTreeRegressionExample { (point.label, prediction) } val testMSE = labelsAndPredictions.map{ case (v, p) => math.pow(v - p, 2) }.mean() - println("Test Mean Squared Error = " + testMSE) - println("Learned regression tree model:\n" + model.toDebugString) + println(s"Test Mean Squared Error = $testMSE") + println(s"Learned regression tree model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myDecisionTreeRegressionModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala index 6435abc127752..f724ee1030f04 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala @@ -74,7 +74,7 @@ object FPGrowthExample { println(s"Number of frequent itemsets: ${model.freqItemsets.count()}") model.freqItemsets.collect().foreach { itemset => - println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq) + println(s"${itemset.items.mkString("[", ",", "]")}, ${itemset.freq}") } sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingClassificationExample.scala index 00bb3348d2a36..3c56e1941aeca 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingClassificationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingClassificationExample.scala @@ -54,8 +54,8 @@ object GradientBoostingClassificationExample { (point.label, prediction) } val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() - println("Test Error = " + testErr) - println("Learned classification GBT model:\n" + model.toDebugString) + println(s"Test Error = $testErr") + println(s"Learned classification GBT model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myGradientBoostingClassificationModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingRegressionExample.scala index d8c263460839b..c288bf29bf255 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingRegressionExample.scala @@ -53,8 +53,8 @@ object GradientBoostingRegressionExample { (point.label, prediction) } val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean() - println("Test Mean Squared Error = " + testMSE) - println("Learned regression GBT model:\n" + model.toDebugString) + println(s"Test Mean Squared Error = $testMSE") + 
println(s"Learned regression GBT model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myGradientBoostingRegressionModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index 0d391a3637c07..add1719739539 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -68,7 +68,7 @@ object HypothesisTestingExample { // against the label. val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs) featureTestResults.zipWithIndex.foreach { case (k, v) => - println("Column " + (v + 1).toString + ":") + println(s"Column ${(v + 1)} :") println(k) } // summary of the test // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala index 4aee951f5b04c..a10d6f0dda880 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala @@ -56,7 +56,7 @@ object IsotonicRegressionExample { // Calculate mean squared error between predicted and real labels. val meanSquaredError = predictionAndLabel.map { case (p, l) => math.pow((p - l), 2) }.mean() - println("Mean Squared Error = " + meanSquaredError) + println(s"Mean Squared Error = $meanSquaredError") // Save and load model model.save(sc, "target/tmp/myIsotonicRegressionModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KMeansExample.scala index c4d71d862f375..b0a6f1671a898 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/KMeansExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KMeansExample.scala @@ -43,7 +43,7 @@ object KMeansExample { // Evaluate clustering by computing Within Set Sum of Squared Errors val WSSSE = clusters.computeCost(parsedData) - println("Within Set Sum of Squared Errors = " + WSSSE) + println(s"Within Set Sum of Squared Errors = $WSSSE") // Save and load model clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LBFGSExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LBFGSExample.scala index fedcefa098381..123782fa6b9cf 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/LBFGSExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LBFGSExample.scala @@ -82,7 +82,7 @@ object LBFGSExample { println("Loss of each step in training process") loss.foreach(println) - println("Area under ROC = " + auROC) + println(s"Area under ROC = $auROC") // $example off$ sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LatentDirichletAllocationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LatentDirichletAllocationExample.scala index f2c8ec01439f1..d25962c5500ed 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/LatentDirichletAllocationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LatentDirichletAllocationExample.scala @@ -42,11 +42,13 @@ object 
LatentDirichletAllocationExample { val ldaModel = new LDA().setK(3).run(corpus) // Output topics. Each is a distribution over words (matching word count vectors) - println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize + " words):") + println(s"Learned topics (as distributions over vocab of ${ldaModel.vocabSize} words):") val topics = ldaModel.topicsMatrix for (topic <- Range(0, 3)) { - print("Topic " + topic + ":") - for (word <- Range(0, ldaModel.vocabSize)) { print(" " + topics(word, topic)); } + print(s"Topic $topic :") + for (word <- Range(0, ldaModel.vocabSize)) { + print(s"${topics(word, topic)}") + } println() } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegressionWithSGDExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegressionWithSGDExample.scala index d399618094487..449b725d1d173 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegressionWithSGDExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegressionWithSGDExample.scala @@ -52,7 +52,7 @@ object LinearRegressionWithSGDExample { (point.label, prediction) } val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2) }.mean() - println("training Mean Squared Error = " + MSE) + println(s"training Mean Squared Error $MSE") // Save and load model model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala index eb36697d94ba1..eff2393cc3abe 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala @@ -65,8 +65,8 @@ object PCAExample { val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean() val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean() - println("Mean Squared Error = " + MSE) - println("PCA Mean Squared Error = " + MSE_pca) + println(s"Mean Squared Error = $MSE") + println(s"PCA Mean Squared Error = $MSE_pca") // $example off$ sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PMMLModelExportExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PMMLModelExportExample.scala index d74d74a37fb11..96deafd469bc7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/PMMLModelExportExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PMMLModelExportExample.scala @@ -41,7 +41,7 @@ object PMMLModelExportExample { val clusters = KMeans.train(parsedData, numClusters, numIterations) // Export to PMML to a String in PMML format - println("PMML Model:\n" + clusters.toPMML) + println(s"PMML Model:\n ${clusters.toPMML}") // Export the model to a local file in PMML format clusters.toPMML("/tmp/kmeans.xml") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PrefixSpanExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PrefixSpanExample.scala index 69c72c4336576..8b789277774af 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/PrefixSpanExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PrefixSpanExample.scala @@ -42,8 +42,8 @@ object PrefixSpanExample { val model = prefixSpan.run(sequences) model.freqSequences.collect().foreach { freqSequence => println( - freqSequence.sequence.map(_.mkString("[", ", ", 
"]")).mkString("[", ", ", "]") + - ", " + freqSequence.freq) + s"${freqSequence.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]")}," + + s" ${freqSequence.freq}") } // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestClassificationExample.scala index f1ebdf1a733ed..246e71de25615 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestClassificationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestClassificationExample.scala @@ -55,8 +55,8 @@ object RandomForestClassificationExample { (point.label, prediction) } val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() - println("Test Error = " + testErr) - println("Learned classification forest model:\n" + model.toDebugString) + println(s"Test Error = $testErr") + println(s"Learned classification forest model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myRandomForestClassificationModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestRegressionExample.scala index 11d612e651b4b..770e30276bc30 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestRegressionExample.scala @@ -55,8 +55,8 @@ object RandomForestRegressionExample { (point.label, prediction) } val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean() - println("Test Mean Squared Error = " + testMSE) - println("Learned regression forest model:\n" + model.toDebugString) + println(s"Test Mean Squared Error = $testMSE") + println(s"Learned regression forest model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myRandomForestRegressionModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RecommendationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RecommendationExample.scala index 6df742d737e70..0bb2b8c8c2b43 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RecommendationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RecommendationExample.scala @@ -56,7 +56,7 @@ object RecommendationExample { val err = (r1 - r2) err * err }.mean() - println("Mean Squared Error = " + MSE) + println(s"Mean Squared Error = $MSE") // Save and load model model.save(sc, "target/tmp/myCollaborativeFilter") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SVMWithSGDExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SVMWithSGDExample.scala index b73fe9b2b3faa..285e2ce512639 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/SVMWithSGDExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SVMWithSGDExample.scala @@ -57,7 +57,7 @@ object SVMWithSGDExample { val metrics = new BinaryClassificationMetrics(scoreAndLabels) val auROC = metrics.areaUnderROC() - println("Area under ROC = " + auROC) + println(s"Area under ROC = $auROC") // Save and load model model.save(sc, "target/tmp/scalaSVMWithSGDModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SimpleFPGrowth.scala 
b/examples/src/main/scala/org/apache/spark/examples/mllib/SimpleFPGrowth.scala index b5c3033bcba09..694c3bb18b045 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/SimpleFPGrowth.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SimpleFPGrowth.scala @@ -42,15 +42,13 @@ object SimpleFPGrowth { val model = fpg.run(transactions) model.freqItemsets.collect().foreach { itemset => - println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq) + println(s"${itemset.items.mkString("[", ",", "]")},${itemset.freq}") } val minConfidence = 0.8 model.generateAssociationRules(minConfidence).collect().foreach { rule => - println( - rule.antecedent.mkString("[", ",", "]") - + " => " + rule.consequent .mkString("[", ",", "]") - + ", " + rule.confidence) + println(s"${rule.antecedent.mkString("[", ",", "]")}=> " + + s"${rule.consequent .mkString("[", ",", "]")},${rule.confidence}") } // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala index 16b074ef60699..3d41bef0af88c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala @@ -41,10 +41,10 @@ object StratifiedSamplingExample { val exactSample = data.sampleByKeyExact(withReplacement = false, fractions = fractions) // $example off$ - println("approxSample size is " + approxSample.collect().size.toString) + println(s"approxSample size is ${approxSample.collect().size}") approxSample.collect().foreach(println) - println("exactSample its size is " + exactSample.collect().size.toString) + println(s"exactSample its size is ${exactSample.collect().size}") exactSample.collect().foreach(println) sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala index 03bc675299c5a..071d341b81614 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala @@ -54,7 +54,7 @@ object TallSkinnyPCA { // Compute principal components. val pc = mat.computePrincipalComponents(mat.numCols().toInt) - println("Principal components are:\n" + pc) + println(s"Principal components are:\n $pc") sc.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala index 067e49b9599e7..8ae6de16d80e7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala @@ -54,7 +54,7 @@ object TallSkinnySVD { // Compute SVD. 
val svd = mat.computeSVD(mat.numCols().toInt) - println("Singular values are " + svd.s) + println(s"Singular values are ${svd.s}") sc.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala index 43044d01b1204..25c7bf2871972 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala @@ -82,9 +82,9 @@ class CustomReceiver(host: String, port: Int) var socket: Socket = null var userInput: String = null try { - logInfo("Connecting to " + host + ":" + port) + logInfo(s"Connecting to $host : $port") socket = new Socket(host, port) - logInfo("Connected to " + host + ":" + port) + logInfo(s"Connected to $host : $port") val reader = new BufferedReader( new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8)) userInput = reader.readLine() @@ -98,7 +98,7 @@ class CustomReceiver(host: String, port: Int) restart("Trying to connect again") } catch { case e: java.net.ConnectException => - restart("Error connecting to " + host + ":" + port, e) + restart(s"Error connecting to $host : $port", e) case t: Throwable => restart("Error receiving data", t) } diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/RawNetworkGrep.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/RawNetworkGrep.scala index 5322929d177b4..437ccf0898d7c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/RawNetworkGrep.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/RawNetworkGrep.scala @@ -54,7 +54,7 @@ object RawNetworkGrep { ssc.rawSocketStream[String](host, port, StorageLevel.MEMORY_ONLY_SER_2)).toArray val union = ssc.union(rawStreams) union.filter(_.contains("the")).count().foreachRDD(r => - println("Grep count: " + r.collect().mkString)) + println(s"Grep count: ${r.collect().mkString}")) ssc.start() ssc.awaitTermination() } diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/RecoverableNetworkWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/RecoverableNetworkWordCount.scala index 49c0427321133..f018f3a26d2e9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/RecoverableNetworkWordCount.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/RecoverableNetworkWordCount.scala @@ -130,10 +130,10 @@ object RecoverableNetworkWordCount { true } }.collect().mkString("[", ", ", "]") - val output = "Counts at time " + time + " " + counts + val output = s"Counts at time $time $counts" println(output) - println("Dropped " + droppedWordsCounter.value + " word(s) totally") - println("Appending to " + outputFile.getAbsolutePath) + println(s"Dropped ${droppedWordsCounter.value} word(s) totally") + println(s"Appending to ${outputFile.getAbsolutePath}") Files.append(output + "\n", outputFile, Charset.defaultCharset()) } ssc @@ -141,7 +141,7 @@ object RecoverableNetworkWordCount { def main(args: Array[String]) { if (args.length != 4) { - System.err.println("Your arguments were " + args.mkString("[", ", ", "]")) + System.err.println(s"Your arguments were ${args.mkString("[", ", ", "]")}") System.err.println( """ |Usage: RecoverableNetworkWordCount diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala 
b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala index 0ddd065f0db2b..2108bc63edea2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala @@ -90,13 +90,13 @@ object PageViewGenerator { val viewsPerSecond = args(1).toFloat val sleepDelayMs = (1000.0 / viewsPerSecond).toInt val listener = new ServerSocket(port) - println("Listening on port: " + port) + println(s"Listening on port: $port") while (true) { val socket = listener.accept() new Thread() { override def run(): Unit = { - println("Got client connected from: " + socket.getInetAddress) + println(s"Got client connected from: ${socket.getInetAddress}") val out = new PrintWriter(socket.getOutputStream(), true) while (true) { diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewStream.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewStream.scala index 1ba093f57b32c..b8e7c7e9e9152 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewStream.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewStream.scala @@ -104,8 +104,8 @@ object PageViewStream { .foreachRDD((rdd, time) => rdd.join(userList) .map(_._2._2) .take(10) - .foreach(u => println("Saw user %s at time %s".format(u, time)))) - case _ => println("Invalid metric entered: " + metric) + .foreach(u => println(s"Saw user $u at time $time"))) + case _ => println(s"Invalid metric entered: $metric") } ssc.start() From a51212b642f05f28447b80aa29f5482de2c27f58 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 4 Jan 2018 07:28:53 +0800 Subject: [PATCH 007/566] [SPARK-20960][SQL] make ColumnVector public ## What changes were proposed in this pull request? move `ColumnVector` and related classes to `org.apache.spark.sql.vectorized`, and improve the document. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #20116 from cloud-fan/column-vector. 
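A minimal sketch of what the move enables for code outside `org.apache.spark.sql.execution`: only the class and method names from the now-public `org.apache.spark.sql.vectorized` package are taken from this patch; the helper itself (`ColumnarBatchSum`, the chosen column ordinal and integer type) is hypothetical.

```
// Sums an int column of a ColumnarBatch produced elsewhere (e.g. by the
// vectorized Parquet reader), using only the public read API.
import org.apache.spark.sql.vectorized.ColumnVector;
import org.apache.spark.sql.vectorized.ColumnarBatch;

public final class ColumnarBatchSum {
  // rowId is the batch-local 0-based row id described in the ColumnVector docs.
  public static long sumIntColumn(ColumnarBatch batch, int ordinal) {
    ColumnVector col = batch.column(ordinal);
    long sum = 0L;
    for (int rowId = 0; rowId < batch.numRows(); rowId++) {
      if (!col.isNullAt(rowId)) {
        sum += col.getInt(rowId);
      }
    }
    return sum;
  }
}
```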
(cherry picked from commit b297029130735316e1ac1144dee44761a12bfba7) Signed-off-by: gatorsmile --- .../VectorizedParquetRecordReader.java | 7 ++- .../vectorized/ColumnVectorUtils.java | 2 + .../vectorized/MutableColumnarRow.java | 4 ++ .../vectorized/WritableColumnVector.java | 7 ++- .../vectorized/ArrowColumnVector.java | 62 +------------------ .../vectorized/ColumnVector.java | 31 ++++++---- .../vectorized/ColumnarArray.java | 7 +-- .../vectorized/ColumnarBatch.java | 34 +++------- .../vectorized/ColumnarRow.java | 7 +-- .../sql/execution/ColumnarBatchScan.scala | 4 +- .../aggregate/HashAggregateExec.scala | 2 +- .../VectorizedHashMapGenerator.scala | 3 +- .../sql/execution/arrow/ArrowConverters.scala | 2 +- .../columnar/InMemoryTableScanExec.scala | 1 + .../execution/datasources/FileScanRDD.scala | 2 +- .../execution/python/ArrowPythonRunner.scala | 2 +- .../execution/arrow/ArrowWriterSuite.scala | 2 +- .../vectorized/ArrowColumnVectorSuite.scala | 1 + .../vectorized/ColumnVectorSuite.scala | 2 +- .../vectorized/ColumnarBatchSuite.scala | 6 +- 20 files changed, 63 insertions(+), 125 deletions(-) rename sql/core/src/main/java/org/apache/spark/sql/{execution => }/vectorized/ArrowColumnVector.java (94%) rename sql/core/src/main/java/org/apache/spark/sql/{execution => }/vectorized/ColumnVector.java (79%) rename sql/core/src/main/java/org/apache/spark/sql/{execution => }/vectorized/ColumnarArray.java (95%) rename sql/core/src/main/java/org/apache/spark/sql/{execution => }/vectorized/ColumnarBatch.java (73%) rename sql/core/src/main/java/org/apache/spark/sql/{execution => }/vectorized/ColumnarRow.java (96%) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java index 6c157e85d411f..cd745b1f0e4e3 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java @@ -31,10 +31,10 @@ import org.apache.spark.memory.MemoryMode; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.execution.vectorized.ColumnVectorUtils; -import org.apache.spark.sql.execution.vectorized.ColumnarBatch; import org.apache.spark.sql.execution.vectorized.WritableColumnVector; import org.apache.spark.sql.execution.vectorized.OffHeapColumnVector; import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector; +import org.apache.spark.sql.vectorized.ColumnarBatch; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; @@ -248,7 +248,10 @@ public void enableReturningBatches() { * Advances to the next batch of rows. Returns false if there are no more. 
*/ public boolean nextBatch() throws IOException { - columnarBatch.reset(); + for (WritableColumnVector vector : columnVectors) { + vector.reset(); + } + columnarBatch.setNumRows(0); if (rowsReturned >= totalRowCount) return false; checkEndOfRowGroup(); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java index bc62bc43484e5..b5cbe8e2839ba 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java @@ -28,6 +28,8 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.util.DateTimeUtils; import org.apache.spark.sql.types.*; +import org.apache.spark.sql.vectorized.ColumnarArray; +import org.apache.spark.sql.vectorized.ColumnarBatch; import org.apache.spark.unsafe.types.CalendarInterval; import org.apache.spark.unsafe.types.UTF8String; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java index 06602c147dfe9..70057a9def6c0 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java @@ -23,6 +23,10 @@ import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; import org.apache.spark.sql.catalyst.util.MapData; import org.apache.spark.sql.types.*; +import org.apache.spark.sql.vectorized.ColumnarArray; +import org.apache.spark.sql.vectorized.ColumnarBatch; +import org.apache.spark.sql.vectorized.ColumnarRow; +import org.apache.spark.sql.vectorized.ColumnVector; import org.apache.spark.unsafe.types.CalendarInterval; import org.apache.spark.unsafe.types.UTF8String; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index 5f6f125976e12..d2ae32b06f83b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -23,6 +23,7 @@ import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.types.*; +import org.apache.spark.sql.vectorized.ColumnVector; import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.types.UTF8String; @@ -585,11 +586,11 @@ public final int appendArray(int length) { public final int appendStruct(boolean isNull) { if (isNull) { appendNull(); - for (ColumnVector c: childColumns) { + for (WritableColumnVector c: childColumns) { if (c.type instanceof StructType) { - ((WritableColumnVector) c).appendStruct(true); + c.appendStruct(true); } else { - ((WritableColumnVector) c).appendNull(); + c.appendNull(); } } } else { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java similarity index 94% rename from sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java rename to sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java index af5673e26a501..708333213f3f1 100644 --- 
a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.execution.vectorized; +package org.apache.spark.sql.vectorized; import org.apache.arrow.vector.*; import org.apache.arrow.vector.complex.*; @@ -34,11 +34,7 @@ public final class ArrowColumnVector extends ColumnVector { private ArrowColumnVector[] childColumns; private void ensureAccessible(int index) { - int valueCount = accessor.getValueCount(); - if (index < 0 || index >= valueCount) { - throw new IndexOutOfBoundsException( - String.format("index: %d, valueCount: %d", index, valueCount)); - } + ensureAccessible(index, 1); } private void ensureAccessible(int index, int count) { @@ -64,20 +60,12 @@ public void close() { accessor.close(); } - // - // APIs dealing with nulls - // - @Override public boolean isNullAt(int rowId) { ensureAccessible(rowId); return accessor.isNullAt(rowId); } - // - // APIs dealing with Booleans - // - @Override public boolean getBoolean(int rowId) { ensureAccessible(rowId); @@ -94,10 +82,6 @@ public boolean[] getBooleans(int rowId, int count) { return array; } - // - // APIs dealing with Bytes - // - @Override public byte getByte(int rowId) { ensureAccessible(rowId); @@ -114,10 +98,6 @@ public byte[] getBytes(int rowId, int count) { return array; } - // - // APIs dealing with Shorts - // - @Override public short getShort(int rowId) { ensureAccessible(rowId); @@ -134,10 +114,6 @@ public short[] getShorts(int rowId, int count) { return array; } - // - // APIs dealing with Ints - // - @Override public int getInt(int rowId) { ensureAccessible(rowId); @@ -154,10 +130,6 @@ public int[] getInts(int rowId, int count) { return array; } - // - // APIs dealing with Longs - // - @Override public long getLong(int rowId) { ensureAccessible(rowId); @@ -174,10 +146,6 @@ public long[] getLongs(int rowId, int count) { return array; } - // - // APIs dealing with floats - // - @Override public float getFloat(int rowId) { ensureAccessible(rowId); @@ -194,10 +162,6 @@ public float[] getFloats(int rowId, int count) { return array; } - // - // APIs dealing with doubles - // - @Override public double getDouble(int rowId) { ensureAccessible(rowId); @@ -214,10 +178,6 @@ public double[] getDoubles(int rowId, int count) { return array; } - // - // APIs dealing with Arrays - // - @Override public int getArrayLength(int rowId) { ensureAccessible(rowId); @@ -230,45 +190,27 @@ public int getArrayOffset(int rowId) { return accessor.getArrayOffset(rowId); } - // - // APIs dealing with Decimals - // - @Override public Decimal getDecimal(int rowId, int precision, int scale) { ensureAccessible(rowId); return accessor.getDecimal(rowId, precision, scale); } - // - // APIs dealing with UTF8Strings - // - @Override public UTF8String getUTF8String(int rowId) { ensureAccessible(rowId); return accessor.getUTF8String(rowId); } - // - // APIs dealing with Binaries - // - @Override public byte[] getBinary(int rowId) { ensureAccessible(rowId); return accessor.getBinary(rowId); } - /** - * Returns the data for the underlying array. - */ @Override public ArrowColumnVector arrayData() { return childColumns[0]; } - /** - * Returns the ordinal's child data column. 
- */ @Override public ArrowColumnVector getChildColumn(int ordinal) { return childColumns[ordinal]; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java similarity index 79% rename from sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java rename to sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java index dc7c1269bedd9..d1196e1299fee 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.execution.vectorized; +package org.apache.spark.sql.vectorized; import org.apache.spark.sql.catalyst.util.MapData; import org.apache.spark.sql.types.DataType; @@ -22,24 +22,31 @@ import org.apache.spark.unsafe.types.UTF8String; /** - * This class represents in-memory values of a column and provides the main APIs to access the data. - * It supports all the types and contains get APIs as well as their batched versions. The batched - * versions are considered to be faster and preferable whenever possible. + * An interface representing in-memory columnar data in Spark. This interface defines the main APIs + * to access the data, as well as their batched versions. The batched versions are considered to be + * faster and preferable whenever possible. * - * To handle nested schemas, ColumnVector has two types: Arrays and Structs. In both cases these - * columns have child columns. All of the data are stored in the child columns and the parent column - * only contains nullability. In the case of Arrays, the lengths and offsets are saved in the child - * column and are encoded identically to INTs. + * Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values + * in this ColumnVector. * - * Maps are just a special case of a two field struct. + * ColumnVector supports all the data types including nested types. To handle nested types, + * ColumnVector can have children and is a tree structure. For struct type, it stores the actual + * data of each field in the corresponding child ColumnVector, and only stores null information in + * the parent ColumnVector. For array type, it stores the actual array elements in the child + * ColumnVector, and stores null information, array offsets and lengths in the parent ColumnVector. * - * Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values - * in the current batch. + * ColumnVector is expected to be reused during the entire data loading process, to avoid allocating + * memory again and again. + * + * ColumnVector is meant to maximize CPU efficiency but not to minimize storage footprint. + * Implementations should prefer computing efficiency over storage efficiency when design the + * format. Since it is expected to reuse the ColumnVector instance while loading data, the storage + * footprint is negligible. */ public abstract class ColumnVector implements AutoCloseable { /** - * Returns the data type of this column. + * Returns the data type of this column vector. 
*/ public final DataType dataType() { return type; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarArray.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java similarity index 95% rename from sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarArray.java rename to sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java index cbc39d1d0aec2..0d89a52e7a4fe 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarArray.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.execution.vectorized; +package org.apache.spark.sql.vectorized; import org.apache.spark.sql.catalyst.util.ArrayData; import org.apache.spark.sql.catalyst.util.MapData; @@ -23,8 +23,7 @@ import org.apache.spark.unsafe.types.UTF8String; /** - * Array abstraction in {@link ColumnVector}. The instance of this class is intended - * to be reused, callers should copy the data out if it needs to be stored. + * Array abstraction in {@link ColumnVector}. */ public final class ColumnarArray extends ArrayData { // The data for this array. This array contains elements from @@ -33,7 +32,7 @@ public final class ColumnarArray extends ArrayData { private final int offset; private final int length; - ColumnarArray(ColumnVector data, int offset, int length) { + public ColumnarArray(ColumnVector data, int offset, int length) { this.data = data; this.offset = offset; this.length = length; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java similarity index 73% rename from sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java rename to sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java index a9d09aa679726..9ae1c6d9993f0 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java @@ -14,26 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.execution.vectorized; +package org.apache.spark.sql.vectorized; import java.util.*; import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.execution.vectorized.MutableColumnarRow; import org.apache.spark.sql.types.StructType; /** - * This class is the in memory representation of rows as they are streamed through operators. It - * is designed to maximize CPU efficiency and not storage footprint. Since it is expected that - * each operator allocates one of these objects, the storage footprint on the task is negligible. - * - * The layout is a columnar with values encoded in their native format. Each RowBatch contains - * a horizontal partitioning of the data, split into columns. - * - * The ColumnarBatch supports either on heap or offheap modes with (mostly) the identical API. - * - * TODO: - * - There are many TODOs for the existing APIs. They should throw a not implemented exception. - * - Compaction: The batch and columns should be able to compact based on a selection vector. + * This class wraps multiple ColumnVectors as a row-wise table. 
It provides a row view of this + * batch so that Spark can access the data row by row. Instance of it is meant to be reused during + * the entire data loading process. */ public final class ColumnarBatch { public static final int DEFAULT_BATCH_SIZE = 4 * 1024; @@ -57,7 +49,7 @@ public void close() { } /** - * Returns an iterator over the rows in this batch. This skips rows that are filtered out. + * Returns an iterator over the rows in this batch. */ public Iterator rowIterator() { final int maxRows = numRows; @@ -87,19 +79,7 @@ public void remove() { } /** - * Resets the batch for writing. - */ - public void reset() { - for (int i = 0; i < numCols(); ++i) { - if (columns[i] instanceof WritableColumnVector) { - ((WritableColumnVector) columns[i]).reset(); - } - } - this.numRows = 0; - } - - /** - * Sets the number of rows that are valid. + * Sets the number of rows in this batch. */ public void setNumRows(int numRows) { assert(numRows <= this.capacity); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java similarity index 96% rename from sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java rename to sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java index 8bb33ed5b78c0..3c6656dec77cd 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.execution.vectorized; +package org.apache.spark.sql.vectorized; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; @@ -24,8 +24,7 @@ import org.apache.spark.unsafe.types.UTF8String; /** - * Row abstraction in {@link ColumnVector}. The instance of this class is intended - * to be reused, callers should copy the data out if it needs to be stored. + * Row abstraction in {@link ColumnVector}. */ public final class ColumnarRow extends InternalRow { // The data for this row. @@ -34,7 +33,7 @@ public final class ColumnarRow extends InternalRow { private final int rowId; private final int numFields; - ColumnarRow(ColumnVector data, int rowId) { + public ColumnarRow(ColumnVector data, int rowId) { assert (data.dataType() instanceof StructType); this.data = data; this.rowId = rowId; diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala index 782cec5e292ba..5617046e1396e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala @@ -20,13 +20,13 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.execution.metric.SQLMetrics -import org.apache.spark.sql.execution.vectorized.{ColumnarBatch, ColumnVector} import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} /** * Helper trait for abstracting scan functionality using - * [[org.apache.spark.sql.execution.vectorized.ColumnarBatch]]es. + * [[ColumnarBatch]]es. 
*/ private[sql] trait ColumnarBatchScan extends CodegenSupport { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index 9a6f1c6dfa6a9..ce3c68810f3b6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} -import org.apache.spark.sql.execution.vectorized.{ColumnarRow, MutableColumnarRow} +import org.apache.spark.sql.execution.vectorized.MutableColumnarRow import org.apache.spark.sql.types.{DecimalType, StringType, StructType} import org.apache.spark.unsafe.KVIterator import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala index 0380ee8b09d63..0cf9b53ce1d5d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala @@ -20,8 +20,9 @@ package org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext -import org.apache.spark.sql.execution.vectorized.{ColumnarBatch, MutableColumnarRow, OnHeapColumnVector} +import org.apache.spark.sql.execution.vectorized.{MutableColumnarRow, OnHeapColumnVector} import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ColumnarBatch /** * This is a helper class to generate an append-only vectorized hash map that can act as a 'cache' diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala index bcfc412430263..bcd1aa0890ba3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala @@ -32,8 +32,8 @@ import org.apache.spark.TaskContext import org.apache.spark.api.java.JavaRDD import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index 3e73393b12850..933b9753faa61 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partition import 
org.apache.spark.sql.execution.{ColumnarBatchScan, LeafExecNode, WholeStageCodegenExec} import org.apache.spark.sql.execution.vectorized._ import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} case class InMemoryTableScanExec( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala index 8731ee88f87f2..835ce98462477 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala @@ -26,7 +26,7 @@ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{InputFileBlockHolder, RDD} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.vectorized.ColumnarBatch +import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.NextIterator /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala index 5cc8ed3535654..dc5ba96e69aec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala @@ -30,8 +30,8 @@ import org.apache.spark._ import org.apache.spark.api.python._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.arrow.{ArrowUtils, ArrowWriter} -import org.apache.spark.sql.execution.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} import org.apache.spark.util.Utils /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala index 508c116aae92e..c42bc60a59d67 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala @@ -20,8 +20,8 @@ package org.apache.spark.sql.execution.arrow import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.ArrayData -import org.apache.spark.sql.execution.vectorized.ArrowColumnVector import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ArrowColumnVector import org.apache.spark.unsafe.types.UTF8String class ArrowWriterSuite extends SparkFunSuite { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala index 03490ad15a655..7304803a092c0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala @@ -23,6 +23,7 @@ import org.apache.arrow.vector.complex._ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.execution.arrow.ArrowUtils import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ArrowColumnVector import org.apache.spark.unsafe.types.UTF8String class 
ArrowColumnVectorSuite extends SparkFunSuite { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala index 54b31cee031f6..944240f3bade5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala @@ -21,10 +21,10 @@ import org.scalatest.BeforeAndAfterEach import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow -import org.apache.spark.sql.catalyst.util.ArrayData import org.apache.spark.sql.execution.columnar.ColumnAccessor import org.apache.spark.sql.execution.columnar.compression.ColumnBuilderHelper import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ColumnarArray import org.apache.spark.unsafe.types.UTF8String class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 7848ebdcab6d0..675f06b31b970 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.{RandomDataGenerator, Row} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.arrow.ArrowUtils import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.types.CalendarInterval @@ -918,10 +919,7 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(it.hasNext == false) // Reset and add 3 rows - batch.reset() - assert(batch.numRows() == 0) - assert(batch.rowIterator().hasNext == false) - + columns.foreach(_.reset()) // Add rows [NULL, 2.2, 2, "abc"], [3, NULL, 3, ""], [4, 4.4, 4, "world] columns(0).putNull(0) columns(1).putDouble(0, 2.2) From f51c8fde8bf08705bacf8a93b5dba685ebbcec17 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 4 Jan 2018 13:14:52 +0800 Subject: [PATCH 008/566] [SPARK-22944][SQL] improve FoldablePropagation ## What changes were proposed in this pull request? `FoldablePropagation` is a little tricky as it needs to handle attributes that are miss-derived from children, e.g. outer join outputs. This rule does a kind of stop-able tree transform, to skip to apply this rule when hit a node which may have miss-derived attributes. Logically we should be able to apply this rule above the unsupported nodes, by just treating the unsupported nodes as leaf nodes. This PR improves this rule to not stop the tree transformation, but reduce the foldable expressions that we want to propagate. ## How was this patch tested? existing tests Author: Wenchen Fan Closes #20139 from cloud-fan/foldable. 
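As a rough illustration of the effect described above (not code from this patch), the driver below builds the kind of query the rule targets: `x` and `y` are foldable aliases, so once their uses are replaced by literals, other rules can remove the ORDER BY entirely. The class name and the local-mode session are illustrative only.

```
// Hypothetical driver: compare the analyzed and optimized plans printed by
// explain(true); the Sort over constant keys should be gone after optimization.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public final class FoldableOrderByExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("FoldableOrderByExample")
        .master("local[*]")
        .getOrCreate();
    Dataset<Row> df = spark.range(3)
        .selectExpr("1.0 AS x", "'abc' AS y", "id")
        .orderBy("x", "y");
    df.explain(true);
    spark.stop();
  }
}
```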
(cherry picked from commit 7d045c5f00e2c7c67011830e2169a4e130c3ace8) Signed-off-by: gatorsmile --- .../sql/catalyst/optimizer/expressions.scala | 65 +++++++++++-------- .../optimizer/FoldablePropagationSuite.scala | 23 ++++++- 2 files changed, 58 insertions(+), 30 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index 7d830bbb7dc32..1c0b7bd806801 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -506,18 +506,21 @@ object NullPropagation extends Rule[LogicalPlan] { /** - * Propagate foldable expressions: * Replace attributes with aliases of the original foldable expressions if possible. - * Other optimizations will take advantage of the propagated foldable expressions. - * + * Other optimizations will take advantage of the propagated foldable expressions. For example, + * this rule can optimize * {{{ * SELECT 1.0 x, 'abc' y, Now() z ORDER BY x, y, 3 - * ==> SELECT 1.0 x, 'abc' y, Now() z ORDER BY 1.0, 'abc', Now() * }}} + * to + * {{{ + * SELECT 1.0 x, 'abc' y, Now() z ORDER BY 1.0, 'abc', Now() + * }}} + * and other rules can further optimize it and remove the ORDER BY operator. */ object FoldablePropagation extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = { - val foldableMap = AttributeMap(plan.flatMap { + var foldableMap = AttributeMap(plan.flatMap { case Project(projectList, _) => projectList.collect { case a: Alias if a.child.foldable => (a.toAttribute, a) } @@ -530,38 +533,44 @@ object FoldablePropagation extends Rule[LogicalPlan] { if (foldableMap.isEmpty) { plan } else { - var stop = false CleanupAliases(plan.transformUp { - // A leaf node should not stop the folding process (note that we are traversing up the - // tree, starting at the leaf nodes); so we are allowing it. - case l: LeafNode => - l - // We can only propagate foldables for a subset of unary nodes. - case u: UnaryNode if !stop && canPropagateFoldables(u) => + case u: UnaryNode if foldableMap.nonEmpty && canPropagateFoldables(u) => u.transformExpressions(replaceFoldable) - // Allow inner joins. We do not allow outer join, although its output attributes are - // derived from its children, they are actually different attributes: the output of outer - // join is not always picked from its children, but can also be null. + // Join derives the output attributes from its child while they are actually not the + // same attributes. For example, the output of outer join is not always picked from its + // children, but can also be null. We should exclude these miss-derived attributes when + // propagating the foldable expressions. // TODO(cloud-fan): It seems more reasonable to use new attributes as the output attributes // of outer join. - case j @ Join(_, _, Inner, _) if !stop => - j.transformExpressions(replaceFoldable) - - // We can fold the projections an expand holds. However expand changes the output columns - // and often reuses the underlying attributes; so we cannot assume that a column is still - // foldable after the expand has been applied. - // TODO(hvanhovell): Expand should use new attributes as the output attributes. 
- case expand: Expand if !stop => - val newExpand = expand.copy(projections = expand.projections.map { projection => + case j @ Join(left, right, joinType, _) if foldableMap.nonEmpty => + val newJoin = j.transformExpressions(replaceFoldable) + val missDerivedAttrsSet: AttributeSet = AttributeSet(joinType match { + case _: InnerLike | LeftExistence(_) => Nil + case LeftOuter => right.output + case RightOuter => left.output + case FullOuter => left.output ++ right.output + }) + foldableMap = AttributeMap(foldableMap.baseMap.values.filterNot { + case (attr, _) => missDerivedAttrsSet.contains(attr) + }.toSeq) + newJoin + + // We can not replace the attributes in `Expand.output`. If there are other non-leaf + // operators that have the `output` field, we should put them here too. + case expand: Expand if foldableMap.nonEmpty => + expand.copy(projections = expand.projections.map { projection => projection.map(_.transform(replaceFoldable)) }) - stop = true - newExpand - case other => - stop = true + // For other plans, they are not safe to apply foldable propagation, and they should not + // propagate foldable expressions from children. + case other if foldableMap.nonEmpty => + val childrenOutputSet = AttributeSet(other.children.flatMap(_.output)) + foldableMap = AttributeMap(foldableMap.baseMap.values.filterNot { + case (attr, _) => childrenOutputSet.contains(attr) + }.toSeq) other }) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FoldablePropagationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FoldablePropagationSuite.scala index dccb32f0379a8..c28844642aed0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FoldablePropagationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FoldablePropagationSuite.scala @@ -147,8 +147,8 @@ class FoldablePropagationSuite extends PlanTest { test("Propagate in expand") { val c1 = Literal(1).as('a) val c2 = Literal(2).as('b) - val a1 = c1.toAttribute.withNullability(true) - val a2 = c2.toAttribute.withNullability(true) + val a1 = c1.toAttribute.newInstance().withNullability(true) + val a2 = c2.toAttribute.newInstance().withNullability(true) val expand = Expand( Seq(Seq(Literal(null), 'b), Seq('a, Literal(null))), Seq(a1, a2), @@ -161,4 +161,23 @@ class FoldablePropagationSuite extends PlanTest { val correctAnswer = correctExpand.where(a1.isNotNull).select(a1, a2).analyze comparePlans(optimized, correctAnswer) } + + test("Propagate above outer join") { + val left = LocalRelation('a.int).select('a, Literal(1).as('b)) + val right = LocalRelation('c.int).select('c, Literal(1).as('d)) + + val join = left.join( + right, + joinType = LeftOuter, + condition = Some('a === 'c && 'b === 'd)) + val query = join.select(('b + 3).as('res)).analyze + val optimized = Optimize.execute(query) + + val correctAnswer = left.join( + right, + joinType = LeftOuter, + condition = Some('a === 'c && Literal(1) === Literal(1))) + .select((Literal(1) + 3).as('res)).analyze + comparePlans(optimized, correctAnswer) + } } From 1860a43e9affb7619be0a5a1c786e264d09bc446 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Wed, 3 Jan 2018 21:43:14 -0800 Subject: [PATCH 009/566] [SPARK-22933][SPARKR] R Structured Streaming API for withWatermark, trigger, partitionBy ## What changes were proposed in this pull request? R Structured Streaming API for withWatermark, trigger, partitionBy ## How was this patch tested? 
manual, unit tests Author: Felix Cheung Closes #20129 from felixcheung/rwater. (cherry picked from commit df95a908baf78800556636a76d58bba9b3dd943f) Signed-off-by: Felix Cheung --- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 96 +++++++++++++++- R/pkg/R/SQLContext.R | 4 +- R/pkg/R/generics.R | 6 + R/pkg/tests/fulltests/test_streaming.R | 107 ++++++++++++++++++ python/pyspark/sql/streaming.py | 4 + .../sql/execution/streaming/Triggers.scala | 2 +- 7 files changed, 214 insertions(+), 6 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 3219c6f0cc47b..c51eb0f39c4b1 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -179,6 +179,7 @@ exportMethods("arrange", "with", "withColumn", "withColumnRenamed", + "withWatermark", "write.df", "write.jdbc", "write.json", diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index fe238f6dd4eb0..9956f7eda91e6 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -3661,7 +3661,8 @@ setMethod("getNumPartitions", #' isStreaming #' #' Returns TRUE if this SparkDataFrame contains one or more sources that continuously return data -#' as it arrives. +#' as it arrives. A dataset that reads data from a streaming source must be executed as a +#' \code{StreamingQuery} using \code{write.stream}. #' #' @param x A SparkDataFrame #' @return TRUE if this SparkDataFrame is from a streaming source @@ -3707,7 +3708,17 @@ setMethod("isStreaming", #' @param df a streaming SparkDataFrame. #' @param source a name for external data source. #' @param outputMode one of 'append', 'complete', 'update'. -#' @param ... additional argument(s) passed to the method. +#' @param partitionBy a name or a list of names of columns to partition the output by on the file +#' system. If specified, the output is laid out on the file system similar to Hive's +#' partitioning scheme. +#' @param trigger.processingTime a processing time interval as a string, e.g. '5 seconds', +#' '1 minute'. This is a trigger that runs a query periodically based on the processing +#' time. If value is '0 seconds', the query will run as fast as possible, this is the +#' default. Only one trigger can be set. +#' @param trigger.once a logical, must be set to \code{TRUE}. This is a trigger that processes only +#' one batch of data in a streaming query then terminates the query. Only one trigger can be +#' set. +#' @param ... additional external data source specific named options. #' #' @family SparkDataFrame functions #' @seealso \link{read.stream} @@ -3725,7 +3736,8 @@ setMethod("isStreaming", #' # console #' q <- write.stream(wordCounts, "console", outputMode = "complete") #' # text stream -#' q <- write.stream(df, "text", path = "/home/user/out", checkpointLocation = "/home/user/cp") +#' q <- write.stream(df, "text", path = "/home/user/out", checkpointLocation = "/home/user/cp" +#' partitionBy = c("year", "month"), trigger.processingTime = "30 seconds") #' # memory stream #' q <- write.stream(wordCounts, "memory", queryName = "outs", outputMode = "complete") #' head(sql("SELECT * from outs")) @@ -3737,7 +3749,8 @@ setMethod("isStreaming", #' @note experimental setMethod("write.stream", signature(df = "SparkDataFrame"), - function(df, source = NULL, outputMode = NULL, ...) { + function(df, source = NULL, outputMode = NULL, partitionBy = NULL, + trigger.processingTime = NULL, trigger.once = NULL, ...) { if (!is.null(source) && !is.character(source)) { stop("source should be character, NULL or omitted. 
It is the data source specified ", "in 'spark.sql.sources.default' configuration by default.") @@ -3748,12 +3761,43 @@ setMethod("write.stream", if (is.null(source)) { source <- getDefaultSqlSource() } + cols <- NULL + if (!is.null(partitionBy)) { + if (!all(sapply(partitionBy, function(c) { is.character(c) }))) { + stop("All partitionBy column names should be characters.") + } + cols <- as.list(partitionBy) + } + jtrigger <- NULL + if (!is.null(trigger.processingTime) && !is.na(trigger.processingTime)) { + if (!is.null(trigger.once)) { + stop("Multiple triggers not allowed.") + } + interval <- as.character(trigger.processingTime) + if (nchar(interval) == 0) { + stop("Value for trigger.processingTime must be a non-empty string.") + } + jtrigger <- handledCallJStatic("org.apache.spark.sql.streaming.Trigger", + "ProcessingTime", + interval) + } else if (!is.null(trigger.once) && !is.na(trigger.once)) { + if (!is.logical(trigger.once) || !trigger.once) { + stop("Value for trigger.once must be TRUE.") + } + jtrigger <- callJStatic("org.apache.spark.sql.streaming.Trigger", "Once") + } options <- varargsToStrEnv(...) write <- handledCallJMethod(df@sdf, "writeStream") write <- callJMethod(write, "format", source) if (!is.null(outputMode)) { write <- callJMethod(write, "outputMode", outputMode) } + if (!is.null(cols)) { + write <- callJMethod(write, "partitionBy", cols) + } + if (!is.null(jtrigger)) { + write <- callJMethod(write, "trigger", jtrigger) + } write <- callJMethod(write, "options", options) ssq <- handledCallJMethod(write, "start") streamingQuery(ssq) @@ -3967,3 +4011,47 @@ setMethod("broadcast", sdf <- callJStatic("org.apache.spark.sql.functions", "broadcast", x@sdf) dataFrame(sdf) }) + +#' withWatermark +#' +#' Defines an event time watermark for this streaming SparkDataFrame. A watermark tracks a point in +#' time before which we assume no more late data is going to arrive. +#' +#' Spark will use this watermark for several purposes: +#' \itemize{ +#' \item{-} To know when a given time window aggregation can be finalized and thus can be emitted +#' when using output modes that do not allow updates. +#' \item{-} To minimize the amount of state that we need to keep for on-going aggregations. +#' } +#' The current watermark is computed by looking at the \code{MAX(eventTime)} seen across +#' all of the partitions in the query minus a user specified \code{delayThreshold}. Due to the cost +#' of coordinating this value across partitions, the actual watermark used is only guaranteed +#' to be at least \code{delayThreshold} behind the actual event time. In some cases we may still +#' process records that arrive more than \code{delayThreshold} late. +#' +#' @param x a streaming SparkDataFrame +#' @param eventTime a string specifying the name of the Column that contains the event time of the +#' row. +#' @param delayThreshold a string specifying the minimum delay to wait to data to arrive late, +#' relative to the latest record that has been processed in the form of an +#' interval (e.g. "1 minute" or "5 hours"). NOTE: This should not be negative. +#' @return a SparkDataFrame. 
+#' @aliases withWatermark,SparkDataFrame,character,character-method +#' @family SparkDataFrame functions +#' @rdname withWatermark +#' @name withWatermark +#' @export +#' @examples +#' \dontrun{ +#' sparkR.session() +#' schema <- structType(structField("time", "timestamp"), structField("value", "double")) +#' df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1) +#' df <- withWatermark(df, "time", "10 minutes") +#' } +#' @note withWatermark since 2.3.0 +setMethod("withWatermark", + signature(x = "SparkDataFrame", eventTime = "character", delayThreshold = "character"), + function(x, eventTime, delayThreshold) { + sdf <- callJMethod(x@sdf, "withWatermark", eventTime, delayThreshold) + dataFrame(sdf) + }) diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 3b7f71bbbffb8..9d0a2d5e074e4 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -727,7 +727,9 @@ read.jdbc <- function(url, tableName, #' @param schema The data schema defined in structType or a DDL-formatted string, this is #' required for file-based streaming data source #' @param ... additional external data source specific named options, for instance \code{path} for -#' file-based streaming data source +#' file-based streaming data source. \code{timeZone} to indicate a timezone to be used to +#' parse timestamps in the JSON/CSV data sources or partition values; If it isn't set, it +#' uses the default value, session local timezone. #' @return SparkDataFrame #' @rdname read.stream #' @name read.stream diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 5369c32544e5e..e0dde3339fabc 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -799,6 +799,12 @@ setGeneric("withColumn", function(x, colName, col) { standardGeneric("withColumn setGeneric("withColumnRenamed", function(x, existingCol, newCol) { standardGeneric("withColumnRenamed") }) +#' @rdname withWatermark +#' @export +setGeneric("withWatermark", function(x, eventTime, delayThreshold) { + standardGeneric("withWatermark") +}) + #' @rdname write.df #' @export setGeneric("write.df", function(df, path = NULL, ...) 
{ standardGeneric("write.df") }) diff --git a/R/pkg/tests/fulltests/test_streaming.R b/R/pkg/tests/fulltests/test_streaming.R index 54f40bbd5f517..a354d50c6b54e 100644 --- a/R/pkg/tests/fulltests/test_streaming.R +++ b/R/pkg/tests/fulltests/test_streaming.R @@ -172,6 +172,113 @@ test_that("Terminated by error", { stopQuery(q) }) +test_that("PartitionBy", { + parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet") + checkpointPath <- tempfile(pattern = "sparkr-test", fileext = ".checkpoint") + textPath <- tempfile(pattern = "sparkr-test", fileext = ".text") + df <- read.df(jsonPath, "json", stringSchema) + write.df(df, parquetPath, "parquet", "overwrite") + + df <- read.stream(path = parquetPath, schema = stringSchema) + + expect_error(write.stream(df, "json", path = textPath, checkpointLocation = "append", + partitionBy = c(1, 2)), + "All partitionBy column names should be characters") + + q <- write.stream(df, "json", path = textPath, checkpointLocation = "append", + partitionBy = "name") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + dirs <- list.files(textPath) + expect_equal(length(dirs[substring(dirs, 1, nchar("name=")) == "name="]), 3) + + unlink(checkpointPath) + unlink(textPath) + unlink(parquetPath) +}) + +test_that("Watermark", { + parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet") + schema <- structType(structField("value", "string")) + t <- Sys.time() + df <- as.DataFrame(lapply(list(t), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + df <- read.stream(path = parquetPath, schema = "value STRING") + df <- withColumn(df, "eventTime", cast(df$value, "timestamp")) + df <- withWatermark(df, "eventTime", "10 seconds") + counts <- count(group_by(df, "eventTime")) + q <- write.stream(counts, "memory", queryName = "times", outputMode = "append") + + # first events + df <- as.DataFrame(lapply(list(t + 1, t, t + 2), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + # advance watermark to 15 + df <- as.DataFrame(lapply(list(t + 25), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + # old events, should be dropped + df <- as.DataFrame(lapply(list(t), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + # evict events less than previous watermark + df <- as.DataFrame(lapply(list(t + 25), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + times <- collect(sql("SELECT * FROM times")) + # looks like write timing can affect the first bucket; but it should be t + expect_equal(times[order(times$eventTime),][1, 2], 2) + + stopQuery(q) + unlink(parquetPath) +}) + +test_that("Trigger", { + parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet") + schema <- structType(structField("value", "string")) + df <- as.DataFrame(lapply(list(Sys.time()), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + df <- read.stream(path = parquetPath, schema = "value STRING") + + expect_error(write.stream(df, "memory", queryName = "times", outputMode = "append", + trigger.processingTime = "", trigger.once = ""), "Multiple triggers not allowed.") + + expect_error(write.stream(df, "memory", 
queryName = "times", outputMode = "append", + trigger.processingTime = ""), + "Value for trigger.processingTime must be a non-empty string.") + + expect_error(write.stream(df, "memory", queryName = "times", outputMode = "append", + trigger.processingTime = "invalid"), "illegal argument") + + expect_error(write.stream(df, "memory", queryName = "times", outputMode = "append", + trigger.once = ""), "Value for trigger.once must be TRUE.") + + expect_error(write.stream(df, "memory", queryName = "times", outputMode = "append", + trigger.once = FALSE), "Value for trigger.once must be TRUE.") + + q <- write.stream(df, "memory", queryName = "times", outputMode = "append", trigger.once = TRUE) + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + df <- as.DataFrame(lapply(list(Sys.time()), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + expect_equal(nrow(collect(sql("SELECT * FROM times"))), 1) + + stopQuery(q) + unlink(parquetPath) +}) + unlink(jsonPath) unlink(jsonPathNa) diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index fb228f99ba7ab..24ae3776a217b 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -793,6 +793,10 @@ def trigger(self, processingTime=None, once=None): .. note:: Evolving. :param processingTime: a processing time interval as a string, e.g. '5 seconds', '1 minute'. + Set a trigger that runs a query periodically based on the processing + time. Only one trigger can be set. + :param once: if set to True, set a trigger that processes only one batch of data in a + streaming query then terminates the query. Only one trigger can be set. >>> # trigger the query for execution every 5 seconds >>> writer = sdf.writeStream.trigger(processingTime='5 seconds') diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala index 271bc4da99c08..19e3e55cb2829 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.{Experimental, InterfaceStability} import org.apache.spark.sql.streaming.Trigger /** - * A [[Trigger]] that process only one batch of data in a streaming query then terminates + * A [[Trigger]] that processes only one batch of data in a streaming query then terminates * the query. */ @Experimental From a7cfd6beaf35f79a744047a4a09714ef1da60293 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Thu, 4 Jan 2018 19:10:10 +0800 Subject: [PATCH 010/566] [SPARK-22950][SQL] Handle ChildFirstURLClassLoader's parent ## What changes were proposed in this pull request? ChildFirstClassLoader's parent is set to null, so we can't get jars from its parent. This will cause ClassNotFoundException during HiveClient initialization with builtin hive jars, where we may should use spark context loader instead. ## How was this patch tested? add new ut cc cloud-fan gatorsmile Author: Kent Yao Closes #20145 from yaooqinn/SPARK-22950. 
(cherry picked from commit 9fa703e89318922393bae03c0db4575f4f4b4c56) Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/hive/HiveUtils.scala | 4 +++- .../spark/sql/hive/HiveUtilsSuite.scala | 20 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala index c489690af8cd1..c7717d70c996f 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala @@ -47,7 +47,7 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf._ import org.apache.spark.sql.internal.StaticSQLConf.{CATALOG_IMPLEMENTATION, WAREHOUSE_PATH} import org.apache.spark.sql.types._ -import org.apache.spark.util.Utils +import org.apache.spark.util.{ChildFirstURLClassLoader, Utils} private[spark] object HiveUtils extends Logging { @@ -312,6 +312,8 @@ private[spark] object HiveUtils extends Logging { // starting from the given classLoader. def allJars(classLoader: ClassLoader): Array[URL] = classLoader match { case null => Array.empty[URL] + case childFirst: ChildFirstURLClassLoader => + childFirst.getURLs() ++ allJars(Utils.getSparkClassLoader) case urlClassLoader: URLClassLoader => urlClassLoader.getURLs ++ allJars(urlClassLoader.getParent) case other => allJars(other.getParent) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala index fdbfcf1a68440..8697d47e89e89 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala @@ -17,11 +17,16 @@ package org.apache.spark.sql.hive +import java.net.URL + import org.apache.hadoop.hive.conf.HiveConf.ConfVars +import org.apache.spark.SparkConf +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.QueryTest import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader} class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { @@ -42,4 +47,19 @@ class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton assert(hiveConf("foo") === "bar") } } + + test("ChildFirstURLClassLoader's parent is null, get spark classloader instead") { + val conf = new SparkConf + val contextClassLoader = Thread.currentThread().getContextClassLoader + val loader = new ChildFirstURLClassLoader(Array(), contextClassLoader) + try { + Thread.currentThread().setContextClassLoader(loader) + HiveUtils.newClientForMetadata( + conf, + SparkHadoopUtil.newConfiguration(conf), + HiveUtils.newTemporaryConfiguration(useInMemoryDerby = true)) + } finally { + Thread.currentThread().setContextClassLoader(contextClassLoader) + } + } } From eb99b8adecc050240ce9d5e0b92a20f018df465e Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 4 Jan 2018 19:17:22 +0800 Subject: [PATCH 011/566] [SPARK-22945][SQL] add java UDF APIs in the functions object ## What changes were proposed in this pull request? Currently Scala users can use UDF like ``` val foo = udf((i: Int) => Math.random() + i).asNondeterministic df.select(foo('a)) ``` Python users can also do it with similar APIs. However Java users can't do it, we should add Java UDF APIs in the functions object. 
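A short sketch of the Java usage this is aimed at. The registration overload is the one shown in the diff below; `plusOne`, `df`, and column `a` are hypothetical names, and the new `udf(...)` overloads added to `functions` by this patch take the typed `org.apache.spark.sql.api.java.UDFn` interfaces plus an explicit return `DataType` in the same way.

```
// Minimal sketch, assuming an existing SparkSession and a Dataset<Row> with an
// integer column `a`; names are illustrative, not from the patch.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;

public final class JavaUdfSketch {
  public static Dataset<Row> plusOneColumn(SparkSession spark, Dataset<Row> df) {
    // Register a deterministic Java UDF1 by name with an explicit return type.
    spark.udf().register("plusOne", (UDF1<Integer, Integer>) i -> i + 1, DataTypes.IntegerType);
    // Use it through a SQL expression; the functions.udf(...) overloads added by
    // this patch give the same capability without registering a name.
    return df.selectExpr("plusOne(a) AS a_plus_one");
  }
}
```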
## How was this patch tested? new tests Author: Wenchen Fan Closes #20141 from cloud-fan/udf. (cherry picked from commit d5861aba9d80ca15ad3f22793b79822e470d6913) Signed-off-by: Wenchen Fan --- .../apache/spark/sql/UDFRegistration.scala | 90 ++--- .../sql/expressions/UserDefinedFunction.scala | 1 + .../org/apache/spark/sql/functions.scala | 313 ++++++++++++++---- .../apache/spark/sql/JavaDataFrameSuite.java | 11 + .../scala/org/apache/spark/sql/UDFSuite.scala | 12 +- 5 files changed, 315 insertions(+), 112 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index dc2468a721e41..f94baef39dfad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import java.lang.reflect.{ParameterizedType, Type} +import java.lang.reflect.ParameterizedType import scala.reflect.runtime.universe.TypeTag import scala.util.Try @@ -110,29 +110,29 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends /* register 0-22 were generated by this script - (0 to 22).map { x => + (0 to 22).foreach { x => val types = (1 to x).foldRight("RT")((i, s) => {s"A$i, $s"}) - val typeTags = (1 to x).map(i => s"A${i}: TypeTag").foldLeft("RT: TypeTag")(_ + ", " + _) + val typeTags = (1 to x).map(i => s"A$i: TypeTag").foldLeft("RT: TypeTag")(_ + ", " + _) val inputTypes = (1 to x).foldRight("Nil")((i, s) => {s"ScalaReflection.schemaFor[A$i].dataType :: $s"}) println(s""" - /** - * Registers a deterministic Scala closure of ${x} arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - * @since 1.3.0 - */ - def register[$typeTags](name: String, func: Function$x[$types]): UserDefinedFunction = { - val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputTypes = Try($inputTypes).toOption - def builder(e: Seq[Expression]) = if (e.length == $x) { - ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable, udfDeterministic = true) - } else { - throw new AnalysisException("Invalid number of arguments for function " + name + - ". Expected: $x; Found: " + e.length) - } - functionRegistry.createOrReplaceTempFunction(name, builder) - val udf = UserDefinedFunction(func, dataType, inputTypes).withName(name) - if (nullable) udf else udf.asNonNullable() - }""") + |/** + | * Registers a deterministic Scala closure of $x arguments as user-defined function (UDF). + | * @tparam RT return type of UDF. + | * @since 1.3.0 + | */ + |def register[$typeTags](name: String, func: Function$x[$types]): UserDefinedFunction = { + | val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] + | val inputTypes = Try($inputTypes).toOption + | def builder(e: Seq[Expression]) = if (e.length == $x) { + | ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable, udfDeterministic = true) + | } else { + | throw new AnalysisException("Invalid number of arguments for function " + name + + | ". 
Expected: $x; Found: " + e.length) + | } + | functionRegistry.createOrReplaceTempFunction(name, builder) + | val udf = UserDefinedFunction(func, dataType, inputTypes).withName(name) + | if (nullable) udf else udf.asNonNullable() + |}""".stripMargin) } (0 to 22).foreach { i => @@ -144,7 +144,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends val funcCall = if (i == 0) "() => func" else "func" println(s""" |/** - | * Register a user-defined function with ${i} arguments. + | * Register a deterministic Java UDF$i instance as user-defined function (UDF). | * @since $version | */ |def register(name: String, f: UDF$i[$extTypeArgs], returnType: DataType): Unit = { @@ -689,7 +689,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 0 arguments. + * Register a deterministic Java UDF0 instance as user-defined function (UDF). * @since 2.3.0 */ def register(name: String, f: UDF0[_], returnType: DataType): Unit = { @@ -704,7 +704,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 1 arguments. + * Register a deterministic Java UDF1 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF1[_, _], returnType: DataType): Unit = { @@ -719,7 +719,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 2 arguments. + * Register a deterministic Java UDF2 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF2[_, _, _], returnType: DataType): Unit = { @@ -734,7 +734,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 3 arguments. + * Register a deterministic Java UDF3 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF3[_, _, _, _], returnType: DataType): Unit = { @@ -749,7 +749,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 4 arguments. + * Register a deterministic Java UDF4 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF4[_, _, _, _, _], returnType: DataType): Unit = { @@ -764,7 +764,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 5 arguments. + * Register a deterministic Java UDF5 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF5[_, _, _, _, _, _], returnType: DataType): Unit = { @@ -779,7 +779,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 6 arguments. + * Register a deterministic Java UDF6 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF6[_, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -794,7 +794,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 7 arguments. + * Register a deterministic Java UDF7 instance as user-defined function (UDF). 
* @since 1.3.0 */ def register(name: String, f: UDF7[_, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -809,7 +809,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 8 arguments. + * Register a deterministic Java UDF8 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF8[_, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -824,7 +824,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 9 arguments. + * Register a deterministic Java UDF9 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF9[_, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -839,7 +839,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 10 arguments. + * Register a deterministic Java UDF10 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF10[_, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -854,7 +854,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 11 arguments. + * Register a deterministic Java UDF11 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF11[_, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -869,7 +869,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 12 arguments. + * Register a deterministic Java UDF12 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF12[_, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -884,7 +884,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 13 arguments. + * Register a deterministic Java UDF13 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF13[_, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -899,7 +899,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 14 arguments. + * Register a deterministic Java UDF14 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -914,7 +914,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 15 arguments. + * Register a deterministic Java UDF15 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -929,7 +929,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 16 arguments. + * Register a deterministic Java UDF16 instance as user-defined function (UDF). 
* @since 1.3.0 */ def register(name: String, f: UDF16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -944,7 +944,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 17 arguments. + * Register a deterministic Java UDF17 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -959,7 +959,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 18 arguments. + * Register a deterministic Java UDF18 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -974,7 +974,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 19 arguments. + * Register a deterministic Java UDF19 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -989,7 +989,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 20 arguments. + * Register a deterministic Java UDF20 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -1004,7 +1004,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 21 arguments. + * Register a deterministic Java UDF21 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -1019,7 +1019,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 22 arguments. + * Register a deterministic Java UDF22 instance as user-defined function (UDF). 
* @since 1.3.0 */ def register(name: String, f: UDF22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala index 03b654f830520..40a058d2cadd2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala @@ -66,6 +66,7 @@ case class UserDefinedFunction protected[sql] ( * * @since 1.3.0 */ + @scala.annotation.varargs def apply(exprs: Column*): Column = { Column(ScalaUDF( f, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 530a525a01dec..0d11682d80a3c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -24,6 +24,7 @@ import scala.util.Try import scala.util.control.NonFatal import org.apache.spark.annotation.InterfaceStability +import org.apache.spark.sql.api.java._ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.analysis.{Star, UnresolvedFunction} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder @@ -32,7 +33,6 @@ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical.{HintInfo, ResolvedHint} import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.expressions.UserDefinedFunction -import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -3254,42 +3254,66 @@ object functions { */ def map_values(e: Column): Column = withExpr { MapValues(e.expr) } - ////////////////////////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////////////////////////// - // scalastyle:off line.size.limit // scalastyle:off parameter.number /* Use the following code to generate: - (0 to 10).map { x => + + (0 to 10).foreach { x => val types = (1 to x).foldRight("RT")((i, s) => {s"A$i, $s"}) val typeTags = (1 to x).map(i => s"A$i: TypeTag").foldLeft("RT: TypeTag")(_ + ", " + _) val inputTypes = (1 to x).foldRight("Nil")((i, s) => {s"ScalaReflection.schemaFor(typeTag[A$i]).dataType :: $s"}) println(s""" - /** - * Defines a deterministic user-defined function of ${x} arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. - * - * @group udf_funcs - * @since 1.3.0 - */ - def udf[$typeTags](f: Function$x[$types]): UserDefinedFunction = { - val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputTypes = Try($inputTypes).toOption - val udf = UserDefinedFunction(f, dataType, inputTypes) - if (nullable) udf else udf.asNonNullable() - }""") + |/** + | * Defines a Scala closure of $x arguments as user-defined function (UDF). + | * The data types are automatically inferred based on the Scala closure's + | * signature. By default the returned UDF is deterministic. To change it to + | * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. 
+ | * + | * @group udf_funcs + | * @since 1.3.0 + | */ + |def udf[$typeTags](f: Function$x[$types]): UserDefinedFunction = { + | val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] + | val inputTypes = Try($inputTypes).toOption + | val udf = UserDefinedFunction(f, dataType, inputTypes) + | if (nullable) udf else udf.asNonNullable() + |}""".stripMargin) + } + + (0 to 10).foreach { i => + val extTypeArgs = (0 to i).map(_ => "_").mkString(", ") + val anyTypeArgs = (0 to i).map(_ => "Any").mkString(", ") + val anyCast = s".asInstanceOf[UDF$i[$anyTypeArgs]]" + val anyParams = (1 to i).map(_ => "_: Any").mkString(", ") + val funcCall = if (i == 0) "() => func" else "func" + println(s""" + |/** + | * Defines a Java UDF$i instance as user-defined function (UDF). + | * The caller must specify the output data type, and there is no automatic input type coercion. + | * By default the returned UDF is deterministic. To change it to nondeterministic, call the + | * API `UserDefinedFunction.asNondeterministic()`. + | * + | * @group udf_funcs + | * @since 2.3.0 + | */ + |def udf(f: UDF$i[$extTypeArgs], returnType: DataType): UserDefinedFunction = { + | val func = f$anyCast.call($anyParams) + | UserDefinedFunction($funcCall, returnType, inputTypes = None) + |}""".stripMargin) } */ + ////////////////////////////////////////////////////////////////////////////////////////////// + // Scala UDF functions + ////////////////////////////////////////////////////////////////////////////////////////////// + /** - * Defines a deterministic user-defined function of 0 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 0 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3302,10 +3326,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 1 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 1 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3318,10 +3342,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 2 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 2 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. 
* * @group udf_funcs * @since 1.3.0 @@ -3334,10 +3358,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 3 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 3 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3350,10 +3374,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 4 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 4 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3366,10 +3390,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 5 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 5 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3382,10 +3406,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 6 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 6 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3398,10 +3422,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 7 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 7 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3414,10 +3438,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 8 arguments as user-defined - * function (UDF). 
The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 8 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3430,10 +3454,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 9 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 9 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3446,10 +3470,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 10 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 10 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3461,13 +3485,172 @@ object functions { if (nullable) udf else udf.asNonNullable() } + ////////////////////////////////////////////////////////////////////////////////////////////// + // Java UDF functions + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Defines a Java UDF0 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF0[_], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF0[Any]].call() + UserDefinedFunction(() => func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF1 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF1[_, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF1[Any, Any]].call(_: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF2 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. 
To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF2[_, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF2[Any, Any, Any]].call(_: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF3 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF3[_, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF3[Any, Any, Any, Any]].call(_: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF4 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF4[_, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF4[Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF5 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF5[_, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF5[Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF6 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF6[_, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF6[Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF7 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF7[_, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF7[Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF8 instance as user-defined function (UDF). 
+ * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF8[_, _, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF8[Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF9 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF9[_, _, _, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF9[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF10 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF10[_, _, _, _, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF10[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + // scalastyle:on parameter.number // scalastyle:on line.size.limit /** * Defines a deterministic user-defined function (UDF) using a Scala closure. For this variant, * the caller must specify the output data type, and there is no automatic input type coercion. - * To change a UDF to nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. 
* * @param f A closure in Scala * @param dataType The output data type of the UDF diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index b007093dad84b..4f8a31f185724 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -36,6 +36,7 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.expressions.UserDefinedFunction; import org.apache.spark.sql.test.TestSparkSession; import org.apache.spark.sql.types.*; import org.apache.spark.util.sketch.BloomFilter; @@ -455,4 +456,14 @@ public void testCircularReferenceBean() { CircularReference1Bean bean = new CircularReference1Bean(); spark.createDataFrame(Arrays.asList(bean), CircularReference1Bean.class); } + + @Test + public void testUDF() { + UserDefinedFunction foo = udf((Integer i, String s) -> i.toString() + s, DataTypes.StringType); + Dataset df = spark.table("testData").select(foo.apply(col("key"), col("value"))); + String[] result = df.collectAsList().stream().map(row -> row.getString(0)).toArray(String[]::new); + String[] expected = spark.table("testData").collectAsList().stream() + .map(row -> row.get(0).toString() + row.getString(1)).toArray(String[]::new); + Assert.assertArrayEquals(expected, result); + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index 7f1c009ca6e7a..db37be68e42e6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -17,12 +17,13 @@ package org.apache.spark.sql +import org.apache.spark.sql.api.java._ import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.command.ExplainCommand -import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.functions.udf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.SQLTestData._ -import org.apache.spark.sql.types.DataTypes +import org.apache.spark.sql.types.{DataTypes, DoubleType} private case class FunctionResult(f1: String, f2: String) @@ -128,6 +129,13 @@ class UDFSuite extends QueryTest with SharedSQLContext { val df2 = testData.select(bar()) assert(df2.logicalPlan.asInstanceOf[Project].projectList.forall(!_.deterministic)) assert(df2.head().getDouble(0) >= 0.0) + + val javaUdf = udf(new UDF0[Double] { + override def call(): Double = Math.random() + }, DoubleType).asNondeterministic() + val df3 = testData.select(javaUdf()) + assert(df3.logicalPlan.asInstanceOf[Project].projectList.forall(!_.deterministic)) + assert(df3.head().getDouble(0) >= 0.0) } test("TwoArgument UDF") { From 1f5e3540c7535ceaea66ebd5ee2f598e8b3ba1a5 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Thu, 4 Jan 2018 21:07:31 +0800 Subject: [PATCH 012/566] [SPARK-22939][PYSPARK] Support Spark UDF in registerFunction ## What changes were proposed in this pull request? ```Python import random from pyspark.sql.functions import udf from pyspark.sql.types import IntegerType, StringType random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic() spark.catalog.registerFunction("random_udf", random_udf, StringType()) spark.sql("SELECT random_udf()").collect() ``` We will get the following error. 
``` Py4JError: An error occurred while calling o29.__getnewargs__. Trace: py4j.Py4JException: Method __getnewargs__([]) does not exist at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318) at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326) at py4j.Gateway.invoke(Gateway.java:274) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:214) at java.lang.Thread.run(Thread.java:745) ``` This PR is to support it. ## How was this patch tested? WIP Author: gatorsmile Closes #20137 from gatorsmile/registerFunction. (cherry picked from commit 5aadbc929cb194e06dbd3bab054a161569289af5) Signed-off-by: gatorsmile --- python/pyspark/sql/catalog.py | 27 +++++++++++++++---- python/pyspark/sql/context.py | 16 +++++++++--- python/pyspark/sql/tests.py | 49 +++++++++++++++++++++++++---------- python/pyspark/sql/udf.py | 21 ++++++++++----- 4 files changed, 84 insertions(+), 29 deletions(-) diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py index 659bc65701a0c..156603128d063 100644 --- a/python/pyspark/sql/catalog.py +++ b/python/pyspark/sql/catalog.py @@ -227,15 +227,15 @@ def dropGlobalTempView(self, viewName): @ignore_unicode_prefix @since(2.0) def registerFunction(self, name, f, returnType=StringType()): - """Registers a python function (including lambda function) as a UDF - so it can be used in SQL statements. + """Registers a Python function (including lambda function) or a :class:`UserDefinedFunction` + as a UDF. The registered UDF can be used in SQL statement. In addition to a name and the function itself, the return type can be optionally specified. When the return type is not given it default to a string and conversion will automatically be done. For any other return type, the produced object must match the specified type. 
:param name: name of the UDF - :param f: python function + :param f: a Python function, or a wrapped/native UserDefinedFunction :param returnType: a :class:`pyspark.sql.types.DataType` object :return: a wrapped :class:`UserDefinedFunction` @@ -255,9 +255,26 @@ def registerFunction(self, name, f, returnType=StringType()): >>> _ = spark.udf.register("stringLengthInt", len, IntegerType()) >>> spark.sql("SELECT stringLengthInt('test')").collect() [Row(stringLengthInt(test)=4)] + + >>> import random + >>> from pyspark.sql.functions import udf + >>> from pyspark.sql.types import IntegerType, StringType + >>> random_udf = udf(lambda: random.randint(0, 100), IntegerType()).asNondeterministic() + >>> newRandom_udf = spark.catalog.registerFunction("random_udf", random_udf, StringType()) + >>> spark.sql("SELECT random_udf()").collect() # doctest: +SKIP + [Row(random_udf()=u'82')] + >>> spark.range(1).select(newRandom_udf()).collect() # doctest: +SKIP + [Row(random_udf()=u'62')] """ - udf = UserDefinedFunction(f, returnType=returnType, name=name, - evalType=PythonEvalType.SQL_BATCHED_UDF) + + # This is to check whether the input function is a wrapped/native UserDefinedFunction + if hasattr(f, 'asNondeterministic'): + udf = UserDefinedFunction(f.func, returnType=returnType, name=name, + evalType=PythonEvalType.SQL_BATCHED_UDF, + deterministic=f.deterministic) + else: + udf = UserDefinedFunction(f, returnType=returnType, name=name, + evalType=PythonEvalType.SQL_BATCHED_UDF) self._jsparkSession.udf().registerPython(name, udf._judf) return udf._wrapped() diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index b1e723cdecef3..b8d86cc098e94 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -175,15 +175,15 @@ def range(self, start, end=None, step=1, numPartitions=None): @ignore_unicode_prefix @since(1.2) def registerFunction(self, name, f, returnType=StringType()): - """Registers a python function (including lambda function) as a UDF - so it can be used in SQL statements. + """Registers a Python function (including lambda function) or a :class:`UserDefinedFunction` + as a UDF. The registered UDF can be used in SQL statement. In addition to a name and the function itself, the return type can be optionally specified. When the return type is not given it default to a string and conversion will automatically be done. For any other return type, the produced object must match the specified type. 
:param name: name of the UDF - :param f: python function + :param f: a Python function, or a wrapped/native UserDefinedFunction :param returnType: a :class:`pyspark.sql.types.DataType` object :return: a wrapped :class:`UserDefinedFunction` @@ -203,6 +203,16 @@ def registerFunction(self, name, f, returnType=StringType()): >>> _ = sqlContext.udf.register("stringLengthInt", lambda x: len(x), IntegerType()) >>> sqlContext.sql("SELECT stringLengthInt('test')").collect() [Row(stringLengthInt(test)=4)] + + >>> import random + >>> from pyspark.sql.functions import udf + >>> from pyspark.sql.types import IntegerType, StringType + >>> random_udf = udf(lambda: random.randint(0, 100), IntegerType()).asNondeterministic() + >>> newRandom_udf = sqlContext.registerFunction("random_udf", random_udf, StringType()) + >>> sqlContext.sql("SELECT random_udf()").collect() # doctest: +SKIP + [Row(random_udf()=u'82')] + >>> sqlContext.range(1).select(newRandom_udf()).collect() # doctest: +SKIP + [Row(random_udf()=u'62')] """ return self.sparkSession.catalog.registerFunction(name, f, returnType) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 67bdb3d72d93b..6dc767f9ec46e 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -378,6 +378,41 @@ def test_udf2(self): [res] = self.spark.sql("SELECT strlen(a) FROM test WHERE strlen(a) > 1").collect() self.assertEqual(4, res[0]) + def test_udf3(self): + twoargs = self.spark.catalog.registerFunction( + "twoArgs", UserDefinedFunction(lambda x, y: len(x) + y), IntegerType()) + self.assertEqual(twoargs.deterministic, True) + [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect() + self.assertEqual(row[0], 5) + + def test_nondeterministic_udf(self): + from pyspark.sql.functions import udf + import random + udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic() + self.assertEqual(udf_random_col.deterministic, False) + df = self.spark.createDataFrame([Row(1)]).select(udf_random_col().alias('RAND')) + udf_add_ten = udf(lambda rand: rand + 10, IntegerType()) + [row] = df.withColumn('RAND_PLUS_TEN', udf_add_ten('RAND')).collect() + self.assertEqual(row[0] + 10, row[1]) + + def test_nondeterministic_udf2(self): + import random + from pyspark.sql.functions import udf + random_udf = udf(lambda: random.randint(6, 6), IntegerType()).asNondeterministic() + self.assertEqual(random_udf.deterministic, False) + random_udf1 = self.spark.catalog.registerFunction("randInt", random_udf, StringType()) + self.assertEqual(random_udf1.deterministic, False) + [row] = self.spark.sql("SELECT randInt()").collect() + self.assertEqual(row[0], "6") + [row] = self.spark.range(1).select(random_udf1()).collect() + self.assertEqual(row[0], "6") + [row] = self.spark.range(1).select(random_udf()).collect() + self.assertEqual(row[0], 6) + # render_doc() reproduces the help() exception without printing output + pydoc.render_doc(udf(lambda: random.randint(6, 6), IntegerType())) + pydoc.render_doc(random_udf) + pydoc.render_doc(random_udf1) + def test_chained_udf(self): self.spark.catalog.registerFunction("double", lambda x: x + x, IntegerType()) [row] = self.spark.sql("SELECT double(1)").collect() @@ -435,15 +470,6 @@ def test_udf_with_array_type(self): self.assertEqual(list(range(3)), l1) self.assertEqual(1, l2) - def test_nondeterministic_udf(self): - from pyspark.sql.functions import udf - import random - udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic() - df = 
self.spark.createDataFrame([Row(1)]).select(udf_random_col().alias('RAND')) - udf_add_ten = udf(lambda rand: rand + 10, IntegerType()) - [row] = df.withColumn('RAND_PLUS_TEN', udf_add_ten('RAND')).collect() - self.assertEqual(row[0] + 10, row[1]) - def test_broadcast_in_udf(self): bar = {"a": "aa", "b": "bb", "c": "abc"} foo = self.sc.broadcast(bar) @@ -567,7 +593,6 @@ def test_read_multiple_orc_file(self): def test_udf_with_input_file_name(self): from pyspark.sql.functions import udf, input_file_name - from pyspark.sql.types import StringType sourceFile = udf(lambda path: path, StringType()) filePath = "python/test_support/sql/people1.json" row = self.spark.read.json(filePath).select(sourceFile(input_file_name())).first() @@ -575,7 +600,6 @@ def test_udf_with_input_file_name(self): def test_udf_with_input_file_name_for_hadooprdd(self): from pyspark.sql.functions import udf, input_file_name - from pyspark.sql.types import StringType def filename(path): return path @@ -635,7 +659,6 @@ def test_udf_with_string_return_type(self): def test_udf_shouldnt_accept_noncallable_object(self): from pyspark.sql.functions import UserDefinedFunction - from pyspark.sql.types import StringType non_callable = None self.assertRaises(TypeError, UserDefinedFunction, non_callable, StringType()) @@ -1299,7 +1322,6 @@ def test_between_function(self): df.filter(df.a.between(df.b, df.c)).collect()) def test_struct_type(self): - from pyspark.sql.types import StructType, StringType, StructField struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None) struct2 = StructType([StructField("f1", StringType(), True), StructField("f2", StringType(), True, None)]) @@ -1368,7 +1390,6 @@ def test_parse_datatype_string(self): _parse_datatype_string("a INT, c DOUBLE")) def test_metadata_null(self): - from pyspark.sql.types import StructType, StringType, StructField schema = StructType([StructField("f1", StringType(), True, None), StructField("f2", StringType(), True, {'a': None})]) rdd = self.sc.parallelize([["a", "b"], ["c", "d"]]) diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 54b5a8656e1c8..5e75eb6545333 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -56,7 +56,8 @@ def _create_udf(f, returnType, evalType): ) # Set the name of the UserDefinedFunction object to be the name of function f - udf_obj = UserDefinedFunction(f, returnType=returnType, name=None, evalType=evalType) + udf_obj = UserDefinedFunction( + f, returnType=returnType, name=None, evalType=evalType, deterministic=True) return udf_obj._wrapped() @@ -67,8 +68,10 @@ class UserDefinedFunction(object): .. 
versionadded:: 1.3 """ def __init__(self, func, - returnType=StringType(), name=None, - evalType=PythonEvalType.SQL_BATCHED_UDF): + returnType=StringType(), + name=None, + evalType=PythonEvalType.SQL_BATCHED_UDF, + deterministic=True): if not callable(func): raise TypeError( "Invalid function: not a function or callable (__call__ is not defined): " @@ -92,7 +95,7 @@ def __init__(self, func, func.__name__ if hasattr(func, '__name__') else func.__class__.__name__) self.evalType = evalType - self._deterministic = True + self.deterministic = deterministic @property def returnType(self): @@ -130,7 +133,7 @@ def _create_judf(self): wrapped_func = _wrap_function(sc, self.func, self.returnType) jdt = spark._jsparkSession.parseDataType(self.returnType.json()) judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction( - self._name, wrapped_func, jdt, self.evalType, self._deterministic) + self._name, wrapped_func, jdt, self.evalType, self.deterministic) return judf def __call__(self, *cols): @@ -138,6 +141,9 @@ def __call__(self, *cols): sc = SparkContext._active_spark_context return Column(judf.apply(_to_seq(sc, cols, _to_java_column))) + # This function is for improving the online help system in the interactive interpreter. + # For example, the built-in help / pydoc.help. It wraps the UDF with the docstring and + # argument annotation. (See: SPARK-19161) def _wrapped(self): """ Wrap this udf with a function and attach docstring from func @@ -162,7 +168,8 @@ def wrapper(*args): wrapper.func = self.func wrapper.returnType = self.returnType wrapper.evalType = self.evalType - wrapper.asNondeterministic = self.asNondeterministic + wrapper.deterministic = self.deterministic + wrapper.asNondeterministic = lambda: self.asNondeterministic()._wrapped() return wrapper @@ -172,5 +179,5 @@ def asNondeterministic(self): .. versionadded:: 2.3 """ - self._deterministic = False + self.deterministic = False return self From bcfeef5a944d56af1a5106f5c07296ea2c262991 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Thu, 4 Jan 2018 21:15:10 +0800 Subject: [PATCH 013/566] [SPARK-22771][SQL] Add a missing return statement in Concat.checkInputDataTypes ## What changes were proposed in this pull request? This pr is a follow-up to fix a bug left in #19977. ## How was this patch tested? Added tests in `StringExpressionsSuite`. Author: Takeshi Yamamuro Closes #20149 from maropu/SPARK-22771-FOLLOWUP. 
(cherry picked from commit 6f68316e98fad72b171df422566e1fc9a7bbfcde) Signed-off-by: gatorsmile --- .../sql/catalyst/expressions/stringExpressions.scala | 2 +- .../expressions/StringExpressionsSuite.scala | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index b0da55a4a961b..41dc762154a4c 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -58,7 +58,7 @@ case class Concat(children: Seq[Expression]) extends Expression { } else { val childTypes = children.map(_.dataType) if (childTypes.exists(tpe => !Seq(StringType, BinaryType).contains(tpe))) { - TypeCheckResult.TypeCheckFailure( + return TypeCheckResult.TypeCheckFailure( s"input to function $prettyName should have StringType or BinaryType, but it's " + childTypes.map(_.simpleString).mkString("[", ", ", "]")) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 54cde77176e27..97ddbeba2c5ca 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -51,6 +51,18 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Concat(strs.map(Literal.create(_, StringType))), strs.mkString, EmptyRow) } + test("SPARK-22771 Check Concat.checkInputDataTypes results") { + assert(Concat(Seq.empty[Expression]).checkInputDataTypes().isSuccess) + assert(Concat(Literal.create("a") :: Literal.create("b") :: Nil) + .checkInputDataTypes().isSuccess) + assert(Concat(Literal.create("a".getBytes) :: Literal.create("b".getBytes) :: Nil) + .checkInputDataTypes().isSuccess) + assert(Concat(Literal.create(1) :: Literal.create(2) :: Nil) + .checkInputDataTypes().isFailure) + assert(Concat(Literal.create("a") :: Literal.create("b".getBytes) :: Nil) + .checkInputDataTypes().isFailure) + } + test("concat_ws") { def testConcatWs(expected: String, sep: String, inputs: Any*): Unit = { val inputExprs = inputs.map { From cd92913f345c8d932d3c651626c7f803e6abdcdb Mon Sep 17 00:00:00 2001 From: jerryshao Date: Thu, 4 Jan 2018 11:39:42 -0800 Subject: [PATCH 014/566] [SPARK-21475][CORE][2ND ATTEMPT] Change to use NIO's Files API for external shuffle service ## What changes were proposed in this pull request? This PR is the second attempt of #18684 , NIO's Files API doesn't override `skip` method for `InputStream`, so it will bring in performance issue (mentioned in #20119). But using `FileInputStream`/`FileOutputStream` will also bring in memory issue (https://dzone.com/articles/fileinputstream-fileoutputstream-considered-harmful), which is severe for long running external shuffle service. So here in this proposal, only fixing the external shuffle service related code. ## How was this patch tested? Existing tests. Author: jerryshao Closes #20144 from jerryshao/SPARK-21475-v2. 
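To make the bug concrete: a Scala method returns the value of its last expression, so a `TypeCheckFailure` constructed inside a nested `if` is simply discarded unless it is explicitly `return`ed. A minimal standalone sketch of the broken and fixed shapes (hypothetical names, not the Spark classes):

```scala
sealed trait CheckResult
case object CheckSuccess extends CheckResult
case class CheckFailure(msg: String) extends CheckResult

// Buggy shape: the failure value is built inside an `if` with no `else` and then dropped,
// because the method's result is whatever the last expression in the block evaluates to.
def checkBuggy(types: Seq[String]): CheckResult = {
  if (types.exists(t => t != "string" && t != "binary")) {
    CheckFailure(s"unexpected input types: ${types.mkString(", ")}") // silently discarded
  }
  CheckSuccess // always reached, so bad inputs pass the check
}

// Fixed shape, mirroring the one-line patch below: short-circuit with an explicit `return`.
def checkFixed(types: Seq[String]): CheckResult = {
  if (types.exists(t => t != "string" && t != "binary")) {
    return CheckFailure(s"unexpected input types: ${types.mkString(", ")}")
  }
  CheckSuccess
}

assert(checkBuggy(Seq("int")) == CheckSuccess)  // the bug: the failure is lost
assert(checkFixed(Seq("int")) == CheckFailure("unexpected input types: int"))
```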
(cherry picked from commit 93f92c0ed7442a4382e97254307309977ff676f8) Signed-off-by: Shixiong Zhu --- .../apache/spark/network/buffer/FileSegmentManagedBuffer.java | 3 ++- .../apache/spark/network/shuffle/ShuffleIndexInformation.java | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/common/network-common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java b/common/network-common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java index c20fab83c3460..8b8f9892847c3 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java +++ b/common/network-common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java @@ -24,6 +24,7 @@ import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; +import java.nio.file.StandardOpenOption; import com.google.common.base.Objects; import com.google.common.io.ByteStreams; @@ -132,7 +133,7 @@ public Object convertToNetty() throws IOException { if (conf.lazyFileDescriptor()) { return new DefaultFileRegion(file, offset, length); } else { - FileChannel fileChannel = new FileInputStream(file).getChannel(); + FileChannel fileChannel = FileChannel.open(file.toPath(), StandardOpenOption.READ); return new DefaultFileRegion(fileChannel, offset, length); } } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java index eacf485344b76..386738ece51a6 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java @@ -19,10 +19,10 @@ import java.io.DataInputStream; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.LongBuffer; +import java.nio.file.Files; /** * Keeps the index information for a particular map output @@ -39,7 +39,7 @@ public ShuffleIndexInformation(File indexFile) throws IOException { offsets = buffer.asLongBuffer(); DataInputStream dis = null; try { - dis = new DataInputStream(new FileInputStream(indexFile)); + dis = new DataInputStream(Files.newInputStream(indexFile.toPath())); dis.readFully(buffer.array()); } finally { if (dis != null) { From bc4bef472de0e99f74a80954d694c3d1744afe3a Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 4 Jan 2018 16:19:00 -0600 Subject: [PATCH 015/566] [SPARK-22850][CORE] Ensure queued events are delivered to all event queues. The code in LiveListenerBus was queueing events before start in the queues themselves; so in situations like the following: bus.post(someEvent) bus.addToEventLogQueue(listener) bus.start() "someEvent" would not be delivered to "listener" if that was the first listener in the queue, because the queue wouldn't exist when the event was posted. This change buffers the events before starting the bus in the bus itself, so that they can be delivered to all registered queues when the bus is started. Also tweaked the unit tests to cover the behavior above. Author: Marcelo Vanzin Closes #20039 from vanzin/SPARK-22850. 
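For context, a rough standalone sketch (assumed file path, not the actual Spark classes) contrasting the old `FileInputStream`-based reads, whose per-instance finalizers add pressure in a long-running service, with the NIO factory methods the shuffle-service code switches to in the diff below:

```java
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.StandardOpenOption;

public class NioVsFileStream {
  public static void main(String[] args) throws IOException {
    File indexFile = new File(args.length > 0 ? args[0] : "shuffle.index"); // assumed path

    // Before: FileInputStream relies on finalization to release its descriptor, which is
    // costly under heavy churn in a long-running external shuffle service.
    try (DataInputStream dis = new DataInputStream(new FileInputStream(indexFile))) {
      // read index entries ...
    }

    // After: the NIO factory returns a stream without a finalizer.
    try (InputStream in = Files.newInputStream(indexFile.toPath());
         DataInputStream dis = new DataInputStream(in)) {
      // read index entries ...
    }

    // Data transfer opens a FileChannel directly instead of FileInputStream#getChannel().
    try (FileChannel channel = FileChannel.open(indexFile.toPath(), StandardOpenOption.READ)) {
      // hand the channel to the transport layer ...
    }
  }
}
```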
(cherry picked from commit d2cddc88eac32f26b18ec26bb59e85c6f09a8c88) Signed-off-by: Imran Rashid --- .../spark/scheduler/LiveListenerBus.scala | 45 ++++++++++++++++--- .../spark/scheduler/SparkListenerSuite.scala | 21 +++++---- 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala index 23121402b1025..ba6387a8f08ad 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala @@ -62,6 +62,9 @@ private[spark] class LiveListenerBus(conf: SparkConf) { private val queues = new CopyOnWriteArrayList[AsyncEventQueue]() + // Visible for testing. + @volatile private[scheduler] var queuedEvents = new mutable.ListBuffer[SparkListenerEvent]() + /** Add a listener to queue shared by all non-internal listeners. */ def addToSharedQueue(listener: SparkListenerInterface): Unit = { addToQueue(listener, SHARED_QUEUE) @@ -125,13 +128,39 @@ private[spark] class LiveListenerBus(conf: SparkConf) { /** Post an event to all queues. */ def post(event: SparkListenerEvent): Unit = { - if (!stopped.get()) { - metrics.numEventsPosted.inc() - val it = queues.iterator() - while (it.hasNext()) { - it.next().post(event) + if (stopped.get()) { + return + } + + metrics.numEventsPosted.inc() + + // If the event buffer is null, it means the bus has been started and we can avoid + // synchronization and post events directly to the queues. This should be the most + // common case during the life of the bus. + if (queuedEvents == null) { + postToQueues(event) + return + } + + // Otherwise, need to synchronize to check whether the bus is started, to make sure the thread + // calling start() picks up the new event. + synchronized { + if (!started.get()) { + queuedEvents += event + return } } + + // If the bus was already started when the check above was made, just post directly to the + // queues. + postToQueues(event) + } + + private def postToQueues(event: SparkListenerEvent): Unit = { + val it = queues.iterator() + while (it.hasNext()) { + it.next().post(event) + } } /** @@ -149,7 +178,11 @@ private[spark] class LiveListenerBus(conf: SparkConf) { } this.sparkContext = sc - queues.asScala.foreach(_.start(sc)) + queues.asScala.foreach { q => + q.start(sc) + queuedEvents.foreach(q.post) + } + queuedEvents = null metricsSystem.registerSource(metrics) } diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala index 1beb36afa95f0..da6ecb82c7e42 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala @@ -48,7 +48,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match bus.metrics.metricRegistry.counter(s"queue.$SHARED_QUEUE.numDroppedEvents").getCount } - private def queueSize(bus: LiveListenerBus): Int = { + private def sharedQueueSize(bus: LiveListenerBus): Int = { bus.metrics.metricRegistry.getGauges().get(s"queue.$SHARED_QUEUE.size").getValue() .asInstanceOf[Int] } @@ -73,12 +73,11 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match val conf = new SparkConf() val counter = new BasicJobCounter val bus = new LiveListenerBus(conf) - bus.addToSharedQueue(counter) // Metrics are initially empty. 
assert(bus.metrics.numEventsPosted.getCount === 0) assert(numDroppedEvents(bus) === 0) - assert(queueSize(bus) === 0) + assert(bus.queuedEvents.size === 0) assert(eventProcessingTimeCount(bus) === 0) // Post five events: @@ -87,7 +86,10 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match // Five messages should be marked as received and queued, but no messages should be posted to // listeners yet because the the listener bus hasn't been started. assert(bus.metrics.numEventsPosted.getCount === 5) - assert(queueSize(bus) === 5) + assert(bus.queuedEvents.size === 5) + + // Add the counter to the bus after messages have been queued for later delivery. + bus.addToSharedQueue(counter) assert(counter.count === 0) // Starting listener bus should flush all buffered events @@ -95,9 +97,12 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match Mockito.verify(mockMetricsSystem).registerSource(bus.metrics) bus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) assert(counter.count === 5) - assert(queueSize(bus) === 0) + assert(sharedQueueSize(bus) === 0) assert(eventProcessingTimeCount(bus) === 5) + // After the bus is started, there should be no more queued events. + assert(bus.queuedEvents === null) + // After listener bus has stopped, posting events should not increment counter bus.stop() (1 to 5).foreach { _ => bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) } @@ -188,18 +193,18 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match // Post a message to the listener bus and wait for processing to begin: bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) listenerStarted.acquire() - assert(queueSize(bus) === 0) + assert(sharedQueueSize(bus) === 0) assert(numDroppedEvents(bus) === 0) // If we post an additional message then it should remain in the queue because the listener is // busy processing the first event: bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) - assert(queueSize(bus) === 1) + assert(sharedQueueSize(bus) === 1) assert(numDroppedEvents(bus) === 0) // The queue is now full, so any additional events posted to the listener will be dropped: bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) - assert(queueSize(bus) === 1) + assert(sharedQueueSize(bus) === 1) assert(numDroppedEvents(bus) === 1) // Allow the the remaining events to be processed so we can stop the listener bus: From 2ab4012adda941ebd637bd248f65cefdf4aaf110 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 4 Jan 2018 15:00:09 -0800 Subject: [PATCH 016/566] [SPARK-22948][K8S] Move SparkPodInitContainer to correct package. Author: Marcelo Vanzin Closes #20156 from vanzin/SPARK-22948. 
(cherry picked from commit 95f9659abe8845f9f3f42fd7ababd79e55c52489) Signed-off-by: Marcelo Vanzin --- dev/sparktestsupport/modules.py | 2 +- .../spark/deploy/{rest => }/k8s/SparkPodInitContainer.scala | 2 +- .../deploy/{rest => }/k8s/SparkPodInitContainerSuite.scala | 2 +- .../docker/src/main/dockerfiles/init-container/Dockerfile | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) rename resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/{rest => }/k8s/SparkPodInitContainer.scala (99%) rename resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/{rest => }/k8s/SparkPodInitContainerSuite.scala (98%) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index f834563da9dda..7164180a6a7b0 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -539,7 +539,7 @@ def __hash__(self): kubernetes = Module( name="kubernetes", dependencies=[], - source_file_regexes=["resource-managers/kubernetes/core"], + source_file_regexes=["resource-managers/kubernetes"], build_profile_flags=["-Pkubernetes"], sbt_test_goals=["kubernetes/test"] ) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainer.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkPodInitContainer.scala similarity index 99% rename from resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainer.scala rename to resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkPodInitContainer.scala index 4a4b628aedbbf..c0f08786b76a1 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainer.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkPodInitContainer.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.deploy.rest.k8s +package org.apache.spark.deploy.k8s import java.io.File import java.util.concurrent.TimeUnit diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainerSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SparkPodInitContainerSuite.scala similarity index 98% rename from resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainerSuite.scala rename to resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SparkPodInitContainerSuite.scala index 6c557ec4a7c9a..e0f29ecd0fb53 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainerSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SparkPodInitContainerSuite.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.spark.deploy.rest.k8s +package org.apache.spark.deploy.k8s import java.io.File import java.util.UUID diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile index 055493188fcb7..047056ab2633b 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile @@ -21,4 +21,4 @@ FROM spark-base # command should be invoked from the top level directory of the Spark distribution. E.g.: # docker build -t spark-init:latest -f kubernetes/dockerfiles/init-container/Dockerfile . -ENTRYPOINT [ "/opt/entrypoint.sh", "/opt/spark/bin/spark-class", "org.apache.spark.deploy.rest.k8s.SparkPodInitContainer" ] +ENTRYPOINT [ "/opt/entrypoint.sh", "/opt/spark/bin/spark-class", "org.apache.spark.deploy.k8s.SparkPodInitContainer" ] From 84707f0c6afa9c5417e271657ff930930f82213c Mon Sep 17 00:00:00 2001 From: Yinan Li Date: Thu, 4 Jan 2018 15:35:20 -0800 Subject: [PATCH 017/566] [SPARK-22953][K8S] Avoids adding duplicated secret volumes when init-container is used ## What changes were proposed in this pull request? User-specified secrets are mounted into both the main container and init-container (when it is used) in a Spark driver/executor pod, using the `MountSecretsBootstrap`. Because `MountSecretsBootstrap` always adds new secret volumes for the secrets to the pod, the same secret volumes get added twice, one when mounting the secrets to the main container, and the other when mounting the secrets to the init-container. This PR fixes the issue by separating `MountSecretsBootstrap.mountSecrets` out into two methods: `addSecretVolumes` for adding secret volumes to a pod and `mountSecrets` for mounting secret volumes to a container, respectively. `addSecretVolumes` is only called once for each pod, whereas `mountSecrets` is called individually for the main container and the init-container (if it is used). Ref: https://github.com/apache-spark-on-k8s/spark/issues/594. ## How was this patch tested? Unit tested and manually tested. vanzin This replaces https://github.com/apache/spark/pull/20148. hex108 foxish kimoonkim Author: Yinan Li Closes #20159 from liyinan926/master. 
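A short usage sketch of the split API described above, based on the signatures introduced in the diff below. The pod and container values are placeholders built with fabric8's standard builders, and since `MountSecretsBootstrap` is `private[spark]` a snippet like this would only compile from test code inside the `org.apache.spark` namespace.

```
import io.fabric8.kubernetes.api.model.{ContainerBuilder, PodBuilder}
import org.apache.spark.deploy.k8s.MountSecretsBootstrap

val bootstrap = new MountSecretsBootstrap(Map("my-secret" -> "/mnt/secrets"))

val basePod       = new PodBuilder().withNewMetadata().withName("driver").endMetadata().build()
val mainContainer = new ContainerBuilder().withName("driver-container").build()
val initContainer = new ContainerBuilder().withName("init-container").build()

// Volumes are added to the pod exactly once...
val podWithVolumes = bootstrap.addSecretVolumes(basePod)
// ...while each container that needs the secrets mounts them separately, so using an
// init-container no longer produces duplicate volume entries on the pod.
val mountedMain = bootstrap.mountSecrets(mainContainer)
val mountedInit = bootstrap.mountSecrets(initContainer)
```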
(cherry picked from commit e288fc87a027ec1e1a21401d1f151df20dbfecf3) Signed-off-by: Marcelo Vanzin --- .../deploy/k8s/MountSecretsBootstrap.scala | 30 ++++++++++++------- .../k8s/submit/DriverConfigOrchestrator.scala | 16 +++++----- .../steps/BasicDriverConfigurationStep.scala | 2 +- .../submit/steps/DriverMountSecretsStep.scala | 4 +-- .../InitContainerMountSecretsStep.scala | 11 +++---- .../cluster/k8s/ExecutorPodFactory.scala | 6 ++-- .../k8s/{submit => }/SecretVolumeUtils.scala | 18 +++++------ .../BasicDriverConfigurationStepSuite.scala | 4 +-- .../steps/DriverMountSecretsStepSuite.scala | 4 +-- .../InitContainerMountSecretsStepSuite.scala | 7 +---- .../cluster/k8s/ExecutorPodFactorySuite.scala | 14 +++++---- 11 files changed, 61 insertions(+), 55 deletions(-) rename resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/{submit => }/SecretVolumeUtils.scala (71%) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala index 8286546ce0641..c35e7db51d407 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala @@ -24,26 +24,36 @@ import io.fabric8.kubernetes.api.model.{Container, ContainerBuilder, Pod, PodBui private[spark] class MountSecretsBootstrap(secretNamesToMountPaths: Map[String, String]) { /** - * Mounts Kubernetes secrets as secret volumes into the given container in the given pod. + * Add new secret volumes for the secrets specified in secretNamesToMountPaths into the given pod. * * @param pod the pod into which the secret volumes are being added. - * @param container the container into which the secret volumes are being mounted. - * @return the updated pod and container with the secrets mounted. + * @return the updated pod with the secret volumes added. */ - def mountSecrets(pod: Pod, container: Container): (Pod, Container) = { + def addSecretVolumes(pod: Pod): Pod = { var podBuilder = new PodBuilder(pod) secretNamesToMountPaths.keys.foreach { name => podBuilder = podBuilder .editOrNewSpec() .addNewVolume() - .withName(secretVolumeName(name)) - .withNewSecret() - .withSecretName(name) - .endSecret() - .endVolume() + .withName(secretVolumeName(name)) + .withNewSecret() + .withSecretName(name) + .endSecret() + .endVolume() .endSpec() } + podBuilder.build() + } + + /** + * Mounts Kubernetes secret volumes of the secrets specified in secretNamesToMountPaths into the + * given container. + * + * @param container the container into which the secret volumes are being mounted. + * @return the updated container with the secrets mounted. 
+ */ + def mountSecrets(container: Container): Container = { var containerBuilder = new ContainerBuilder(container) secretNamesToMountPaths.foreach { case (name, path) => containerBuilder = containerBuilder @@ -53,7 +63,7 @@ private[spark] class MountSecretsBootstrap(secretNamesToMountPaths: Map[String, .endVolumeMount() } - (podBuilder.build(), containerBuilder.build()) + containerBuilder.build() } private def secretVolumeName(secretName: String): String = { diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala index 00c9c4ee49177..c9cc300d65569 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala @@ -127,6 +127,12 @@ private[spark] class DriverConfigOrchestrator( Nil } + val mountSecretsStep = if (secretNamesToMountPaths.nonEmpty) { + Seq(new DriverMountSecretsStep(new MountSecretsBootstrap(secretNamesToMountPaths))) + } else { + Nil + } + val initContainerBootstrapStep = if (existNonContainerLocalFiles(sparkJars ++ sparkFiles)) { val orchestrator = new InitContainerConfigOrchestrator( sparkJars, @@ -147,19 +153,13 @@ private[spark] class DriverConfigOrchestrator( Nil } - val mountSecretsStep = if (secretNamesToMountPaths.nonEmpty) { - Seq(new DriverMountSecretsStep(new MountSecretsBootstrap(secretNamesToMountPaths))) - } else { - Nil - } - Seq( initialSubmissionStep, serviceBootstrapStep, kubernetesCredentialsStep) ++ dependencyResolutionStep ++ - initContainerBootstrapStep ++ - mountSecretsStep + mountSecretsStep ++ + initContainerBootstrapStep } private def existNonContainerLocalFiles(files: Seq[String]): Boolean = { diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala index b7a69a7dfd472..eca46b84c6066 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala @@ -119,7 +119,7 @@ private[spark] class BasicDriverConfigurationStep( .endEnv() .addNewEnv() .withName(ENV_DRIVER_ARGS) - .withValue(appArgs.map(arg => "\"" + arg + "\"").mkString(" ")) + .withValue(appArgs.mkString(" ")) .endEnv() .addNewEnv() .withName(ENV_DRIVER_BIND_ADDRESS) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala index f872e0f4b65d1..91e9a9f211335 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala @@ -28,8 +28,8 @@ private[spark] class DriverMountSecretsStep( bootstrap: MountSecretsBootstrap) extends DriverConfigurationStep { override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { - 
val (pod, container) = bootstrap.mountSecrets( - driverSpec.driverPod, driverSpec.driverContainer) + val pod = bootstrap.addSecretVolumes(driverSpec.driverPod) + val container = bootstrap.mountSecrets(driverSpec.driverContainer) driverSpec.copy( driverPod = pod, driverContainer = container diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala index c0e7bb20cce8c..0daa7b95e8aae 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala @@ -28,12 +28,9 @@ private[spark] class InitContainerMountSecretsStep( bootstrap: MountSecretsBootstrap) extends InitContainerConfigurationStep { override def configureInitContainer(spec: InitContainerSpec) : InitContainerSpec = { - val (driverPod, initContainer) = bootstrap.mountSecrets( - spec.driverPod, - spec.initContainer) - spec.copy( - driverPod = driverPod, - initContainer = initContainer - ) + // Mount the secret volumes given that the volumes have already been added to the driver pod + // when mounting the secrets into the main driver container. + val initContainer = bootstrap.mountSecrets(spec.initContainer) + spec.copy(initContainer = initContainer) } } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala index ba5d891f4c77e..066d7e9f70ca5 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala @@ -214,7 +214,7 @@ private[spark] class ExecutorPodFactory( val (maybeSecretsMountedPod, maybeSecretsMountedContainer) = mountSecretsBootstrap.map { bootstrap => - bootstrap.mountSecrets(executorPod, containerWithLimitCores) + (bootstrap.addSecretVolumes(executorPod), bootstrap.mountSecrets(containerWithLimitCores)) }.getOrElse((executorPod, containerWithLimitCores)) val (bootstrappedPod, bootstrappedContainer) = @@ -227,7 +227,9 @@ private[spark] class ExecutorPodFactory( val (pod, mayBeSecretsMountedInitContainer) = initContainerMountSecretsBootstrap.map { bootstrap => - bootstrap.mountSecrets(podWithInitContainer.pod, podWithInitContainer.initContainer) + // Mount the secret volumes given that the volumes have already been added to the + // executor pod when mounting the secrets into the main executor container. 
+ (podWithInitContainer.pod, bootstrap.mountSecrets(podWithInitContainer.initContainer)) }.getOrElse((podWithInitContainer.pod, podWithInitContainer.initContainer)) val bootstrappedPod = KubernetesUtils.appendInitContainer( diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/SecretVolumeUtils.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SecretVolumeUtils.scala similarity index 71% rename from resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/SecretVolumeUtils.scala rename to resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SecretVolumeUtils.scala index 8388c16ded268..16780584a674a 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/SecretVolumeUtils.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SecretVolumeUtils.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.deploy.k8s.submit +package org.apache.spark.deploy.k8s import scala.collection.JavaConverters._ @@ -22,15 +22,15 @@ import io.fabric8.kubernetes.api.model.{Container, Pod} private[spark] object SecretVolumeUtils { - def podHasVolume(driverPod: Pod, volumeName: String): Boolean = { - driverPod.getSpec.getVolumes.asScala.exists(volume => volume.getName == volumeName) + def podHasVolume(pod: Pod, volumeName: String): Boolean = { + pod.getSpec.getVolumes.asScala.exists { volume => + volume.getName == volumeName + } } - def containerHasVolume( - driverContainer: Container, - volumeName: String, - mountPath: String): Boolean = { - driverContainer.getVolumeMounts.asScala.exists(volumeMount => - volumeMount.getName == volumeName && volumeMount.getMountPath == mountPath) + def containerHasVolume(container: Container, volumeName: String, mountPath: String): Boolean = { + container.getVolumeMounts.asScala.exists { volumeMount => + volumeMount.getName == volumeName && volumeMount.getMountPath == mountPath + } } } diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala index e864c6a16eeb1..8ee629ac8ddc1 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala @@ -33,7 +33,7 @@ class BasicDriverConfigurationStepSuite extends SparkFunSuite { private val CONTAINER_IMAGE_PULL_POLICY = "IfNotPresent" private val APP_NAME = "spark-test" private val MAIN_CLASS = "org.apache.spark.examples.SparkPi" - private val APP_ARGS = Array("arg1", "arg2", "arg 3") + private val APP_ARGS = Array("arg1", "arg2", "\"arg 3\"") private val CUSTOM_ANNOTATION_KEY = "customAnnotation" private val CUSTOM_ANNOTATION_VALUE = "customAnnotationValue" private val DRIVER_CUSTOM_ENV_KEY1 = "customDriverEnv1" @@ -82,7 +82,7 @@ class BasicDriverConfigurationStepSuite extends SparkFunSuite { assert(envs(ENV_SUBMIT_EXTRA_CLASSPATH) === "/opt/spark/spark-examples.jar") assert(envs(ENV_DRIVER_MEMORY) === "256M") assert(envs(ENV_DRIVER_MAIN_CLASS) === MAIN_CLASS) - assert(envs(ENV_DRIVER_ARGS) === "\"arg1\" \"arg2\" 
\"arg 3\"") + assert(envs(ENV_DRIVER_ARGS) === "arg1 arg2 \"arg 3\"") assert(envs(DRIVER_CUSTOM_ENV_KEY1) === "customDriverEnv1") assert(envs(DRIVER_CUSTOM_ENV_KEY2) === "customDriverEnv2") diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala index 9ec0cb55de5aa..960d0bda1d011 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala @@ -17,8 +17,8 @@ package org.apache.spark.deploy.k8s.submit.steps import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.MountSecretsBootstrap -import org.apache.spark.deploy.k8s.submit.{KubernetesDriverSpec, SecretVolumeUtils} +import org.apache.spark.deploy.k8s.{MountSecretsBootstrap, SecretVolumeUtils} +import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec class DriverMountSecretsStepSuite extends SparkFunSuite { diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala index eab4e17659456..7ac0bde80dfe6 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala @@ -19,8 +19,7 @@ package org.apache.spark.deploy.k8s.submit.steps.initcontainer import io.fabric8.kubernetes.api.model.{ContainerBuilder, PodBuilder} import org.apache.spark.SparkFunSuite -import org.apache.spark.deploy.k8s.MountSecretsBootstrap -import org.apache.spark.deploy.k8s.submit.SecretVolumeUtils +import org.apache.spark.deploy.k8s.{MountSecretsBootstrap, SecretVolumeUtils} class InitContainerMountSecretsStepSuite extends SparkFunSuite { @@ -44,12 +43,8 @@ class InitContainerMountSecretsStepSuite extends SparkFunSuite { val initContainerMountSecretsStep = new InitContainerMountSecretsStep(mountSecretsBootstrap) val configuredInitContainerSpec = initContainerMountSecretsStep.configureInitContainer( baseInitContainerSpec) - - val podWithSecretsMounted = configuredInitContainerSpec.driverPod val initContainerWithSecretsMounted = configuredInitContainerSpec.initContainer - Seq(s"$SECRET_FOO-volume", s"$SECRET_BAR-volume").foreach(volumeName => - assert(SecretVolumeUtils.podHasVolume(podWithSecretsMounted, volumeName))) Seq(s"$SECRET_FOO-volume", s"$SECRET_BAR-volume").foreach(volumeName => assert(SecretVolumeUtils.containerHasVolume( initContainerWithSecretsMounted, volumeName, SECRET_MOUNT_PATH))) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala index 7121a802c69c1..884da8aabd880 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala +++ 
b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala @@ -25,7 +25,7 @@ import org.mockito.Mockito._ import org.scalatest.{BeforeAndAfter, BeforeAndAfterEach} import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.{InitContainerBootstrap, MountSecretsBootstrap, PodWithDetachedInitContainer} +import org.apache.spark.deploy.k8s.{InitContainerBootstrap, MountSecretsBootstrap, PodWithDetachedInitContainer, SecretVolumeUtils} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ @@ -165,17 +165,19 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef val factory = new ExecutorPodFactory( conf, - None, + Some(secretsBootstrap), Some(initContainerBootstrap), Some(secretsBootstrap)) val executor = factory.createExecutorPod( "1", "dummy", "dummy", Seq[(String, String)](), driverPod, Map[String, Int]()) + assert(executor.getSpec.getVolumes.size() === 1) + assert(SecretVolumeUtils.podHasVolume(executor, "secret1-volume")) + assert(SecretVolumeUtils.containerHasVolume( + executor.getSpec.getContainers.get(0), "secret1-volume", "/var/secret1")) assert(executor.getSpec.getInitContainers.size() === 1) - assert(executor.getSpec.getInitContainers.get(0).getVolumeMounts.get(0).getName - === "secret1-volume") - assert(executor.getSpec.getInitContainers.get(0).getVolumeMounts.get(0) - .getMountPath === "/var/secret1") + assert(SecretVolumeUtils.containerHasVolume( + executor.getSpec.getInitContainers.get(0), "secret1-volume", "/var/secret1")) checkOwnerReferences(executor, driverPodUid) } From ea9da6152af9223787cffd83d489741b4cc5aa34 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 4 Jan 2018 16:34:56 -0800 Subject: [PATCH 018/566] [SPARK-22960][K8S] Make build-push-docker-images.sh more dev-friendly. - Make it possible to build images from a git clone. - Make it easy to use minikube to test things. Also fixed what seemed like a bug: the base image wasn't getting the tag provided in the command line. Adding the tag allows users to use multiple Spark builds in the same kubernetes cluster. Tested by deploying images on minikube and running spark-submit from a dev environment; also by building the images with different tags and verifying "docker images" in minikube. Author: Marcelo Vanzin Closes #20154 from vanzin/SPARK-22960. (cherry picked from commit 0428368c2c5e135f99f62be20877bbbda43be310) Signed-off-by: Marcelo Vanzin --- docs/running-on-kubernetes.md | 9 +- .../src/main/dockerfiles/driver/Dockerfile | 3 +- .../src/main/dockerfiles/executor/Dockerfile | 3 +- .../dockerfiles/init-container/Dockerfile | 3 +- .../main/dockerfiles/spark-base/Dockerfile | 7 +- sbin/build-push-docker-images.sh | 120 +++++++++++++++--- 6 files changed, 117 insertions(+), 28 deletions(-) diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index e491329136a3c..2d69f636472ae 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -16,6 +16,9 @@ Kubernetes scheduler that has been added to Spark. you may setup a test cluster on your local machine using [minikube](https://kubernetes.io/docs/getting-started-guides/minikube/). * We recommend using the latest release of minikube with the DNS addon enabled. + * Be aware that the default minikube configuration is not enough for running Spark applications. + We recommend 3 CPUs and 4g of memory to be able to start a simple Spark application with a single + executor. 
* You must have appropriate permissions to list, create, edit and delete [pods](https://kubernetes.io/docs/user-guide/pods/) in your cluster. You can verify that you can list these resources by running `kubectl auth can-i pods`. @@ -197,7 +200,7 @@ kubectl port-forward 4040:4040 Then, the Spark driver UI can be accessed on `http://localhost:4040`. -### Debugging +### Debugging There may be several kinds of failures. If the Kubernetes API server rejects the request made from spark-submit, or the connection is refused for a different reason, the submission logic should indicate the error encountered. However, if there @@ -215,8 +218,8 @@ If the pod has encountered a runtime error, the status can be probed further usi kubectl logs ``` -Status and logs of failed executor pods can be checked in similar ways. Finally, deleting the driver pod will clean up the entire spark -application, includling all executors, associated service, etc. The driver pod can be thought of as the Kubernetes representation of +Status and logs of failed executor pods can be checked in similar ways. Finally, deleting the driver pod will clean up the entire spark +application, including all executors, associated service, etc. The driver pod can be thought of as the Kubernetes representation of the Spark application. ## Kubernetes Features diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile index 45fbcd9cd0deb..ff5289e10c21e 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile @@ -15,7 +15,8 @@ # limitations under the License. # -FROM spark-base +ARG base_image +FROM ${base_image} # Before building the docker image, first build and make a Spark distribution following # the instructions in http://spark.apache.org/docs/latest/building-spark.html. diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile index 0f806cf7e148e..3eabb42d4d852 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile @@ -15,7 +15,8 @@ # limitations under the License. # -FROM spark-base +ARG base_image +FROM ${base_image} # Before building the docker image, first build and make a Spark distribution following # the instructions in http://spark.apache.org/docs/latest/building-spark.html. diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile index 047056ab2633b..e0a249e0ac71f 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile @@ -15,7 +15,8 @@ # limitations under the License. # -FROM spark-base +ARG base_image +FROM ${base_image} # If this docker file is being used in the context of building your images from a Spark distribution, the docker build # command should be invoked from the top level directory of the Spark distribution. 
E.g.: diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile index 222e777db3a82..da1d6b9e161cc 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile @@ -17,6 +17,9 @@ FROM openjdk:8-alpine +ARG spark_jars +ARG img_path + # Before building the docker image, first build and make a Spark distribution following # the instructions in http://spark.apache.org/docs/latest/building-spark.html. # If this docker file is being used in the context of building your images from a Spark @@ -34,11 +37,11 @@ RUN set -ex && \ ln -sv /bin/bash /bin/sh && \ chgrp root /etc/passwd && chmod ug+rw /etc/passwd -COPY jars /opt/spark/jars +COPY ${spark_jars} /opt/spark/jars COPY bin /opt/spark/bin COPY sbin /opt/spark/sbin COPY conf /opt/spark/conf -COPY kubernetes/dockerfiles/spark-base/entrypoint.sh /opt/ +COPY ${img_path}/spark-base/entrypoint.sh /opt/ ENV SPARK_HOME /opt/spark diff --git a/sbin/build-push-docker-images.sh b/sbin/build-push-docker-images.sh index b3137598692d8..bb8806dd33f37 100755 --- a/sbin/build-push-docker-images.sh +++ b/sbin/build-push-docker-images.sh @@ -19,29 +19,94 @@ # This script builds and pushes docker images when run from a release of Spark # with Kubernetes support. -declare -A path=( [spark-driver]=kubernetes/dockerfiles/driver/Dockerfile \ - [spark-executor]=kubernetes/dockerfiles/executor/Dockerfile \ - [spark-init]=kubernetes/dockerfiles/init-container/Dockerfile ) +function error { + echo "$@" 1>&2 + exit 1 +} + +# Detect whether this is a git clone or a Spark distribution and adjust paths +# accordingly. +if [ -z "${SPARK_HOME}" ]; then + SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" +fi +. "${SPARK_HOME}/bin/load-spark-env.sh" + +if [ -f "$SPARK_HOME/RELEASE" ]; then + IMG_PATH="kubernetes/dockerfiles" + SPARK_JARS="jars" +else + IMG_PATH="resource-managers/kubernetes/docker/src/main/dockerfiles" + SPARK_JARS="assembly/target/scala-$SPARK_SCALA_VERSION/jars" +fi + +if [ ! -d "$IMG_PATH" ]; then + error "Cannot find docker images. This script must be run from a runnable distribution of Apache Spark." +fi + +declare -A path=( [spark-driver]="$IMG_PATH/driver/Dockerfile" \ + [spark-executor]="$IMG_PATH/executor/Dockerfile" \ + [spark-init]="$IMG_PATH/init-container/Dockerfile" ) + +function image_ref { + local image="$1" + local add_repo="${2:-1}" + if [ $add_repo = 1 ] && [ -n "$REPO" ]; then + image="$REPO/$image" + fi + if [ -n "$TAG" ]; then + image="$image:$TAG" + fi + echo "$image" +} function build { - docker build -t spark-base -f kubernetes/dockerfiles/spark-base/Dockerfile . + local base_image="$(image_ref spark-base 0)" + docker build --build-arg "spark_jars=$SPARK_JARS" \ + --build-arg "img_path=$IMG_PATH" \ + -t "$base_image" \ + -f "$IMG_PATH/spark-base/Dockerfile" . for image in "${!path[@]}"; do - docker build -t ${REPO}/$image:${TAG} -f ${path[$image]} . + docker build --build-arg "base_image=$base_image" -t "$(image_ref $image)" -f ${path[$image]} . done } - function push { for image in "${!path[@]}"; do - docker push ${REPO}/$image:${TAG} + docker push "$(image_ref $image)" done } function usage { - echo "This script must be run from a runnable distribution of Apache Spark." 
- echo "Usage: ./sbin/build-push-docker-images.sh -r -t build" - echo " ./sbin/build-push-docker-images.sh -r -t push" - echo "for example: ./sbin/build-push-docker-images.sh -r docker.io/myrepo -t v2.3.0 push" + cat </dev/null; then + error "Cannot find minikube." + fi + eval $(minikube docker-env) + ;; esac done -if [ -z "$REPO" ] || [ -z "$TAG" ]; then +case "${@: -1}" in + build) + build + ;; + push) + if [ -z "$REPO" ]; then + usage + exit 1 + fi + push + ;; + *) usage -else - case "${@: -1}" in - build) build;; - push) push;; - *) usage;; - esac -fi + exit 1 + ;; +esac From 158f7e6a93b5acf4ce05c97b575124fd599cf927 Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Fri, 5 Jan 2018 10:16:34 +0800 Subject: [PATCH 019/566] [SPARK-22957] ApproxQuantile breaks if the number of rows exceeds MaxInt ## What changes were proposed in this pull request? 32bit Int was used for row rank. That overflowed in a dataframe with more than 2B rows. ## How was this patch tested? Added test, but ignored, as it takes 4 minutes. Author: Juliusz Sompolski Closes #20152 from juliuszsompolski/SPARK-22957. (cherry picked from commit df7fc3ef3899cadd252d2837092bebe3442d6523) Signed-off-by: Wenchen Fan --- .../aggregate/ApproximatePercentile.scala | 12 ++++++------ .../spark/sql/catalyst/util/QuantileSummaries.scala | 8 ++++---- .../org/apache/spark/sql/DataFrameStatSuite.scala | 8 ++++++++ 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala index 149ac265e6ed5..a45854a3b5146 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala @@ -296,8 +296,8 @@ object ApproximatePercentile { Ints.BYTES + Doubles.BYTES + Longs.BYTES + // length of summary.sampled Ints.BYTES + - // summary.sampled, Array[Stat(value: Double, g: Int, delta: Int)] - summaries.sampled.length * (Doubles.BYTES + Ints.BYTES + Ints.BYTES) + // summary.sampled, Array[Stat(value: Double, g: Long, delta: Long)] + summaries.sampled.length * (Doubles.BYTES + Longs.BYTES + Longs.BYTES) } final def serialize(obj: PercentileDigest): Array[Byte] = { @@ -312,8 +312,8 @@ object ApproximatePercentile { while (i < summary.sampled.length) { val stat = summary.sampled(i) buffer.putDouble(stat.value) - buffer.putInt(stat.g) - buffer.putInt(stat.delta) + buffer.putLong(stat.g) + buffer.putLong(stat.delta) i += 1 } buffer.array() @@ -330,8 +330,8 @@ object ApproximatePercentile { var i = 0 while (i < sampledLength) { val value = buffer.getDouble() - val g = buffer.getInt() - val delta = buffer.getInt() + val g = buffer.getLong() + val delta = buffer.getLong() sampled(i) = Stats(value, g, delta) i += 1 } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala index eb7941cf9e6af..b013add9c9778 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala @@ -105,7 +105,7 @@ class QuantileSummaries( if (newSamples.isEmpty || (sampleIdx == sampled.length && opsIdx == sorted.length - 1)) { 0 } else { - 
math.floor(2 * relativeError * currentCount).toInt + math.floor(2 * relativeError * currentCount).toLong } val tuple = Stats(currentSample, 1, delta) @@ -192,10 +192,10 @@ class QuantileSummaries( } // Target rank - val rank = math.ceil(quantile * count).toInt + val rank = math.ceil(quantile * count).toLong val targetError = relativeError * count // Minimum rank at current sample - var minRank = 0 + var minRank = 0L var i = 0 while (i < sampled.length - 1) { val curSample = sampled(i) @@ -235,7 +235,7 @@ object QuantileSummaries { * @param g the minimum rank jump from the previous value's minimum rank * @param delta the maximum span of the rank. */ - case class Stats(value: Double, g: Int, delta: Int) + case class Stats(value: Double, g: Long, delta: Long) private def compressImmut( currentSamples: IndexedSeq[Stats], diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 46b21c3b64a2e..5169d2b5fc6b2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -260,6 +260,14 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { assert(res2(1).isEmpty) } + // SPARK-22957: check for 32bit overflow when computing rank. + // ignored - takes 4 minutes to run. + ignore("approx quantile 4: test for Int overflow") { + val res = spark.range(3000000000L).stat.approxQuantile("id", Array(0.8, 0.9), 0.05) + assert(res(0) > 2200000000.0) + assert(res(1) > 2200000000.0) + } + test("crosstab") { withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> "false") { val rng = new Random() From 145820bda140d1385c4dd802fa79a871e6bf98be Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Fri, 5 Jan 2018 14:02:21 +0800 Subject: [PATCH 020/566] [SPARK-22825][SQL] Fix incorrect results of Casting Array to String ## What changes were proposed in this pull request? This pr fixed the issue when casting arrays into strings; ``` scala> val df = spark.range(10).select('id.cast("integer")).agg(collect_list('id).as('ids)) scala> df.write.saveAsTable("t") scala> sql("SELECT cast(ids as String) FROM t").show(false) +------------------------------------------------------------------+ |ids | +------------------------------------------------------------------+ |org.apache.spark.sql.catalyst.expressions.UnsafeArrayData8bc285df| +------------------------------------------------------------------+ ``` This pr modified the result into; ``` +------------------------------+ |ids | +------------------------------+ |[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]| +------------------------------+ ``` ## How was this patch tested? Added tests in `CastSuite` and `SQLQuerySuite`. Author: Takeshi Yamamuro Closes #20024 from maropu/SPARK-22825. 
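The new formatting also applies to nested arrays, as covered by the `CastSuite` cases added below. A quick spark-shell check of that case; the table shows the output expected after this patch (exact column formatting may differ):

```
scala> sql("SELECT CAST(array(array(1, 2, 3), array(4, 5)) AS STRING) AS ids").show(false)
+-------------------+
|ids                |
+-------------------+
|[[1, 2, 3], [4, 5]]|
+-------------------+
```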
(cherry picked from commit 52fc5c17d9d784b846149771b398e741621c0b5c) Signed-off-by: Wenchen Fan --- .../codegen/UTF8StringBuilder.java | 78 +++++++++++++++++++ .../spark/sql/catalyst/expressions/Cast.scala | 68 ++++++++++++++++ .../sql/catalyst/expressions/CastSuite.scala | 25 ++++++ .../org/apache/spark/sql/SQLQuerySuite.scala | 2 - 4 files changed, 171 insertions(+), 2 deletions(-) create mode 100644 sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java new file mode 100644 index 0000000000000..f0f66bae245fd --- /dev/null +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions.codegen; + +import org.apache.spark.unsafe.Platform; +import org.apache.spark.unsafe.array.ByteArrayMethods; +import org.apache.spark.unsafe.types.UTF8String; + +/** + * A helper class to write {@link UTF8String}s to an internal buffer and build the concatenated + * {@link UTF8String} at the end. + */ +public class UTF8StringBuilder { + + private static final int ARRAY_MAX = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH; + + private byte[] buffer; + private int cursor = Platform.BYTE_ARRAY_OFFSET; + + public UTF8StringBuilder() { + // Since initial buffer size is 16 in `StringBuilder`, we set the same size here + this.buffer = new byte[16]; + } + + // Grows the buffer by at least `neededSize` + private void grow(int neededSize) { + if (neededSize > ARRAY_MAX - totalSize()) { + throw new UnsupportedOperationException( + "Cannot grow internal buffer by size " + neededSize + " because the size after growing " + + "exceeds size limitation " + ARRAY_MAX); + } + final int length = totalSize() + neededSize; + if (buffer.length < length) { + int newLength = length < ARRAY_MAX / 2 ? 
length * 2 : ARRAY_MAX; + final byte[] tmp = new byte[newLength]; + Platform.copyMemory( + buffer, + Platform.BYTE_ARRAY_OFFSET, + tmp, + Platform.BYTE_ARRAY_OFFSET, + totalSize()); + buffer = tmp; + } + } + + private int totalSize() { + return cursor - Platform.BYTE_ARRAY_OFFSET; + } + + public void append(UTF8String value) { + grow(value.numBytes()); + value.writeToMemory(buffer, cursor); + cursor += value.numBytes(); + } + + public void append(String value) { + append(UTF8String.fromString(value)); + } + + public UTF8String build() { + return UTF8String.fromBytes(buffer, 0, totalSize()); + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 274d8813f16db..d4fc5e0f168a7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -206,6 +206,28 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String case DateType => buildCast[Int](_, d => UTF8String.fromString(DateTimeUtils.dateToString(d))) case TimestampType => buildCast[Long](_, t => UTF8String.fromString(DateTimeUtils.timestampToString(t, timeZone))) + case ArrayType(et, _) => + buildCast[ArrayData](_, array => { + val builder = new UTF8StringBuilder + builder.append("[") + if (array.numElements > 0) { + val toUTF8String = castToString(et) + if (!array.isNullAt(0)) { + builder.append(toUTF8String(array.get(0, et)).asInstanceOf[UTF8String]) + } + var i = 1 + while (i < array.numElements) { + builder.append(",") + if (!array.isNullAt(i)) { + builder.append(" ") + builder.append(toUTF8String(array.get(i, et)).asInstanceOf[UTF8String]) + } + i += 1 + } + } + builder.append("]") + builder.build() + }) case _ => buildCast[Any](_, o => UTF8String.fromString(o.toString)) } @@ -597,6 +619,41 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String """ } + private def writeArrayToStringBuilder( + et: DataType, + array: String, + buffer: String, + ctx: CodegenContext): String = { + val elementToStringCode = castToStringCode(et, ctx) + val funcName = ctx.freshName("elementToString") + val elementToStringFunc = ctx.addNewFunction(funcName, + s""" + |private UTF8String $funcName(${ctx.javaType(et)} element) { + | UTF8String elementStr = null; + | ${elementToStringCode("element", "elementStr", null /* resultIsNull won't be used */)} + | return elementStr; + |} + """.stripMargin) + + val loopIndex = ctx.freshName("loopIndex") + s""" + |$buffer.append("["); + |if ($array.numElements() > 0) { + | if (!$array.isNullAt(0)) { + | $buffer.append($elementToStringFunc(${ctx.getValue(array, et, "0")})); + | } + | for (int $loopIndex = 1; $loopIndex < $array.numElements(); $loopIndex++) { + | $buffer.append(","); + | if (!$array.isNullAt($loopIndex)) { + | $buffer.append(" "); + | $buffer.append($elementToStringFunc(${ctx.getValue(array, et, loopIndex)})); + | } + | } + |} + |$buffer.append("]"); + """.stripMargin + } + private[this] def castToStringCode(from: DataType, ctx: CodegenContext): CastFunction = { from match { case BinaryType => @@ -608,6 +665,17 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String val tz = ctx.addReferenceObj("timeZone", timeZone) (c, evPrim, evNull) => s"""$evPrim = UTF8String.fromString( org.apache.spark.sql.catalyst.util.DateTimeUtils.timestampToString($c, $tz));""" + case ArrayType(et, _) 
=> + (c, evPrim, evNull) => { + val buffer = ctx.freshName("buffer") + val bufferClass = classOf[UTF8StringBuilder].getName + val writeArrayElemCode = writeArrayToStringBuilder(et, c, buffer, ctx) + s""" + |$bufferClass $buffer = new $bufferClass(); + |$writeArrayElemCode; + |$evPrim = $buffer.build(); + """.stripMargin + } case _ => (c, evPrim, evNull) => s"$evPrim = UTF8String.fromString(String.valueOf($c));" } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index 1dd040e4696a1..e3ed7171defd8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -853,4 +853,29 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { cast("2", LongType).genCode(ctx) assert(ctx.inlinedMutableStates.length == 0) } + + test("SPARK-22825 Cast array to string") { + val ret1 = cast(Literal.create(Array(1, 2, 3, 4, 5)), StringType) + checkEvaluation(ret1, "[1, 2, 3, 4, 5]") + val ret2 = cast(Literal.create(Array("ab", "cde", "f")), StringType) + checkEvaluation(ret2, "[ab, cde, f]") + val ret3 = cast(Literal.create(Array("ab", null, "c")), StringType) + checkEvaluation(ret3, "[ab,, c]") + val ret4 = cast(Literal.create(Array("ab".getBytes, "cde".getBytes, "f".getBytes)), StringType) + checkEvaluation(ret4, "[ab, cde, f]") + val ret5 = cast( + Literal.create(Array("2014-12-03", "2014-12-04", "2014-12-06").map(Date.valueOf)), + StringType) + checkEvaluation(ret5, "[2014-12-03, 2014-12-04, 2014-12-06]") + val ret6 = cast( + Literal.create(Array("2014-12-03 13:01:00", "2014-12-04 15:05:00").map(Timestamp.valueOf)), + StringType) + checkEvaluation(ret6, "[2014-12-03 13:01:00, 2014-12-04 15:05:00]") + val ret7 = cast(Literal.create(Array(Array(1, 2, 3), Array(4, 5))), StringType) + checkEvaluation(ret7, "[[1, 2, 3], [4, 5]]") + val ret8 = cast( + Literal.create(Array(Array(Array("a"), Array("b", "c")), Array(Array("d")))), + StringType) + checkEvaluation(ret8, "[[[a], [b, c]], [[d]]]") + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 5e077285ade55..96bf65fce9c4a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -28,8 +28,6 @@ import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, SortAggregateExec} -import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} -import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, CartesianProductExec, SortMergeJoinExec} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf From 5b524cc0cd5a82e4fb0681363b6641e40b37075d Mon Sep 17 00:00:00 2001 From: Bago Amirbekian Date: Thu, 4 Jan 2018 22:45:15 -0800 Subject: [PATCH 021/566] [SPARK-22949][ML] Apply CrossValidator approach to Driver/Distributed memory tradeoff for TrainValidationSplit ## What changes were proposed in this pull request? Avoid holding all models in memory for `TrainValidationSplit`. 
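The idea, in a standalone sketch (the function and type names here are illustrative, not the ML API): fit and score inside the same `Future`, so each model can be garbage-collected as soon as its metric is computed instead of every model staying referenced until evaluation finishes.

```
import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration.Duration

def evaluateAll[P, M](params: Seq[P], fit: P => M, score: M => Double)
                     (implicit ec: ExecutionContext): Seq[Double] = {
  val metricFutures = params.map { p =>
    Future {
      val model = fit(p)   // the model is local to this future...
      score(model)         // ...and becomes unreachable once its metric is returned
    }
  }
  metricFutures.map(Await.result(_, Duration.Inf))
}
```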
## How was this patch tested? Existing tests. Author: Bago Amirbekian Closes #20143 from MrBago/trainValidMemoryFix. (cherry picked from commit cf0aa65576acbe0209c67f04c029058fd73555c1) Signed-off-by: Joseph K. Bradley --- .../spark/ml/tuning/CrossValidator.scala | 4 +++- .../spark/ml/tuning/TrainValidationSplit.scala | 18 ++++-------------- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 095b54c0fe83f..a0b507d2e718c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -160,8 +160,10 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String) } (executionContext) } - // Wait for metrics to be calculated before unpersisting validation dataset + // Wait for metrics to be calculated val foldMetrics = foldMetricFutures.map(ThreadUtils.awaitResult(_, Duration.Inf)) + + // Unpersist training & validation set once all metrics have been produced trainingDataset.unpersist() validationDataset.unpersist() foldMetrics diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala index c73bd18475475..8826ef3271bc1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala @@ -143,24 +143,13 @@ class TrainValidationSplit @Since("1.5.0") (@Since("1.5.0") override val uid: St // Fit models in a Future for training in parallel logDebug(s"Train split with multiple sets of parameters.") - val modelFutures = epm.zipWithIndex.map { case (paramMap, paramIndex) => - Future[Model[_]] { + val metricFutures = epm.zipWithIndex.map { case (paramMap, paramIndex) => + Future[Double] { val model = est.fit(trainingDataset, paramMap).asInstanceOf[Model[_]] if (collectSubModelsParam) { subModels.get(paramIndex) = model } - model - } (executionContext) - } - - // Unpersist training data only when all models have trained - Future.sequence[Model[_], Iterable](modelFutures)(implicitly, executionContext) - .onComplete { _ => trainingDataset.unpersist() } (executionContext) - - // Evaluate models in a Future that will calulate a metric and allow model to be cleaned up - val metricFutures = modelFutures.zip(epm).map { case (modelFuture, paramMap) => - modelFuture.map { model => // TODO: duplicate evaluator to take extra params from input val metric = eval.evaluate(model.transform(validationDataset, paramMap)) logDebug(s"Got metric $metric for model trained with $paramMap.") @@ -171,7 +160,8 @@ class TrainValidationSplit @Since("1.5.0") (@Since("1.5.0") override val uid: St // Wait for all metrics to be calculated val metrics = metricFutures.map(ThreadUtils.awaitResult(_, Duration.Inf)) - // Unpersist validation set once all metrics have been produced + // Unpersist training & validation set once all metrics have been produced + trainingDataset.unpersist() validationDataset.unpersist() logInfo(s"Train validation split metrics: ${metrics.toSeq}") From f9dcdbcefb545ced3f5b457e1e88c88a8e180f9f Mon Sep 17 00:00:00 2001 From: Yinan Li Date: Thu, 4 Jan 2018 23:23:41 -0800 Subject: [PATCH 022/566] [SPARK-22757][K8S] Enable spark.jars and spark.files in KUBERNETES mode ## What changes were proposed in this pull request? 
We missed enabling `spark.files` and `spark.jars` in https://github.com/apache/spark/pull/19954. The result is that remote dependencies specified through `spark.files` or `spark.jars` are not included in the list of remote dependencies to be downloaded by the init-container. This PR fixes it. ## How was this patch tested? Manual tests. vanzin This replaces https://github.com/apache/spark/pull/20157. foxish Author: Yinan Li Closes #20160 from liyinan926/SPARK-22757. (cherry picked from commit 6cff7d19f6a905fe425bd6892fe7ca014c0e696b) Signed-off-by: Felix Cheung --- .../src/main/scala/org/apache/spark/deploy/SparkSubmit.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index cbe1f2c3e08a1..1e381965c52ba 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -584,10 +584,11 @@ object SparkSubmit extends CommandLineUtils with Logging { confKey = "spark.executor.memory"), OptionAssigner(args.totalExecutorCores, STANDALONE | MESOS | KUBERNETES, ALL_DEPLOY_MODES, confKey = "spark.cores.max"), - OptionAssigner(args.files, LOCAL | STANDALONE | MESOS, ALL_DEPLOY_MODES, + OptionAssigner(args.files, LOCAL | STANDALONE | MESOS | KUBERNETES, ALL_DEPLOY_MODES, confKey = "spark.files"), OptionAssigner(args.jars, LOCAL, CLIENT, confKey = "spark.jars"), - OptionAssigner(args.jars, STANDALONE | MESOS, ALL_DEPLOY_MODES, confKey = "spark.jars"), + OptionAssigner(args.jars, STANDALONE | MESOS | KUBERNETES, ALL_DEPLOY_MODES, + confKey = "spark.jars"), OptionAssigner(args.driverMemory, STANDALONE | MESOS | YARN | KUBERNETES, CLUSTER, confKey = "spark.driver.memory"), OptionAssigner(args.driverCores, STANDALONE | MESOS | YARN | KUBERNETES, CLUSTER, From fd4e30476894b7c37cc2ae6243a941f0bc90388d Mon Sep 17 00:00:00 2001 From: Adrian Ionescu Date: Fri, 5 Jan 2018 21:32:39 +0800 Subject: [PATCH 023/566] [SPARK-22961][REGRESSION] Constant columns should generate QueryPlanConstraints ## What changes were proposed in this pull request? #19201 introduced the following regression: given something like `df.withColumn("c", lit(2))`, we're no longer picking up `c === 2` as a constraint and infer filters from it when joins are involved, which may lead to noticeable performance degradation. This patch re-enables this optimization by picking up Aliases of Literals in Projection lists as constraints and making sure they're not treated as aliased columns. ## How was this patch tested? Unit test was added. Author: Adrian Ionescu Closes #20155 from adrian-ionescu/constant_constraints. 
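The regression is easiest to see with a join against such a constant column. A sketch of the scenario (assuming a `spark` session), mirroring the unit test added below; after this patch the optimized plan should again contain an inferred `a = 2` filter on the left side.

```
import org.apache.spark.sql.functions.lit
import spark.implicits._

val left  = Seq(1, 2, 3).toDF("a")
val right = Seq("x", "y").toDF("s").withColumn("two", lit(2))

// The constraint `two = 2` on the right side lets the optimizer infer `a = 2` from the
// equi-join condition, filtering `left` before the join instead of scanning all of it.
left.join(right, left("a") === right("two")).explain(true)
```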
(cherry picked from commit 51c33bd0d402af9e0284c6cbc0111f926446bfba) Signed-off-by: gatorsmile --- .../sql/catalyst/plans/logical/LogicalPlan.scala | 2 ++ .../plans/logical/QueryPlanConstraints.scala | 2 +- .../InferFiltersFromConstraintsSuite.scala | 13 +++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index a38458add7b5e..ff2a0ec588567 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -247,6 +247,8 @@ abstract class UnaryNode extends LogicalPlan { protected def getAliasedConstraints(projectList: Seq[NamedExpression]): Set[Expression] = { var allConstraints = child.constraints.asInstanceOf[Set[Expression]] projectList.foreach { + case a @ Alias(l: Literal, _) => + allConstraints += EqualTo(a.toAttribute, l) case a @ Alias(e, _) => // For every alias in `projectList`, replace the reference in constraints by its attribute. allConstraints ++= allConstraints.map(_ transform { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala index b0f611fd38dea..9c0a30a47f839 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala @@ -98,7 +98,7 @@ trait QueryPlanConstraints { self: LogicalPlan => // we may avoid producing recursive constraints. private lazy val aliasMap: AttributeMap[Expression] = AttributeMap( expressions.collect { - case a: Alias => (a.toAttribute, a.child) + case a: Alias if !a.child.isInstanceOf[Literal] => (a.toAttribute, a.child) } ++ children.flatMap(_.asInstanceOf[QueryPlanConstraints].aliasMap)) // Note: the explicit cast is necessary, since Scala compiler fails to infer the type. 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala index 5580f8604ec72..a0708bf7eee9a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala @@ -236,4 +236,17 @@ class InferFiltersFromConstraintsSuite extends PlanTest { comparePlans(optimized, originalQuery) } } + + test("constraints should be inferred from aliased literals") { + val originalLeft = testRelation.subquery('left).as("left") + val optimizedLeft = testRelation.subquery('left).where(IsNotNull('a) && 'a === 2).as("left") + + val right = Project(Seq(Literal(2).as("two")), testRelation.subquery('right)).as("right") + val condition = Some("left.a".attr === "right.two".attr) + + val original = originalLeft.join(right, Inner, condition) + val correct = optimizedLeft.join(right, Inner, condition) + + comparePlans(Optimize.execute(original.analyze), correct.analyze) + } } From 0a30e93507ba784729a498943e7eeda1d6f19fbf Mon Sep 17 00:00:00 2001 From: Bruce Robbins Date: Fri, 5 Jan 2018 09:58:28 -0800 Subject: [PATCH 024/566] [SPARK-22940][SQL] HiveExternalCatalogVersionsSuite should succeed on platforms that don't have wget ## What changes were proposed in this pull request? Modified HiveExternalCatalogVersionsSuite.scala to use Utils.doFetchFile to download different versions of Spark binaries rather than launching wget as an external process. On platforms that don't have wget installed, this suite fails with an error. cloud-fan : would you like to check this change? ## How was this patch tested? 1) test-only of HiveExternalCatalogVersionsSuite on several platforms. Tested bad mirror, read timeout, and redirects. 2) ./dev/run-tests Author: Bruce Robbins Closes #20147 from bersprockets/SPARK-22940-alt. (cherry picked from commit c0b7424ecacb56d3e7a18acc11ba3d5e7be57c43) Signed-off-by: Marcelo Vanzin --- .../HiveExternalCatalogVersionsSuite.scala | 48 ++++++++++++++++--- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index a3d5b941a6761..ae4aeb7b4ce4a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -18,11 +18,14 @@ package org.apache.spark.sql.hive import java.io.File -import java.nio.file.Files +import java.nio.charset.StandardCharsets +import java.nio.file.{Files, Paths} import scala.sys.process._ -import org.apache.spark.TestUtils +import org.apache.hadoop.conf.Configuration + +import org.apache.spark.{SecurityManager, SparkConf, TestUtils} import org.apache.spark.sql.{QueryTest, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.CatalogTableType @@ -55,14 +58,19 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { private def tryDownloadSpark(version: String, path: String): Unit = { // Try mirrors a few times until one succeeds for (i <- 0 until 3) { + // we don't retry on a failure to get mirror url. 
If we can't get a mirror url, + // the test fails (getStringFromUrl will throw an exception) val preferredMirror = - Seq("wget", "https://www.apache.org/dyn/closer.lua?preferred=true", "-q", "-O", "-").!!.trim - val url = s"$preferredMirror/spark/spark-$version/spark-$version-bin-hadoop2.7.tgz" + getStringFromUrl("https://www.apache.org/dyn/closer.lua?preferred=true") + val filename = s"spark-$version-bin-hadoop2.7.tgz" + val url = s"$preferredMirror/spark/spark-$version/$filename" logInfo(s"Downloading Spark $version from $url") - if (Seq("wget", url, "-q", "-P", path).! == 0) { + try { + getFileFromUrl(url, path, filename) return + } catch { + case ex: Exception => logWarning(s"Failed to download Spark $version from $url", ex) } - logWarning(s"Failed to download Spark $version from $url") } fail(s"Unable to download Spark $version") } @@ -85,6 +93,34 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { new File(tmpDataDir, name).getCanonicalPath } + private def getFileFromUrl(urlString: String, targetDir: String, filename: String): Unit = { + val conf = new SparkConf + // if the caller passes the name of an existing file, we want doFetchFile to write over it with + // the contents from the specified url. + conf.set("spark.files.overwrite", "true") + val securityManager = new SecurityManager(conf) + val hadoopConf = new Configuration + + val outDir = new File(targetDir) + if (!outDir.exists()) { + outDir.mkdirs() + } + + // propagate exceptions up to the caller of getFileFromUrl + Utils.doFetchFile(urlString, outDir, filename, conf, securityManager, hadoopConf) + } + + private def getStringFromUrl(urlString: String): String = { + val contentFile = File.createTempFile("string-", ".txt") + contentFile.deleteOnExit() + + // exceptions will propagate to the caller of getStringFromUrl + getFileFromUrl(urlString, contentFile.getParent, contentFile.getName) + + val contentPath = Paths.get(contentFile.toURI) + new String(Files.readAllBytes(contentPath), StandardCharsets.UTF_8) + } + override def beforeAll(): Unit = { super.beforeAll() From d1f422c1c12c8095e8522d1051a6e0e406748a3a Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Fri, 5 Jan 2018 11:51:25 -0800 Subject: [PATCH 025/566] [SPARK-13030][ML] Follow-up cleanups for OneHotEncoderEstimator ## What changes were proposed in this pull request? Follow-up cleanups for the OneHotEncoderEstimator PR. See some discussion in the original PR: https://github.com/apache/spark/pull/19527 or read below for what this PR includes: * configedCategorySize: I reverted this to return an Array. I realized the original setup (which I had recommended in the original PR) caused the whole model to be serialized in the UDF. * encoder: I reorganized the logic to show what I meant in the comment in the previous PR. I think it's simpler but am open to suggestions. I also made some small style cleanups based on IntelliJ warnings. ## How was this patch tested? Existing unit tests Author: Joseph K. Bradley Closes #20132 from jkbradley/viirya-SPARK-13030. (cherry picked from commit 930b90a84871e2504b57ed50efa7b8bb52d3ba44) Signed-off-by: Joseph K. 
Bradley --- .../ml/feature/OneHotEncoderEstimator.scala | 92 ++++++++++--------- 1 file changed, 49 insertions(+), 43 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoderEstimator.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoderEstimator.scala index 074622d41e28d..bd1e3426c8780 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoderEstimator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoderEstimator.scala @@ -30,24 +30,27 @@ import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions.{col, lit, udf} -import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType} +import org.apache.spark.sql.types.{DoubleType, StructField, StructType} /** Private trait for params and common methods for OneHotEncoderEstimator and OneHotEncoderModel */ private[ml] trait OneHotEncoderBase extends Params with HasHandleInvalid with HasInputCols with HasOutputCols { /** - * Param for how to handle invalid data. + * Param for how to handle invalid data during transform(). * Options are 'keep' (invalid data presented as an extra categorical feature) or * 'error' (throw an error). + * Note that this Param is only used during transform; during fitting, invalid data + * will result in an error. * Default: "error" * @group param */ @Since("2.3.0") override val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", - "How to handle invalid data " + + "How to handle invalid data during transform(). " + "Options are 'keep' (invalid data presented as an extra categorical feature) " + - "or error (throw an error).", + "or error (throw an error). Note that this Param is only used during transform; " + + "during fitting, invalid data will result in an error.", ParamValidators.inArray(OneHotEncoderEstimator.supportedHandleInvalids)) setDefault(handleInvalid, OneHotEncoderEstimator.ERROR_INVALID) @@ -66,10 +69,11 @@ private[ml] trait OneHotEncoderBase extends Params with HasHandleInvalid def getDropLast: Boolean = $(dropLast) protected def validateAndTransformSchema( - schema: StructType, dropLast: Boolean, keepInvalid: Boolean): StructType = { + schema: StructType, + dropLast: Boolean, + keepInvalid: Boolean): StructType = { val inputColNames = $(inputCols) val outputColNames = $(outputCols) - val existingFields = schema.fields require(inputColNames.length == outputColNames.length, s"The number of input columns ${inputColNames.length} must be the same as the number of " + @@ -197,6 +201,10 @@ object OneHotEncoderEstimator extends DefaultParamsReadable[OneHotEncoderEstimat override def load(path: String): OneHotEncoderEstimator = super.load(path) } +/** + * @param categorySizes Original number of categories for each feature being encoded. + * The array contains one value for each input column, in order. + */ @Since("2.3.0") class OneHotEncoderModel private[ml] ( @Since("2.3.0") override val uid: String, @@ -205,60 +213,58 @@ class OneHotEncoderModel private[ml] ( import OneHotEncoderModel._ - // Returns the category size for a given index with `dropLast` and `handleInvalid` + // Returns the category size for each index with `dropLast` and `handleInvalid` // taken into account. 
- private def configedCategorySize(orgCategorySize: Int, idx: Int): Int = { + private def getConfigedCategorySizes: Array[Int] = { val dropLast = getDropLast val keepInvalid = getHandleInvalid == OneHotEncoderEstimator.KEEP_INVALID if (!dropLast && keepInvalid) { // When `handleInvalid` is "keep", an extra category is added as last category // for invalid data. - orgCategorySize + 1 + categorySizes.map(_ + 1) } else if (dropLast && !keepInvalid) { // When `dropLast` is true, the last category is removed. - orgCategorySize - 1 + categorySizes.map(_ - 1) } else { // When `dropLast` is true and `handleInvalid` is "keep", the extra category for invalid // data is removed. Thus, it is the same as the plain number of categories. - orgCategorySize + categorySizes } } private def encoder: UserDefinedFunction = { - val oneValue = Array(1.0) - val emptyValues = Array.empty[Double] - val emptyIndices = Array.empty[Int] - val dropLast = getDropLast - val handleInvalid = getHandleInvalid - val keepInvalid = handleInvalid == OneHotEncoderEstimator.KEEP_INVALID + val keepInvalid = getHandleInvalid == OneHotEncoderEstimator.KEEP_INVALID + val configedSizes = getConfigedCategorySizes + val localCategorySizes = categorySizes // The udf performed on input data. The first parameter is the input value. The second - // parameter is the index of input. - udf { (label: Double, idx: Int) => - val plainNumCategories = categorySizes(idx) - val size = configedCategorySize(plainNumCategories, idx) - - if (label < 0) { - throw new SparkException(s"Negative value: $label. Input can't be negative.") - } else if (label == size && dropLast && !keepInvalid) { - // When `dropLast` is true and `handleInvalid` is not "keep", - // the last category is removed. - Vectors.sparse(size, emptyIndices, emptyValues) - } else if (label >= plainNumCategories && keepInvalid) { - // When `handleInvalid` is "keep", encodes invalid data to last category (and removed - // if `dropLast` is true) - if (dropLast) { - Vectors.sparse(size, emptyIndices, emptyValues) + // parameter is the index in inputCols of the column being encoded. + udf { (label: Double, colIdx: Int) => + val origCategorySize = localCategorySizes(colIdx) + // idx: index in vector of the single 1-valued element + val idx = if (label >= 0 && label < origCategorySize) { + label + } else { + if (keepInvalid) { + origCategorySize } else { - Vectors.sparse(size, Array(size - 1), oneValue) + if (label < 0) { + throw new SparkException(s"Negative value: $label. Input can't be negative. " + + s"To handle invalid values, set Param handleInvalid to " + + s"${OneHotEncoderEstimator.KEEP_INVALID}") + } else { + throw new SparkException(s"Unseen value: $label. To handle unseen values, " + + s"set Param handleInvalid to ${OneHotEncoderEstimator.KEEP_INVALID}.") + } } - } else if (label < plainNumCategories) { - Vectors.sparse(size, Array(label.toInt), oneValue) + } + + val size = configedSizes(colIdx) + if (idx < size) { + Vectors.sparse(size, Array(idx.toInt), Array(1.0)) } else { - assert(handleInvalid == OneHotEncoderEstimator.ERROR_INVALID) - throw new SparkException(s"Unseen value: $label. 
To handle unseen values, " + - s"set Param handleInvalid to ${OneHotEncoderEstimator.KEEP_INVALID}.") + Vectors.sparse(size, Array.empty[Int], Array.empty[Double]) } } } @@ -282,7 +288,6 @@ class OneHotEncoderModel private[ml] ( @Since("2.3.0") override def transformSchema(schema: StructType): StructType = { val inputColNames = $(inputCols) - val outputColNames = $(outputCols) require(inputColNames.length == categorySizes.length, s"The number of input columns ${inputColNames.length} must be the same as the number of " + @@ -300,6 +305,7 @@ class OneHotEncoderModel private[ml] ( * account. Mismatched numbers will cause exception. */ private def verifyNumOfValues(schema: StructType): StructType = { + val configedSizes = getConfigedCategorySizes $(outputCols).zipWithIndex.foreach { case (outputColName, idx) => val inputColName = $(inputCols)(idx) val attrGroup = AttributeGroup.fromStructField(schema(outputColName)) @@ -308,9 +314,9 @@ class OneHotEncoderModel private[ml] ( // comparing with expected category number with `handleInvalid` and // `dropLast` taken into account. if (attrGroup.attributes.nonEmpty) { - val numCategories = configedCategorySize(categorySizes(idx), idx) + val numCategories = configedSizes(idx) require(attrGroup.size == numCategories, "OneHotEncoderModel expected " + - s"$numCategories categorical values for input column ${inputColName}, " + + s"$numCategories categorical values for input column $inputColName, " + s"but the input column had metadata specifying ${attrGroup.size} values.") } } @@ -322,7 +328,7 @@ class OneHotEncoderModel private[ml] ( val transformedSchema = transformSchema(dataset.schema, logging = true) val keepInvalid = $(handleInvalid) == OneHotEncoderEstimator.KEEP_INVALID - val encodedColumns = (0 until $(inputCols).length).map { idx => + val encodedColumns = $(inputCols).indices.map { idx => val inputColName = $(inputCols)(idx) val outputColName = $(outputCols)(idx) From 55afac4e7b4f655aa05c5bcaf7851bb1e7699dba Mon Sep 17 00:00:00 2001 From: Gera Shegalov Date: Fri, 5 Jan 2018 17:25:28 -0800 Subject: [PATCH 026/566] [SPARK-22914][DEPLOY] Register history.ui.port ## What changes were proposed in this pull request? Register spark.history.ui.port as a known spark conf to be used in substitution expressions even if it's not set explicitly. ## How was this patch tested? Added unit test to demonstrate the issue Author: Gera Shegalov Author: Gera Shegalov Closes #20098 from gerashegalov/gera/register-SHS-port-conf. 
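As a concrete illustration (a hedged sketch; the hostname placeholder and the address value mirror the unit test added below and are not additional behavior), registering the config lets a history-server address that references the port resolve even when the port is left at its default:

```scala
// Illustrative only: a SparkConf value mixing Hadoop-side and Spark-side
// substitution. Before this change, ${spark.history.ui.port} resolved only if
// the port had been set explicitly; now the registered default (18080) is
// substituted as well.
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.yarn.historyServer.address",
    "http://${hadoopconf-yarn.resourcemanager.hostname}:${spark.history.ui.port}")
```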
(cherry picked from commit ea956833017fcbd8ed2288368bfa2e417a2251c5) Signed-off-by: Marcelo Vanzin --- .../spark/deploy/history/HistoryServer.scala | 3 +- .../apache/spark/deploy/history/config.scala | 5 +++ .../spark/deploy/yarn/ApplicationMaster.scala | 17 +++++--- .../deploy/yarn/ApplicationMasterSuite.scala | 43 +++++++++++++++++++ 4 files changed, 62 insertions(+), 6 deletions(-) create mode 100644 resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ApplicationMasterSuite.scala diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index 75484f5c9f30f..0ec4afad0308c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -28,6 +28,7 @@ import org.eclipse.jetty.servlet.{ServletContextHandler, ServletHolder} import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.deploy.history.config.HISTORY_SERVER_UI_PORT import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.status.api.v1.{ApiRootResource, ApplicationInfo, UIRoot} @@ -276,7 +277,7 @@ object HistoryServer extends Logging { .newInstance(conf) .asInstanceOf[ApplicationHistoryProvider] - val port = conf.getInt("spark.history.ui.port", 18080) + val port = conf.get(HISTORY_SERVER_UI_PORT) val server = new HistoryServer(conf, provider, securityManager, port) server.bind() diff --git a/core/src/main/scala/org/apache/spark/deploy/history/config.scala b/core/src/main/scala/org/apache/spark/deploy/history/config.scala index 22b6d49d8e2a4..efdbf672bb52f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/config.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/config.scala @@ -44,4 +44,9 @@ private[spark] object config { .bytesConf(ByteUnit.BYTE) .createWithDefaultString("10g") + val HISTORY_SERVER_UI_PORT = ConfigBuilder("spark.history.ui.port") + .doc("Web UI port to bind Spark History Server") + .intConf + .createWithDefault(18080) + } diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index b2576b0d72633..4d5e3bb043671 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -427,11 +427,8 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends uiAddress: Option[String]) = { val appId = client.getAttemptId().getApplicationId().toString() val attemptId = client.getAttemptId().getAttemptId().toString() - val historyAddress = - _sparkConf.get(HISTORY_SERVER_ADDRESS) - .map { text => SparkHadoopUtil.get.substituteHadoopVariables(text, yarnConf) } - .map { address => s"${address}${HistoryServer.UI_PATH_PREFIX}/${appId}/${attemptId}" } - .getOrElse("") + val historyAddress = ApplicationMaster + .getHistoryServerAddress(_sparkConf, yarnConf, appId, attemptId) val driverUrl = RpcEndpointAddress( _sparkConf.get("spark.driver.host"), @@ -834,6 +831,16 @@ object ApplicationMaster extends Logging { master.getAttemptId } + private[spark] def getHistoryServerAddress( + sparkConf: SparkConf, + yarnConf: YarnConfiguration, + appId: String, + attemptId: String): String = { 
+ sparkConf.get(HISTORY_SERVER_ADDRESS) + .map { text => SparkHadoopUtil.get.substituteHadoopVariables(text, yarnConf) } + .map { address => s"${address}${HistoryServer.UI_PATH_PREFIX}/${appId}/${attemptId}" } + .getOrElse("") + } } /** diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ApplicationMasterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ApplicationMasterSuite.scala new file mode 100644 index 0000000000000..695a82f3583e6 --- /dev/null +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ApplicationMasterSuite.scala @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.yarn + +import org.apache.hadoop.yarn.conf.YarnConfiguration + +import org.apache.spark.{SparkConf, SparkFunSuite} + +class ApplicationMasterSuite extends SparkFunSuite { + + test("history url with hadoop and spark substitutions") { + val host = "rm.host.com" + val port = 18080 + val sparkConf = new SparkConf() + + sparkConf.set("spark.yarn.historyServer.address", + "http://${hadoopconf-yarn.resourcemanager.hostname}:${spark.history.ui.port}") + val yarnConf = new YarnConfiguration() + yarnConf.set("yarn.resourcemanager.hostname", host) + val appId = "application_123_1" + val attemptId = appId + "_1" + + val shsAddr = ApplicationMaster + .getHistoryServerAddress(sparkConf, yarnConf, appId, attemptId) + + assert(shsAddr === s"http://${host}:${port}/history/${appId}/${attemptId}") + } +} From bf853018cabcd3b3abf84bfe534d2981020b4a71 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Sat, 6 Jan 2018 09:26:03 +0800 Subject: [PATCH 027/566] [SPARK-22937][SQL] SQL elt output binary for binary inputs ## What changes were proposed in this pull request? This pr modified `elt` to output binary for binary inputs. `elt` in the current master always output data as a string. But, in some databases (e.g., MySQL), if all inputs are binary, `elt` also outputs binary (Also, this might be a small surprise). This pr is related to #19977. ## How was this patch tested? Added tests in `SQLQueryTestSuite` and `TypeCoercionSuite`. Author: Takeshi Yamamuro Closes #20135 from maropu/SPARK-22937. 
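For a quick feel of the behavior change, a hedged sketch of what one might observe in spark-shell (the authoritative cases are the SQL tests added below; `spark` is the shell-provided SparkSession):

```scala
// Illustrative only: with the new default (spark.sql.function.eltOutputAsString=false),
// elt over all-binary inputs keeps BinaryType; mixed inputs still coerce to StringType.
val allBinary = spark.sql("SELECT elt(1, encode('a', 'utf-8'), encode('b', 'utf-8')) AS col")
allBinary.schema("col").dataType   // expected: BinaryType

val mixed = spark.sql("SELECT elt(1, 'a', encode('b', 'utf-8')) AS col")
mixed.schema("col").dataType       // expected: StringType

// Setting the flag to true restores the pre-2.3 always-string behavior.
spark.conf.set("spark.sql.function.eltOutputAsString", "true")
```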
(cherry picked from commit e8af7e8aeca15a6107248f358d9514521ffdc6d3) Signed-off-by: gatorsmile --- docs/sql-programming-guide.md | 2 + .../sql/catalyst/analysis/TypeCoercion.scala | 29 +++++ .../expressions/stringExpressions.scala | 46 ++++--- .../apache/spark/sql/internal/SQLConf.scala | 8 ++ .../catalyst/analysis/TypeCoercionSuite.scala | 54 ++++++++ .../inputs/typeCoercion/native/elt.sql | 44 +++++++ .../results/typeCoercion/native/elt.sql.out | 115 ++++++++++++++++++ 7 files changed, 281 insertions(+), 17 deletions(-) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/elt.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/elt.sql.out diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index dc3e384008d27..b50f9360b866c 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1783,6 +1783,8 @@ options. - Since Spark 2.3, when all inputs are binary, `functions.concat()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.concatBinaryAsString` to `true`. + - Since Spark 2.3, when all inputs are binary, SQL `elt()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.eltOutputAsString` to `true`. + ## Upgrading From Spark SQL 2.1 to 2.2 - Spark 2.1.1 introduced a new configuration key: `spark.sql.hive.caseSensitiveInferenceMode`. It had a default setting of `NEVER_INFER`, which kept behavior identical to 2.1.0. However, Spark 2.2.0 changes this setting's default value to `INFER_AND_SAVE` to restore compatibility with reading Hive metastore tables whose underlying file schema have mixed-case column names. With the `INFER_AND_SAVE` configuration value, on first access Spark will perform schema inference on any Hive metastore table for which it has not already saved an inferred schema. Note that schema inference can be a very time consuming operation for tables with thousands of partitions. If compatibility with mixed-case column names is not a concern, you can safely set `spark.sql.hive.caseSensitiveInferenceMode` to `NEVER_INFER` to avoid the initial overhead of schema inference. Note that with the new default `INFER_AND_SAVE` setting, the results of the schema inference are saved as a metastore key for future use. Therefore, the initial schema inference occurs only at a table's first access. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index e9436367c7e2e..e8669c4637d06 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -54,6 +54,7 @@ object TypeCoercion { BooleanEquality :: FunctionArgumentConversion :: ConcatCoercion(conf) :: + EltCoercion(conf) :: CaseWhenCoercion :: IfCoercion :: StackCoercion :: @@ -684,6 +685,34 @@ object TypeCoercion { } } + /** + * Coerces the types of [[Elt]] children to expected ones. + * + * If `spark.sql.function.eltOutputAsString` is false and all children types are binary, + * the expected types are binary. Otherwise, the expected ones are strings. 
+ */ + case class EltCoercion(conf: SQLConf) extends TypeCoercionRule { + + override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan transform { case p => + p transformExpressionsUp { + // Skip nodes if unresolved or not enough children + case c @ Elt(children) if !c.childrenResolved || children.size < 2 => c + case c @ Elt(children) => + val index = children.head + val newIndex = ImplicitTypeCasts.implicitCast(index, IntegerType).getOrElse(index) + val newInputs = if (conf.eltOutputAsString || + !children.tail.map(_.dataType).forall(_ == BinaryType)) { + children.tail.map { e => + ImplicitTypeCasts.implicitCast(e, StringType).getOrElse(e) + } + } else { + children.tail + } + c.copy(children = newIndex +: newInputs) + } + } + } + /** * Turns Add/Subtract of DateType/TimestampType/StringType and CalendarIntervalType * to TimeAdd/TimeSub diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 41dc762154a4c..e004bfc6af473 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -271,33 +271,45 @@ case class ConcatWs(children: Seq[Expression]) } } +/** + * An expression that returns the `n`-th input in given inputs. + * If all inputs are binary, `elt` returns an output as binary. Otherwise, it returns as string. + * If any input is null, `elt` returns null. + */ // scalastyle:off line.size.limit @ExpressionDescription( - usage = "_FUNC_(n, str1, str2, ...) - Returns the `n`-th string, e.g., returns `str2` when `n` is 2.", + usage = "_FUNC_(n, input1, input2, ...) - Returns the `n`-th input, e.g., returns `input2` when `n` is 2.", examples = """ Examples: > SELECT _FUNC_(1, 'scala', 'java'); scala """) // scalastyle:on line.size.limit -case class Elt(children: Seq[Expression]) - extends Expression with ImplicitCastInputTypes { +case class Elt(children: Seq[Expression]) extends Expression { private lazy val indexExpr = children.head - private lazy val stringExprs = children.tail.toArray + private lazy val inputExprs = children.tail.toArray /** This expression is always nullable because it returns null if index is out of range. 
*/ override def nullable: Boolean = true - override def dataType: DataType = StringType - - override def inputTypes: Seq[DataType] = IntegerType +: Seq.fill(children.size - 1)(StringType) + override def dataType: DataType = inputExprs.map(_.dataType).headOption.getOrElse(StringType) override def checkInputDataTypes(): TypeCheckResult = { if (children.size < 2) { TypeCheckResult.TypeCheckFailure("elt function requires at least two arguments") } else { - super[ImplicitCastInputTypes].checkInputDataTypes() + val (indexType, inputTypes) = (indexExpr.dataType, inputExprs.map(_.dataType)) + if (indexType != IntegerType) { + return TypeCheckResult.TypeCheckFailure(s"first input to function $prettyName should " + + s"have IntegerType, but it's $indexType") + } + if (inputTypes.exists(tpe => !Seq(StringType, BinaryType).contains(tpe))) { + return TypeCheckResult.TypeCheckFailure( + s"input to function $prettyName should have StringType or BinaryType, but it's " + + inputTypes.map(_.simpleString).mkString("[", ", ", "]")) + } + TypeUtils.checkForSameTypeInputExpr(inputTypes, s"function $prettyName") } } @@ -307,27 +319,27 @@ case class Elt(children: Seq[Expression]) null } else { val index = indexObj.asInstanceOf[Int] - if (index <= 0 || index > stringExprs.length) { + if (index <= 0 || index > inputExprs.length) { null } else { - stringExprs(index - 1).eval(input) + inputExprs(index - 1).eval(input) } } } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val index = indexExpr.genCode(ctx) - val strings = stringExprs.map(_.genCode(ctx)) + val inputs = inputExprs.map(_.genCode(ctx)) val indexVal = ctx.freshName("index") val indexMatched = ctx.freshName("eltIndexMatched") - val stringVal = ctx.addMutableState(ctx.javaType(dataType), "stringVal") + val inputVal = ctx.addMutableState(ctx.javaType(dataType), "inputVal") - val assignStringValue = strings.zipWithIndex.map { case (eval, index) => + val assignInputValue = inputs.zipWithIndex.map { case (eval, index) => s""" |if ($indexVal == ${index + 1}) { | ${eval.code} - | $stringVal = ${eval.isNull} ? null : ${eval.value}; + | $inputVal = ${eval.isNull} ? null : ${eval.value}; | $indexMatched = true; | continue; |} @@ -335,7 +347,7 @@ case class Elt(children: Seq[Expression]) } val codes = ctx.splitExpressionsWithCurrentInputs( - expressions = assignStringValue, + expressions = assignInputValue, funcName = "eltFunc", extraArguments = ("int", indexVal) :: Nil, returnType = ctx.JAVA_BOOLEAN, @@ -361,11 +373,11 @@ case class Elt(children: Seq[Expression]) |${index.code} |final int $indexVal = ${index.value}; |${ctx.JAVA_BOOLEAN} $indexMatched = false; - |$stringVal = null; + |$inputVal = null; |do { | $codes |} while (false); - |final UTF8String ${ev.value} = $stringVal; + |final ${ctx.javaType(dataType)} ${ev.value} = $inputVal; |final boolean ${ev.isNull} = ${ev.value} == null; """.stripMargin) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 5d6edf6b8abec..80b8965e084a2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1052,6 +1052,12 @@ object SQLConf { .booleanConf .createWithDefault(false) + val ELT_OUTPUT_AS_STRING = buildConf("spark.sql.function.eltOutputAsString") + .doc("When this option is set to false and all inputs are binary, `elt` returns " + + "an output as binary. 
Otherwise, it returns as a string. ") + .booleanConf + .createWithDefault(false) + val CONTINUOUS_STREAMING_EXECUTOR_QUEUE_SIZE = buildConf("spark.sql.streaming.continuous.executorQueueSize") .internal() @@ -1412,6 +1418,8 @@ class SQLConf extends Serializable with Logging { def concatBinaryAsString: Boolean = getConf(CONCAT_BINARY_AS_STRING) + def eltOutputAsString: Boolean = getConf(ELT_OUTPUT_AS_STRING) + def partitionOverwriteMode: PartitionOverwriteMode.Value = PartitionOverwriteMode.withName(getConf(PARTITION_OVERWRITE_MODE)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala index 3661530cd622b..52a7ebdafd7c7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala @@ -923,6 +923,60 @@ class TypeCoercionSuite extends AnalysisTest { } } + test("type coercion for Elt") { + val rule = TypeCoercion.EltCoercion(conf) + + ruleTest(rule, + Elt(Seq(Literal(1), Literal("ab"), Literal("cde"))), + Elt(Seq(Literal(1), Literal("ab"), Literal("cde")))) + ruleTest(rule, + Elt(Seq(Literal(1.toShort), Literal("ab"), Literal("cde"))), + Elt(Seq(Cast(Literal(1.toShort), IntegerType), Literal("ab"), Literal("cde")))) + ruleTest(rule, + Elt(Seq(Literal(2), Literal(null), Literal("abc"))), + Elt(Seq(Literal(2), Cast(Literal(null), StringType), Literal("abc")))) + ruleTest(rule, + Elt(Seq(Literal(2), Literal(1), Literal("234"))), + Elt(Seq(Literal(2), Cast(Literal(1), StringType), Literal("234")))) + ruleTest(rule, + Elt(Seq(Literal(3), Literal(1L), Literal(2.toByte), Literal(0.1))), + Elt(Seq(Literal(3), Cast(Literal(1L), StringType), Cast(Literal(2.toByte), StringType), + Cast(Literal(0.1), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(2), Literal(true), Literal(0.1f), Literal(3.toShort))), + Elt(Seq(Literal(2), Cast(Literal(true), StringType), Cast(Literal(0.1f), StringType), + Cast(Literal(3.toShort), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(1), Literal(1L), Literal(0.1))), + Elt(Seq(Literal(1), Cast(Literal(1L), StringType), Cast(Literal(0.1), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(1), Literal(Decimal(10)))), + Elt(Seq(Literal(1), Cast(Literal(Decimal(10)), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(1), Literal(BigDecimal.valueOf(10)))), + Elt(Seq(Literal(1), Cast(Literal(BigDecimal.valueOf(10)), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(1), Literal(java.math.BigDecimal.valueOf(10)))), + Elt(Seq(Literal(1), Cast(Literal(java.math.BigDecimal.valueOf(10)), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(2), Literal(new java.sql.Date(0)), Literal(new Timestamp(0)))), + Elt(Seq(Literal(2), Cast(Literal(new java.sql.Date(0)), StringType), + Cast(Literal(new Timestamp(0)), StringType)))) + + withSQLConf("spark.sql.function.eltOutputAsString" -> "true") { + ruleTest(rule, + Elt(Seq(Literal(1), Literal("123".getBytes), Literal("456".getBytes))), + Elt(Seq(Literal(1), Cast(Literal("123".getBytes), StringType), + Cast(Literal("456".getBytes), StringType)))) + } + + withSQLConf("spark.sql.function.eltOutputAsString" -> "false") { + ruleTest(rule, + Elt(Seq(Literal(1), Literal("123".getBytes), Literal("456".getBytes))), + Elt(Seq(Literal(1), Literal("123".getBytes), Literal("456".getBytes)))) + } + } + test("BooleanEquality type cast") { val be = TypeCoercion.BooleanEquality // Use 
something more than a literal to avoid triggering the simplification rules. diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/elt.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/elt.sql new file mode 100644 index 0000000000000..717616f91db05 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/elt.sql @@ -0,0 +1,44 @@ +-- Mixed inputs (output type is string) +SELECT elt(2, col1, col2, col3, col4, col5) col +FROM ( + SELECT + 'prefix_' col1, + id col2, + string(id + 1) col3, + encode(string(id + 2), 'utf-8') col4, + CAST(id AS DOUBLE) col5 + FROM range(10) +); + +SELECT elt(3, col1, col2, col3, col4) col +FROM ( + SELECT + string(id) col1, + string(id + 1) col2, + encode(string(id + 2), 'utf-8') col3, + encode(string(id + 3), 'utf-8') col4 + FROM range(10) +); + +-- turn on eltOutputAsString +set spark.sql.function.eltOutputAsString=true; + +SELECT elt(1, col1, col2) col +FROM ( + SELECT + encode(string(id), 'utf-8') col1, + encode(string(id + 1), 'utf-8') col2 + FROM range(10) +); + +-- turn off eltOutputAsString +set spark.sql.function.eltOutputAsString=false; + +-- Elt binary inputs (output type is binary) +SELECT elt(2, col1, col2) col +FROM ( + SELECT + encode(string(id), 'utf-8') col1, + encode(string(id + 1), 'utf-8') col2 + FROM range(10) +); diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/elt.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/elt.sql.out new file mode 100644 index 0000000000000..b62e1b6826045 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/elt.sql.out @@ -0,0 +1,115 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 6 + + +-- !query 0 +SELECT elt(2, col1, col2, col3, col4, col5) col +FROM ( + SELECT + 'prefix_' col1, + id col2, + string(id + 1) col3, + encode(string(id + 2), 'utf-8') col4, + CAST(id AS DOUBLE) col5 + FROM range(10) +) +-- !query 0 schema +struct +-- !query 0 output +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 + + +-- !query 1 +SELECT elt(3, col1, col2, col3, col4) col +FROM ( + SELECT + string(id) col1, + string(id + 1) col2, + encode(string(id + 2), 'utf-8') col3, + encode(string(id + 3), 'utf-8') col4 + FROM range(10) +) +-- !query 1 schema +struct +-- !query 1 output +10 +11 +2 +3 +4 +5 +6 +7 +8 +9 + + +-- !query 2 +set spark.sql.function.eltOutputAsString=true +-- !query 2 schema +struct +-- !query 2 output +spark.sql.function.eltOutputAsString true + + +-- !query 3 +SELECT elt(1, col1, col2) col +FROM ( + SELECT + encode(string(id), 'utf-8') col1, + encode(string(id + 1), 'utf-8') col2 + FROM range(10) +) +-- !query 3 schema +struct +-- !query 3 output +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 + + +-- !query 4 +set spark.sql.function.eltOutputAsString=false +-- !query 4 schema +struct +-- !query 4 output +spark.sql.function.eltOutputAsString false + + +-- !query 5 +SELECT elt(2, col1, col2) col +FROM ( + SELECT + encode(string(id), 'utf-8') col1, + encode(string(id + 1), 'utf-8') col2 + FROM range(10) +) +-- !query 5 schema +struct +-- !query 5 output +1 +10 +2 +3 +4 +5 +6 +7 +8 +9 From 3e3e9386ed95435a2d1817653d1402c102e380dc Mon Sep 17 00:00:00 2001 From: Yinan Li Date: Fri, 5 Jan 2018 17:29:27 -0800 Subject: [PATCH 028/566] [SPARK-22960][K8S] Revert use of ARG base_image in images ## What changes were proposed in this pull request? 
This PR reverts the `ARG base_image` before `FROM` in the images of driver, executor, and init-container, introduced in https://github.com/apache/spark/pull/20154. The reason is Docker versions before 17.06 do not support this use (`ARG` before `FROM`). ## How was this patch tested? Tested manually. vanzin foxish kimoonkim Author: Yinan Li Closes #20170 from liyinan926/master. (cherry picked from commit bf65cd3cda46d5480bfcd13110975c46ca631972) Signed-off-by: Marcelo Vanzin --- .../docker/src/main/dockerfiles/driver/Dockerfile | 3 +-- .../docker/src/main/dockerfiles/executor/Dockerfile | 3 +-- .../docker/src/main/dockerfiles/init-container/Dockerfile | 3 +-- sbin/build-push-docker-images.sh | 8 ++++---- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile index ff5289e10c21e..45fbcd9cd0deb 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile @@ -15,8 +15,7 @@ # limitations under the License. # -ARG base_image -FROM ${base_image} +FROM spark-base # Before building the docker image, first build and make a Spark distribution following # the instructions in http://spark.apache.org/docs/latest/building-spark.html. diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile index 3eabb42d4d852..0f806cf7e148e 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile @@ -15,8 +15,7 @@ # limitations under the License. # -ARG base_image -FROM ${base_image} +FROM spark-base # Before building the docker image, first build and make a Spark distribution following # the instructions in http://spark.apache.org/docs/latest/building-spark.html. diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile index e0a249e0ac71f..047056ab2633b 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile @@ -15,8 +15,7 @@ # limitations under the License. # -ARG base_image -FROM ${base_image} +FROM spark-base # If this docker file is being used in the context of building your images from a Spark distribution, the docker build # command should be invoked from the top level directory of the Spark distribution. E.g.: diff --git a/sbin/build-push-docker-images.sh b/sbin/build-push-docker-images.sh index bb8806dd33f37..b9532597419a5 100755 --- a/sbin/build-push-docker-images.sh +++ b/sbin/build-push-docker-images.sh @@ -60,13 +60,13 @@ function image_ref { } function build { - local base_image="$(image_ref spark-base 0)" - docker build --build-arg "spark_jars=$SPARK_JARS" \ + docker build \ + --build-arg "spark_jars=$SPARK_JARS" \ --build-arg "img_path=$IMG_PATH" \ - -t "$base_image" \ + -t spark-base \ -f "$IMG_PATH/spark-base/Dockerfile" . for image in "${!path[@]}"; do - docker build --build-arg "base_image=$base_image" -t "$(image_ref $image)" -f ${path[$image]} . + docker build -t "$(image_ref $image)" -f ${path[$image]} . 
done } From 7236914e5e7aeb4eb919530b6edbad70256cca52 Mon Sep 17 00:00:00 2001 From: Li Jin Date: Sat, 6 Jan 2018 16:11:20 +0800 Subject: [PATCH 029/566] [SPARK-22930][PYTHON][SQL] Improve the description of Vectorized UDFs for non-deterministic cases ## What changes were proposed in this pull request? Add tests for using non deterministic UDFs in aggregate. Update pandas_udf docstring w.r.t to determinism. ## How was this patch tested? test_nondeterministic_udf_in_aggregate Author: Li Jin Closes #20142 from icexelloss/SPARK-22930-pandas-udf-deterministic. (cherry picked from commit f2dd8b923759e8771b0e5f59bfa7ae4ad7e6a339) Signed-off-by: gatorsmile --- python/pyspark/sql/functions.py | 12 +++++++- python/pyspark/sql/tests.py | 52 +++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index a4ed562ad48b4..733e32bd825b0 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2214,7 +2214,17 @@ def pandas_udf(f=None, returnType=None, functionType=None): .. seealso:: :meth:`pyspark.sql.GroupedData.apply` - .. note:: The user-defined function must be deterministic. + .. note:: The user-defined functions are considered deterministic by default. Due to + optimization, duplicate invocations may be eliminated or the function may even be invoked + more times than it is present in the query. If your function is not deterministic, call + `asNondeterministic` on the user defined function. E.g.: + + >>> @pandas_udf('double', PandasUDFType.SCALAR) # doctest: +SKIP + ... def random(v): + ... import numpy as np + ... import pandas as pd + ... return pd.Series(np.random.randn(len(v)) + >>> random = random.asNondeterministic() # doctest: +SKIP .. note:: The user-defined functions do not support conditional expressions or short curcuiting in boolean expressions and it ends up with being executed all internally. 
If the functions diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 6dc767f9ec46e..689736d8e6456 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -386,6 +386,7 @@ def test_udf3(self): self.assertEqual(row[0], 5) def test_nondeterministic_udf(self): + # Test that nondeterministic UDFs are evaluated only once in chained UDF evaluations from pyspark.sql.functions import udf import random udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic() @@ -413,6 +414,18 @@ def test_nondeterministic_udf2(self): pydoc.render_doc(random_udf) pydoc.render_doc(random_udf1) + def test_nondeterministic_udf_in_aggregate(self): + from pyspark.sql.functions import udf, sum + import random + udf_random_col = udf(lambda: int(100 * random.random()), 'int').asNondeterministic() + df = self.spark.range(10) + + with QuietTest(self.sc): + with self.assertRaisesRegexp(AnalysisException, "nondeterministic"): + df.groupby('id').agg(sum(udf_random_col())).collect() + with self.assertRaisesRegexp(AnalysisException, "nondeterministic"): + df.agg(sum(udf_random_col())).collect() + def test_chained_udf(self): self.spark.catalog.registerFunction("double", lambda x: x + x, IntegerType()) [row] = self.spark.sql("SELECT double(1)").collect() @@ -3567,6 +3580,18 @@ def tearDownClass(cls): time.tzset() ReusedSQLTestCase.tearDownClass() + @property + def random_udf(self): + from pyspark.sql.functions import pandas_udf + + @pandas_udf('double') + def random_udf(v): + import pandas as pd + import numpy as np + return pd.Series(np.random.random(len(v))) + random_udf = random_udf.asNondeterministic() + return random_udf + def test_vectorized_udf_basic(self): from pyspark.sql.functions import pandas_udf, col df = self.spark.range(10).select( @@ -3950,6 +3975,33 @@ def test_vectorized_udf_timestamps_respect_session_timezone(self): finally: self.spark.conf.set("spark.sql.session.timeZone", orig_tz) + def test_nondeterministic_udf(self): + # Test that nondeterministic UDFs are evaluated only once in chained UDF evaluations + from pyspark.sql.functions import udf, pandas_udf, col + + @pandas_udf('double') + def plus_ten(v): + return v + 10 + random_udf = self.random_udf + + df = self.spark.range(10).withColumn('rand', random_udf(col('id'))) + result1 = df.withColumn('plus_ten(rand)', plus_ten(df['rand'])).toPandas() + + self.assertEqual(random_udf.deterministic, False) + self.assertTrue(result1['plus_ten(rand)'].equals(result1['rand'] + 10)) + + def test_nondeterministic_udf_in_aggregate(self): + from pyspark.sql.functions import pandas_udf, sum + + df = self.spark.range(10) + random_udf = self.random_udf + + with QuietTest(self.sc): + with self.assertRaisesRegexp(AnalysisException, 'nondeterministic'): + df.groupby(df.id).agg(sum(random_udf(df.id))).collect() + with self.assertRaisesRegexp(AnalysisException, 'nondeterministic'): + df.agg(sum(random_udf(df.id))).collect() + @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") class GroupbyApplyTests(ReusedSQLTestCase): From e6449e8167776e3921c286d75e8cdd30ee33d77a Mon Sep 17 00:00:00 2001 From: zuotingbing Date: Sat, 6 Jan 2018 18:07:45 +0800 Subject: [PATCH 030/566] [SPARK-22793][SQL] Memory leak in Spark Thrift Server # What changes were proposed in this pull request? 1. Start HiveThriftServer2. 2. Connect to thriftserver through beeline. 3. Close the beeline. 4. repeat step2 and step 3 for many times. 
we found there are many directories never be dropped under the path `hive.exec.local.scratchdir` and `hive.exec.scratchdir`, as we know the scratchdir has been added to deleteOnExit when it be created. So it means that the cache size of FileSystem `deleteOnExit` will keep increasing until JVM terminated. In addition, we use `jmap -histo:live [PID]` to printout the size of objects in HiveThriftServer2 Process, we can find the object `org.apache.spark.sql.hive.client.HiveClientImpl` and `org.apache.hadoop.hive.ql.session.SessionState` keep increasing even though we closed all the beeline connections, which may caused the leak of Memory. # How was this patch tested? manual tests This PR follw-up the https://github.com/apache/spark/pull/19989 Author: zuotingbing Closes #20029 from zuotingbing/SPARK-22793. (cherry picked from commit be9a804f2ef77a5044d3da7d9374976daf59fc16) Signed-off-by: gatorsmile --- .../org/apache/spark/sql/hive/HiveSessionStateBuilder.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala index 92cb4ef11c9e3..dc92ad3b0c1ac 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala @@ -42,7 +42,7 @@ class HiveSessionStateBuilder(session: SparkSession, parentState: Option[Session * Create a Hive aware resource loader. */ override protected lazy val resourceLoader: HiveSessionResourceLoader = { - val client: HiveClient = externalCatalog.client.newSession() + val client: HiveClient = externalCatalog.client new HiveSessionResourceLoader(session, client) } From 0377755985897c850dcf3a938520e5c10de4040a Mon Sep 17 00:00:00 2001 From: fjh100456 Date: Sat, 6 Jan 2018 18:19:57 +0800 Subject: [PATCH 031/566] [SPARK-21786][SQL] When acquiring 'compressionCodecClassName' in 'ParquetOptions', `parquet.compression` needs to be considered. [SPARK-21786][SQL] When acquiring 'compressionCodecClassName' in 'ParquetOptions', `parquet.compression` needs to be considered. ## What changes were proposed in this pull request? Since Hive 1.1, Hive allows users to set parquet compression codec via table-level properties parquet.compression. See the JIRA: https://issues.apache.org/jira/browse/HIVE-7858 . We do support orc.compression for ORC. Thus, for external users, it is more straightforward to support both. See the stackflow question: https://stackoverflow.com/questions/36941122/spark-sql-ignores-parquet-compression-propertie-specified-in-tblproperties In Spark side, our table-level compression conf compression was added by #11464 since Spark 2.0. We need to support both table-level conf. Users might also use session-level conf spark.sql.parquet.compression.codec. The priority rule will be like If other compression codec configuration was found through hive or parquet, the precedence would be compression, parquet.compression, spark.sql.parquet.compression.codec. Acceptable values include: none, uncompressed, snappy, gzip, lzo. The rule for Parquet is consistent with the ORC after the change. Changes: 1.Increased acquiring 'compressionCodecClassName' from `parquet.compression`,and the precedence order is `compression`,`parquet.compression`,`spark.sql.parquet.compression.codec`, just like what we do in `OrcOptions`. 
2.Change `spark.sql.parquet.compression.codec` to support "none".Actually in `ParquetOptions`,we do support "none" as equivalent to "uncompressed", but it does not allowed to configured to "none". 3.Change `compressionCode` to `compressionCodecClassName`. ## How was this patch tested? Add test. Author: fjh100456 Closes #20076 from fjh100456/ParquetOptionIssue. (cherry picked from commit 7b78041423b6ee330def2336dfd1ff9ae8469c59) Signed-off-by: gatorsmile --- docs/sql-programming-guide.md | 6 +- .../apache/spark/sql/internal/SQLConf.scala | 14 +- .../datasources/parquet/ParquetOptions.scala | 12 +- ...rquetCompressionCodecPrecedenceSuite.scala | 122 ++++++++++++++++++ 4 files changed, 145 insertions(+), 9 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompressionCodecPrecedenceSuite.scala diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index b50f9360b866c..3ccaaf4d5b1fa 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -953,8 +953,10 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession spark.sql.parquet.compression.codec snappy - Sets the compression codec use when writing Parquet files. Acceptable values include: - uncompressed, snappy, gzip, lzo. + Sets the compression codec used when writing Parquet files. If either `compression` or + `parquet.compression` is specified in the table-specific options/properties, the precedence would be + `compression`, `parquet.compression`, `spark.sql.parquet.compression.codec`. Acceptable values include: + none, uncompressed, snappy, gzip, lzo. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 80b8965e084a2..7d1217de254a2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -325,11 +325,13 @@ object SQLConf { .createWithDefault(false) val PARQUET_COMPRESSION = buildConf("spark.sql.parquet.compression.codec") - .doc("Sets the compression codec use when writing Parquet files. Acceptable values include: " + - "uncompressed, snappy, gzip, lzo.") + .doc("Sets the compression codec used when writing Parquet files. If either `compression` or" + + "`parquet.compression` is specified in the table-specific options/properties, the precedence" + + "would be `compression`, `parquet.compression`, `spark.sql.parquet.compression.codec`." + + "Acceptable values include: none, uncompressed, snappy, gzip, lzo.") .stringConf .transform(_.toLowerCase(Locale.ROOT)) - .checkValues(Set("uncompressed", "snappy", "gzip", "lzo")) + .checkValues(Set("none", "uncompressed", "snappy", "gzip", "lzo")) .createWithDefault("snappy") val PARQUET_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.parquet.filterPushdown") @@ -366,8 +368,10 @@ object SQLConf { .createWithDefault(true) val ORC_COMPRESSION = buildConf("spark.sql.orc.compression.codec") - .doc("Sets the compression codec use when writing ORC files. Acceptable values include: " + - "none, uncompressed, snappy, zlib, lzo.") + .doc("Sets the compression codec used when writing ORC files. If either `compression` or" + + "`orc.compress` is specified in the table-specific options/properties, the precedence" + + "would be `compression`, `orc.compress`, `spark.sql.orc.compression.codec`." 
+ + "Acceptable values include: none, uncompressed, snappy, zlib, lzo.") .stringConf .transform(_.toLowerCase(Locale.ROOT)) .checkValues(Set("none", "uncompressed", "snappy", "zlib", "lzo")) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala index 772d4565de548..ef67ea7d17cea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.parquet import java.util.Locale +import org.apache.parquet.hadoop.ParquetOutputFormat import org.apache.parquet.hadoop.metadata.CompressionCodecName import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap @@ -42,8 +43,15 @@ private[parquet] class ParquetOptions( * Acceptable values are defined in [[shortParquetCompressionCodecNames]]. */ val compressionCodecClassName: String = { - val codecName = parameters.getOrElse("compression", - sqlConf.parquetCompressionCodec).toLowerCase(Locale.ROOT) + // `compression`, `parquet.compression`(i.e., ParquetOutputFormat.COMPRESSION), and + // `spark.sql.parquet.compression.codec` + // are in order of precedence from highest to lowest. + val parquetCompressionConf = parameters.get(ParquetOutputFormat.COMPRESSION) + val codecName = parameters + .get("compression") + .orElse(parquetCompressionConf) + .getOrElse(sqlConf.parquetCompressionCodec) + .toLowerCase(Locale.ROOT) if (!shortParquetCompressionCodecNames.contains(codecName)) { val availableCodecs = shortParquetCompressionCodecNames.keys.map(_.toLowerCase(Locale.ROOT)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompressionCodecPrecedenceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompressionCodecPrecedenceSuite.scala new file mode 100644 index 0000000000000..ed8fd2b453456 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompressionCodecPrecedenceSuite.scala @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.parquet + +import java.io.File + +import scala.collection.JavaConverters._ + +import org.apache.hadoop.fs.Path +import org.apache.parquet.hadoop.ParquetOutputFormat + +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSQLContext + +class ParquetCompressionCodecPrecedenceSuite extends ParquetTest with SharedSQLContext { + test("Test `spark.sql.parquet.compression.codec` config") { + Seq("NONE", "UNCOMPRESSED", "SNAPPY", "GZIP", "LZO").foreach { c => + withSQLConf(SQLConf.PARQUET_COMPRESSION.key -> c) { + val expected = if (c == "NONE") "UNCOMPRESSED" else c + val option = new ParquetOptions(Map.empty[String, String], spark.sessionState.conf) + assert(option.compressionCodecClassName == expected) + } + } + } + + test("[SPARK-21786] Test Acquiring 'compressionCodecClassName' for parquet in right order.") { + // When "compression" is configured, it should be the first choice. + withSQLConf(SQLConf.PARQUET_COMPRESSION.key -> "snappy") { + val props = Map("compression" -> "uncompressed", ParquetOutputFormat.COMPRESSION -> "gzip") + val option = new ParquetOptions(props, spark.sessionState.conf) + assert(option.compressionCodecClassName == "UNCOMPRESSED") + } + + // When "compression" is not configured, "parquet.compression" should be the preferred choice. + withSQLConf(SQLConf.PARQUET_COMPRESSION.key -> "snappy") { + val props = Map(ParquetOutputFormat.COMPRESSION -> "gzip") + val option = new ParquetOptions(props, spark.sessionState.conf) + assert(option.compressionCodecClassName == "GZIP") + } + + // When both "compression" and "parquet.compression" are not configured, + // spark.sql.parquet.compression.codec should be the right choice. + withSQLConf(SQLConf.PARQUET_COMPRESSION.key -> "snappy") { + val props = Map.empty[String, String] + val option = new ParquetOptions(props, spark.sessionState.conf) + assert(option.compressionCodecClassName == "SNAPPY") + } + } + + private def getTableCompressionCodec(path: String): Seq[String] = { + val hadoopConf = spark.sessionState.newHadoopConf() + val codecs = for { + footer <- readAllFootersWithoutSummaryFiles(new Path(path), hadoopConf) + block <- footer.getParquetMetadata.getBlocks.asScala + column <- block.getColumns.asScala + } yield column.getCodec.name() + codecs.distinct + } + + private def createTableWithCompression( + tableName: String, + isPartitioned: Boolean, + compressionCodec: String, + rootDir: File): Unit = { + val options = + s""" + |OPTIONS('path'='${rootDir.toURI.toString.stripSuffix("/")}/$tableName', + |'parquet.compression'='$compressionCodec') + """.stripMargin + val partitionCreate = if (isPartitioned) "PARTITIONED BY (p)" else "" + sql( + s""" + |CREATE TABLE $tableName USING Parquet $options $partitionCreate + |AS SELECT 1 AS col1, 2 AS p + """.stripMargin) + } + + private def checkCompressionCodec(compressionCodec: String, isPartitioned: Boolean): Unit = { + withTempDir { tmpDir => + val tempTableName = "TempParquetTable" + withTable(tempTableName) { + createTableWithCompression(tempTableName, isPartitioned, compressionCodec, tmpDir) + val partitionPath = if (isPartitioned) "p=2" else "" + val path = s"${tmpDir.getPath.stripSuffix("/")}/$tempTableName/$partitionPath" + val realCompressionCodecs = getTableCompressionCodec(path) + assert(realCompressionCodecs.forall(_ == compressionCodec)) + } + } + } + + test("Create parquet table with compression") { + Seq(true, false).foreach { isPartitioned => + Seq("UNCOMPRESSED", "SNAPPY", 
"GZIP").foreach { compressionCodec => + checkCompressionCodec(compressionCodec, isPartitioned) + } + } + } + + test("Create table with unknown compression") { + Seq(true, false).foreach { isPartitioned => + val exception = intercept[IllegalArgumentException] { + checkCompressionCodec("aa", isPartitioned) + } + assert(exception.getMessage.contains("Codec [aa] is not available")) + } + } +} From b66700a5ed9a6e31433bec7961361c382a8b4162 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 6 Jan 2018 23:08:26 +0800 Subject: [PATCH 032/566] [SPARK-22901][PYTHON][FOLLOWUP] Adds the doc for asNondeterministic for wrapped UDF function ## What changes were proposed in this pull request? This PR wraps the `asNondeterministic` attribute in the wrapped UDF function to set the docstring properly. ```python from pyspark.sql.functions import udf help(udf(lambda x: x).asNondeterministic) ``` Before: ``` Help on function in module pyspark.sql.udf: lambda (END ``` After: ``` Help on function asNondeterministic in module pyspark.sql.udf: asNondeterministic() Updates UserDefinedFunction to nondeterministic. .. versionadded:: 2.3 (END) ``` ## How was this patch tested? Manually tested and a simple test was added. Author: hyukjinkwon Closes #20173 from HyukjinKwon/SPARK-22901-followup. --- python/pyspark/sql/tests.py | 1 + python/pyspark/sql/udf.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 689736d8e6456..122a65b83aef9 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -413,6 +413,7 @@ def test_nondeterministic_udf2(self): pydoc.render_doc(udf(lambda: random.randint(6, 6), IntegerType())) pydoc.render_doc(random_udf) pydoc.render_doc(random_udf1) + pydoc.render_doc(udf(lambda x: x).asNondeterministic) def test_nondeterministic_udf_in_aggregate(self): from pyspark.sql.functions import udf, sum diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 5e75eb6545333..5e80ab9165867 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -169,8 +169,8 @@ def wrapper(*args): wrapper.returnType = self.returnType wrapper.evalType = self.evalType wrapper.deterministic = self.deterministic - wrapper.asNondeterministic = lambda: self.asNondeterministic()._wrapped() - + wrapper.asNondeterministic = functools.wraps( + self.asNondeterministic)(lambda: self.asNondeterministic()._wrapped()) return wrapper def asNondeterministic(self): From f9e7b0c8aa9334fa2e467b26516ac8e54a51dc63 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Sun, 7 Jan 2018 00:19:21 +0800 Subject: [PATCH 033/566] [HOTFIX] Fix style checking failure ## What changes were proposed in this pull request? This PR is to fix the style checking failure. ## How was this patch tested? N/A Author: gatorsmile Closes #20175 from gatorsmile/stylefix. 
(cherry picked from commit 9a7048b2889bd0fd66e68a0ce3e07e466315a051) Signed-off-by: gatorsmile --- .../org/apache/spark/sql/internal/SQLConf.scala | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 7d1217de254a2..5c61f10bb71ad 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -325,10 +325,11 @@ object SQLConf { .createWithDefault(false) val PARQUET_COMPRESSION = buildConf("spark.sql.parquet.compression.codec") - .doc("Sets the compression codec used when writing Parquet files. If either `compression` or" + - "`parquet.compression` is specified in the table-specific options/properties, the precedence" + - "would be `compression`, `parquet.compression`, `spark.sql.parquet.compression.codec`." + - "Acceptable values include: none, uncompressed, snappy, gzip, lzo.") + .doc("Sets the compression codec used when writing Parquet files. If either `compression` or " + + "`parquet.compression` is specified in the table-specific options/properties, the " + + "precedence would be `compression`, `parquet.compression`, " + + "`spark.sql.parquet.compression.codec`. Acceptable values include: none, uncompressed, " + + "snappy, gzip, lzo.") .stringConf .transform(_.toLowerCase(Locale.ROOT)) .checkValues(Set("none", "uncompressed", "snappy", "gzip", "lzo")) @@ -368,8 +369,8 @@ object SQLConf { .createWithDefault(true) val ORC_COMPRESSION = buildConf("spark.sql.orc.compression.codec") - .doc("Sets the compression codec used when writing ORC files. If either `compression` or" + - "`orc.compress` is specified in the table-specific options/properties, the precedence" + + .doc("Sets the compression codec used when writing ORC files. If either `compression` or " + + "`orc.compress` is specified in the table-specific options/properties, the precedence " + "would be `compression`, `orc.compress`, `spark.sql.orc.compression.codec`." + "Acceptable values include: none, uncompressed, snappy, zlib, lzo.") .stringConf From 285d342c406cf304931844665e56725ef1a848e0 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Sun, 7 Jan 2018 13:42:01 +0800 Subject: [PATCH 034/566] [SPARK-22973][SQL] Fix incorrect results of Casting Map to String ## What changes were proposed in this pull request? This pr fixed the issue when casting maps into strings; ``` scala> Seq(Map(1 -> "a", 2 -> "b")).toDF("a").write.saveAsTable("t") scala> sql("SELECT cast(a as String) FROM t").show(false) +----------------------------------------------------------------+ |a | +----------------------------------------------------------------+ |org.apache.spark.sql.catalyst.expressions.UnsafeMapData38bdd75d| +----------------------------------------------------------------+ ``` This pr modified the result into; ``` +----------------+ |a | +----------------+ |[1 -> a, 2 -> b]| +----------------+ ``` ## How was this patch tested? Added tests in `CastSuite`. Author: Takeshi Yamamuro Closes #20166 from maropu/SPARK-22973. 
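The formatting produced by the new cast path can be summarized with a standalone sketch; the helper below is illustrative only (it is not the actual `Cast` code, which builds `UTF8String`s), but it mirrors the output format the patch and its tests expect, including the `k ->` rendering of null values:

```scala
// Simplified model of Cast(map, StringType) output: "[k1 -> v1, k2 -> v2]",
// with a null value rendered as "k ->" (no trailing space).
object MapCastFormat {
  def format(entries: Seq[(Any, Any)]): String = {
    val builder = new StringBuilder("[")
    entries.zipWithIndex.foreach { case ((k, v), i) =>
      if (i > 0) builder.append(", ")
      builder.append(k.toString).append(" ->")
      if (v != null) builder.append(" ").append(v.toString)
    }
    builder.append("]").toString
  }

  def main(args: Array[String]): Unit = {
    println(format(Seq(1 -> "a", 2 -> "b")))                  // [1 -> a, 2 -> b]
    println(format(Seq("1" -> "a", "2" -> null, "3" -> "c"))) // [1 -> a, 2 ->, 3 -> c]
  }
}
```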
(cherry picked from commit 18e94149992618a2b4e6f0fd3b3f4594e1745224) Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/expressions/Cast.scala | 89 +++++++++++++++++++ .../sql/catalyst/expressions/CastSuite.scala | 28 ++++++ 2 files changed, 117 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index d4fc5e0f168a7..f2de4c8e30bec 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -228,6 +228,37 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String builder.append("]") builder.build() }) + case MapType(kt, vt, _) => + buildCast[MapData](_, map => { + val builder = new UTF8StringBuilder + builder.append("[") + if (map.numElements > 0) { + val keyArray = map.keyArray() + val valueArray = map.valueArray() + val keyToUTF8String = castToString(kt) + val valueToUTF8String = castToString(vt) + builder.append(keyToUTF8String(keyArray.get(0, kt)).asInstanceOf[UTF8String]) + builder.append(" ->") + if (!valueArray.isNullAt(0)) { + builder.append(" ") + builder.append(valueToUTF8String(valueArray.get(0, vt)).asInstanceOf[UTF8String]) + } + var i = 1 + while (i < map.numElements) { + builder.append(", ") + builder.append(keyToUTF8String(keyArray.get(i, kt)).asInstanceOf[UTF8String]) + builder.append(" ->") + if (!valueArray.isNullAt(i)) { + builder.append(" ") + builder.append(valueToUTF8String(valueArray.get(i, vt)) + .asInstanceOf[UTF8String]) + } + i += 1 + } + } + builder.append("]") + builder.build() + }) case _ => buildCast[Any](_, o => UTF8String.fromString(o.toString)) } @@ -654,6 +685,53 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String """.stripMargin } + private def writeMapToStringBuilder( + kt: DataType, + vt: DataType, + map: String, + buffer: String, + ctx: CodegenContext): String = { + + def dataToStringFunc(func: String, dataType: DataType) = { + val funcName = ctx.freshName(func) + val dataToStringCode = castToStringCode(dataType, ctx) + ctx.addNewFunction(funcName, + s""" + |private UTF8String $funcName(${ctx.javaType(dataType)} data) { + | UTF8String dataStr = null; + | ${dataToStringCode("data", "dataStr", null /* resultIsNull won't be used */)} + | return dataStr; + |} + """.stripMargin) + } + + val keyToStringFunc = dataToStringFunc("keyToString", kt) + val valueToStringFunc = dataToStringFunc("valueToString", vt) + val loopIndex = ctx.freshName("loopIndex") + s""" + |$buffer.append("["); + |if ($map.numElements() > 0) { + | $buffer.append($keyToStringFunc(${ctx.getValue(s"$map.keyArray()", kt, "0")})); + | $buffer.append(" ->"); + | if (!$map.valueArray().isNullAt(0)) { + | $buffer.append(" "); + | $buffer.append($valueToStringFunc(${ctx.getValue(s"$map.valueArray()", vt, "0")})); + | } + | for (int $loopIndex = 1; $loopIndex < $map.numElements(); $loopIndex++) { + | $buffer.append(", "); + | $buffer.append($keyToStringFunc(${ctx.getValue(s"$map.keyArray()", kt, loopIndex)})); + | $buffer.append(" ->"); + | if (!$map.valueArray().isNullAt($loopIndex)) { + | $buffer.append(" "); + | $buffer.append($valueToStringFunc( + | ${ctx.getValue(s"$map.valueArray()", vt, loopIndex)})); + | } + | } + |} + |$buffer.append("]"); + """.stripMargin + } + private[this] def castToStringCode(from: DataType, ctx: CodegenContext): CastFunction = { from match { case 
BinaryType => @@ -676,6 +754,17 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String |$evPrim = $buffer.build(); """.stripMargin } + case MapType(kt, vt, _) => + (c, evPrim, evNull) => { + val buffer = ctx.freshName("buffer") + val bufferClass = classOf[UTF8StringBuilder].getName + val writeMapElemCode = writeMapToStringBuilder(kt, vt, c, buffer, ctx) + s""" + |$bufferClass $buffer = new $bufferClass(); + |$writeMapElemCode; + |$evPrim = $buffer.build(); + """.stripMargin + } case _ => (c, evPrim, evNull) => s"$evPrim = UTF8String.fromString(String.valueOf($c));" } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index e3ed7171defd8..1445bb8a97d40 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -878,4 +878,32 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { StringType) checkEvaluation(ret8, "[[[a], [b, c]], [[d]]]") } + + test("SPARK-22973 Cast map to string") { + val ret1 = cast(Literal.create(Map(1 -> "a", 2 -> "b", 3 -> "c")), StringType) + checkEvaluation(ret1, "[1 -> a, 2 -> b, 3 -> c]") + val ret2 = cast( + Literal.create(Map("1" -> "a".getBytes, "2" -> null, "3" -> "c".getBytes)), + StringType) + checkEvaluation(ret2, "[1 -> a, 2 ->, 3 -> c]") + val ret3 = cast( + Literal.create(Map( + 1 -> Date.valueOf("2014-12-03"), + 2 -> Date.valueOf("2014-12-04"), + 3 -> Date.valueOf("2014-12-05"))), + StringType) + checkEvaluation(ret3, "[1 -> 2014-12-03, 2 -> 2014-12-04, 3 -> 2014-12-05]") + val ret4 = cast( + Literal.create(Map( + 1 -> Timestamp.valueOf("2014-12-03 13:01:00"), + 2 -> Timestamp.valueOf("2014-12-04 15:05:00"))), + StringType) + checkEvaluation(ret4, "[1 -> 2014-12-03 13:01:00, 2 -> 2014-12-04 15:05:00]") + val ret5 = cast( + Literal.create(Map( + 1 -> Array(1, 2, 3), + 2 -> Array(4, 5, 6))), + StringType) + checkEvaluation(ret5, "[1 -> [1, 2, 3], 2 -> [4, 5, 6]]") + } } From 7673e9c569821e20c56c2a9a875bcc025d36315f Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 8 Jan 2018 11:39:45 +0800 Subject: [PATCH 035/566] [SPARK-22985] Fix argument escaping bug in from_utc_timestamp / to_utc_timestamp codegen ## What changes were proposed in this pull request? This patch adds additional escaping in `from_utc_timestamp` / `to_utc_timestamp` expression codegen in order to a bug where invalid timezones which contain special characters could cause generated code to fail to compile. ## How was this patch tested? New regression tests in `DateExpressionsSuite`. Author: Josh Rosen Closes #20182 from JoshRosen/SPARK-22985-fix-utc-timezone-function-escaping-bugs. 
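The core of the fix is running the timezone string through `StringEscapeUtils.escapeJava` before splicing it into generated Java source. A small sketch of the idea, assuming commons-lang3 on the classpath (Spark already depends on it); the `genGetTimeZone` helper is made up for illustration and is not the real codegen path:

```scala
import org.apache.commons.lang3.StringEscapeUtils

object TimezoneEscaping {
  // Emit a line of Java source that embeds a user-supplied timezone id.
  def genGetTimeZone(tz: String): String =
    s"""tz = dtu.getTimeZone("${StringEscapeUtils.escapeJava(tz)}");"""

  def main(args: Array[String]): Unit = {
    // A well-formed id passes through unchanged.
    println(genGetTimeZone("America/Los_Angeles"))
    // Without escaping, a value like "\"quote" would close the Java string
    // literal early and the generated class would fail to compile.
    println(genGetTimeZone("\"quote")) // tz = dtu.getTimeZone("\"quote");
  }
}
```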
(cherry picked from commit 71d65a32158a55285be197bec4e41fedc9225b94) Signed-off-by: gatorsmile --- .../catalyst/expressions/datetimeExpressions.scala | 12 ++++++++---- .../catalyst/expressions/DateExpressionsSuite.scala | 6 ++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 7a674ea7f4d76..424871f2047e9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -23,6 +23,8 @@ import java.util.{Calendar, TimeZone} import scala.util.control.NonFatal +import org.apache.commons.lang3.StringEscapeUtils + import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodegenFallback, ExprCode} import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -1008,7 +1010,7 @@ case class FromUTCTimestamp(left: Expression, right: Expression) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") if (right.foldable) { - val tz = right.eval() + val tz = right.eval().asInstanceOf[UTF8String] if (tz == null) { ev.copy(code = s""" |boolean ${ev.isNull} = true; @@ -1017,8 +1019,9 @@ case class FromUTCTimestamp(left: Expression, right: Expression) } else { val tzClass = classOf[TimeZone].getName val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") + val escapedTz = StringEscapeUtils.escapeJava(tz.toString) val tzTerm = ctx.addMutableState(tzClass, "tz", - v => s"""$v = $dtu.getTimeZone("$tz");""") + v => s"""$v = $dtu.getTimeZone("$escapedTz");""") val utcTerm = "tzUTC" ctx.addImmutableStateIfNotExists(tzClass, utcTerm, v => s"""$v = $dtu.getTimeZone("UTC");""") @@ -1185,7 +1188,7 @@ case class ToUTCTimestamp(left: Expression, right: Expression) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") if (right.foldable) { - val tz = right.eval() + val tz = right.eval().asInstanceOf[UTF8String] if (tz == null) { ev.copy(code = s""" |boolean ${ev.isNull} = true; @@ -1194,8 +1197,9 @@ case class ToUTCTimestamp(left: Expression, right: Expression) } else { val tzClass = classOf[TimeZone].getName val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") + val escapedTz = StringEscapeUtils.escapeJava(tz.toString) val tzTerm = ctx.addMutableState(tzClass, "tz", - v => s"""$v = $dtu.getTimeZone("$tz");""") + v => s"""$v = $dtu.getTimeZone("$escapedTz");""") val utcTerm = "tzUTC" ctx.addImmutableStateIfNotExists(tzClass, utcTerm, v => s"""$v = $dtu.getTimeZone("UTC");""") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index 63f6ceeb21b96..786266a2c13c0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -22,6 +22,7 @@ import java.text.SimpleDateFormat import java.util.{Calendar, Locale, TimeZone} import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import 
org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.catalyst.util.DateTimeUtils.TimeZoneGMT @@ -791,6 +792,9 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { test(null, "UTC", null) test("2015-07-24 00:00:00", null, null) test(null, null, null) + // Test escaping of timezone + GenerateUnsafeProjection.generate( + ToUTCTimestamp(Literal(Timestamp.valueOf("2015-07-24 00:00:00")), Literal("\"quote")) :: Nil) } test("from_utc_timestamp") { @@ -811,5 +815,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { test(null, "UTC", null) test("2015-07-24 00:00:00", null, null) test(null, null, null) + // Test escaping of timezone + GenerateUnsafeProjection.generate(FromUTCTimestamp(Literal(0), Literal("\"quote")) :: Nil) } } From a1d3352d279d9eec6de23df83398b12454c6564e Mon Sep 17 00:00:00 2001 From: Guilherme Berger Date: Mon, 8 Jan 2018 14:32:05 +0900 Subject: [PATCH 036/566] [SPARK-22566][PYTHON] Better error message for `_merge_type` in Pandas to Spark DF conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? It provides a better error message when doing `spark_session.createDataFrame(pandas_df)` with no schema and an error occurs in the schema inference due to incompatible types. The Pandas column names are propagated down and the error message mentions which column had the merging error. https://issues.apache.org/jira/browse/SPARK-22566 ## How was this patch tested? Manually in the `./bin/pyspark` console, and with new tests: `./python/run-tests` screen shot 2017-11-21 at 13 29 49 I state that the contribution is my original work and that I license the work to the Apache Spark project under the project’s open source license. Author: Guilherme Berger Closes #19792 from gberger/master. (cherry picked from commit 3e40eb3f1ffac3d2f49459a801e3ce171ed34091) Signed-off-by: Takuya UESHIN --- python/pyspark/sql/session.py | 17 +++--- python/pyspark/sql/tests.py | 100 ++++++++++++++++++++++++++++++++++ python/pyspark/sql/types.py | 28 +++++++--- 3 files changed, 129 insertions(+), 16 deletions(-) diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 6e5eec48e8aca..6052fa9e84096 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -325,11 +325,12 @@ def range(self, start, end=None, step=1, numPartitions=None): return DataFrame(jdf, self._wrapped) - def _inferSchemaFromList(self, data): + def _inferSchemaFromList(self, data, names=None): """ Infer schema from list of Row or tuple. :param data: list of Row or tuple + :param names: list of column names :return: :class:`pyspark.sql.types.StructType` """ if not data: @@ -338,12 +339,12 @@ def _inferSchemaFromList(self, data): if type(first) is dict: warnings.warn("inferring schema from dict is deprecated," "please use pyspark.sql.Row instead") - schema = reduce(_merge_type, map(_infer_schema, data)) + schema = reduce(_merge_type, (_infer_schema(row, names) for row in data)) if _has_nulltype(schema): raise ValueError("Some of types cannot be determined after inferring") return schema - def _inferSchema(self, rdd, samplingRatio=None): + def _inferSchema(self, rdd, samplingRatio=None, names=None): """ Infer schema from an RDD of Row or tuple. 
@@ -360,10 +361,10 @@ def _inferSchema(self, rdd, samplingRatio=None): "Use pyspark.sql.Row instead") if samplingRatio is None: - schema = _infer_schema(first) + schema = _infer_schema(first, names=names) if _has_nulltype(schema): for row in rdd.take(100)[1:]: - schema = _merge_type(schema, _infer_schema(row)) + schema = _merge_type(schema, _infer_schema(row, names=names)) if not _has_nulltype(schema): break else: @@ -372,7 +373,7 @@ def _inferSchema(self, rdd, samplingRatio=None): else: if samplingRatio < 0.99: rdd = rdd.sample(False, float(samplingRatio)) - schema = rdd.map(_infer_schema).reduce(_merge_type) + schema = rdd.map(lambda row: _infer_schema(row, names)).reduce(_merge_type) return schema def _createFromRDD(self, rdd, schema, samplingRatio): @@ -380,7 +381,7 @@ def _createFromRDD(self, rdd, schema, samplingRatio): Create an RDD for DataFrame from an existing RDD, returns the RDD and schema. """ if schema is None or isinstance(schema, (list, tuple)): - struct = self._inferSchema(rdd, samplingRatio) + struct = self._inferSchema(rdd, samplingRatio, names=schema) converter = _create_converter(struct) rdd = rdd.map(converter) if isinstance(schema, (list, tuple)): @@ -406,7 +407,7 @@ def _createFromLocal(self, data, schema): data = list(data) if schema is None or isinstance(schema, (list, tuple)): - struct = self._inferSchemaFromList(data) + struct = self._inferSchemaFromList(data, names=schema) converter = _create_converter(struct) data = map(converter, data) if isinstance(schema, (list, tuple)): diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 122a65b83aef9..13576ff57001b 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -68,6 +68,7 @@ from pyspark.sql.types import UserDefinedType, _infer_type, _make_type_verifier from pyspark.sql.types import _array_signed_int_typecode_ctype_mappings, _array_type_mappings from pyspark.sql.types import _array_unsigned_int_typecode_ctype_mappings +from pyspark.sql.types import _merge_type from pyspark.tests import QuietTest, ReusedPySparkTestCase, SparkSubmitTests from pyspark.sql.functions import UserDefinedFunction, sha2, lit from pyspark.sql.window import Window @@ -898,6 +899,15 @@ def test_infer_schema(self): result = self.spark.sql("SELECT l[0].a from test2 where d['key'].d = '2'") self.assertEqual(1, result.head()[0]) + def test_infer_schema_not_enough_names(self): + df = self.spark.createDataFrame([["a", "b"]], ["col1"]) + self.assertEqual(df.columns, ['col1', '_2']) + + def test_infer_schema_fails(self): + with self.assertRaisesRegexp(TypeError, 'field a'): + self.spark.createDataFrame(self.spark.sparkContext.parallelize([[1, 1], ["x", 1]]), + schema=["a", "b"], samplingRatio=0.99) + def test_infer_nested_schema(self): NestedRow = Row("f1", "f2") nestedRdd1 = self.sc.parallelize([NestedRow([1, 2], {"row1": 1.0}), @@ -918,6 +928,10 @@ def test_infer_nested_schema(self): df = self.spark.createDataFrame(rdd) self.assertEqual(Row(field1=1, field2=u'row1'), df.first()) + def test_create_dataframe_from_dict_respects_schema(self): + df = self.spark.createDataFrame([{'a': 1}], ["b"]) + self.assertEqual(df.columns, ['b']) + def test_create_dataframe_from_objects(self): data = [MyObject(1, "1"), MyObject(2, "2")] df = self.spark.createDataFrame(data) @@ -1772,6 +1786,92 @@ def test_infer_long_type(self): self.assertEqual(_infer_type(2**61), LongType()) self.assertEqual(_infer_type(2**71), LongType()) + def test_merge_type(self): + self.assertEqual(_merge_type(LongType(), NullType()), LongType()) 
+ self.assertEqual(_merge_type(NullType(), LongType()), LongType()) + + self.assertEqual(_merge_type(LongType(), LongType()), LongType()) + + self.assertEqual(_merge_type( + ArrayType(LongType()), + ArrayType(LongType()) + ), ArrayType(LongType())) + with self.assertRaisesRegexp(TypeError, 'element in array'): + _merge_type(ArrayType(LongType()), ArrayType(DoubleType())) + + self.assertEqual(_merge_type( + MapType(StringType(), LongType()), + MapType(StringType(), LongType()) + ), MapType(StringType(), LongType())) + with self.assertRaisesRegexp(TypeError, 'key of map'): + _merge_type( + MapType(StringType(), LongType()), + MapType(DoubleType(), LongType())) + with self.assertRaisesRegexp(TypeError, 'value of map'): + _merge_type( + MapType(StringType(), LongType()), + MapType(StringType(), DoubleType())) + + self.assertEqual(_merge_type( + StructType([StructField("f1", LongType()), StructField("f2", StringType())]), + StructType([StructField("f1", LongType()), StructField("f2", StringType())]) + ), StructType([StructField("f1", LongType()), StructField("f2", StringType())])) + with self.assertRaisesRegexp(TypeError, 'field f1'): + _merge_type( + StructType([StructField("f1", LongType()), StructField("f2", StringType())]), + StructType([StructField("f1", DoubleType()), StructField("f2", StringType())])) + + self.assertEqual(_merge_type( + StructType([StructField("f1", StructType([StructField("f2", LongType())]))]), + StructType([StructField("f1", StructType([StructField("f2", LongType())]))]) + ), StructType([StructField("f1", StructType([StructField("f2", LongType())]))])) + with self.assertRaisesRegexp(TypeError, 'field f2 in field f1'): + _merge_type( + StructType([StructField("f1", StructType([StructField("f2", LongType())]))]), + StructType([StructField("f1", StructType([StructField("f2", StringType())]))])) + + self.assertEqual(_merge_type( + StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())]), + StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())]) + ), StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())])) + with self.assertRaisesRegexp(TypeError, 'element in array field f1'): + _merge_type( + StructType([ + StructField("f1", ArrayType(LongType())), + StructField("f2", StringType())]), + StructType([ + StructField("f1", ArrayType(DoubleType())), + StructField("f2", StringType())])) + + self.assertEqual(_merge_type( + StructType([ + StructField("f1", MapType(StringType(), LongType())), + StructField("f2", StringType())]), + StructType([ + StructField("f1", MapType(StringType(), LongType())), + StructField("f2", StringType())]) + ), StructType([ + StructField("f1", MapType(StringType(), LongType())), + StructField("f2", StringType())])) + with self.assertRaisesRegexp(TypeError, 'value of map field f1'): + _merge_type( + StructType([ + StructField("f1", MapType(StringType(), LongType())), + StructField("f2", StringType())]), + StructType([ + StructField("f1", MapType(StringType(), DoubleType())), + StructField("f2", StringType())])) + + self.assertEqual(_merge_type( + StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]), + StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]) + ), StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))])) + with self.assertRaisesRegexp(TypeError, 'key of map element in array field f1'): + _merge_type( + StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]), 
+ StructType([StructField("f1", ArrayType(MapType(DoubleType(), LongType())))]) + ) + def test_filter_with_datetime(self): time = datetime.datetime(2015, 4, 17, 23, 1, 2, 3000) date = time.date() diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 146e673ae9756..0dc5823f72a3c 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1073,7 +1073,7 @@ def _infer_type(obj): raise TypeError("not supported type: %s" % type(obj)) -def _infer_schema(row): +def _infer_schema(row, names=None): """Infer the schema from dict/namedtuple/object""" if isinstance(row, dict): items = sorted(row.items()) @@ -1084,7 +1084,10 @@ def _infer_schema(row): elif hasattr(row, "_fields"): # namedtuple items = zip(row._fields, tuple(row)) else: - names = ['_%d' % i for i in range(1, len(row) + 1)] + if names is None: + names = ['_%d' % i for i in range(1, len(row) + 1)] + elif len(names) < len(row): + names.extend('_%d' % i for i in range(len(names) + 1, len(row) + 1)) items = zip(names, row) elif hasattr(row, "__dict__"): # object @@ -1109,19 +1112,27 @@ def _has_nulltype(dt): return isinstance(dt, NullType) -def _merge_type(a, b): +def _merge_type(a, b, name=None): + if name is None: + new_msg = lambda msg: msg + new_name = lambda n: "field %s" % n + else: + new_msg = lambda msg: "%s: %s" % (name, msg) + new_name = lambda n: "field %s in %s" % (n, name) + if isinstance(a, NullType): return b elif isinstance(b, NullType): return a elif type(a) is not type(b): # TODO: type cast (such as int -> long) - raise TypeError("Can not merge type %s and %s" % (type(a), type(b))) + raise TypeError(new_msg("Can not merge type %s and %s" % (type(a), type(b)))) # same type if isinstance(a, StructType): nfs = dict((f.name, f.dataType) for f in b.fields) - fields = [StructField(f.name, _merge_type(f.dataType, nfs.get(f.name, NullType()))) + fields = [StructField(f.name, _merge_type(f.dataType, nfs.get(f.name, NullType()), + name=new_name(f.name))) for f in a.fields] names = set([f.name for f in fields]) for n in nfs: @@ -1130,11 +1141,12 @@ def _merge_type(a, b): return StructType(fields) elif isinstance(a, ArrayType): - return ArrayType(_merge_type(a.elementType, b.elementType), True) + return ArrayType(_merge_type(a.elementType, b.elementType, + name='element in array %s' % name), True) elif isinstance(a, MapType): - return MapType(_merge_type(a.keyType, b.keyType), - _merge_type(a.valueType, b.valueType), + return MapType(_merge_type(a.keyType, b.keyType, name='key of map %s' % name), + _merge_type(a.valueType, b.valueType, name='value of map %s' % name), True) else: return a From 8bf24e9fea9e9e23f03caf8b32acb4a64f5b00e3 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 8 Jan 2018 13:59:08 +0800 Subject: [PATCH 037/566] [SPARK-22979][PYTHON][SQL] Avoid per-record type dispatch in Python data conversion (EvaluatePython.fromJava) ## What changes were proposed in this pull request? Seems we can avoid type dispatch for each value when Java objection (from Pyrolite) -> Spark's internal data format because we know the schema ahead. 
I manually performed the benchmark as below: ```scala test("EvaluatePython.fromJava / EvaluatePython.makeFromJava") { val numRows = 1000 * 1000 val numFields = 30 val random = new Random(System.nanoTime()) val types = Array( BooleanType, ByteType, FloatType, DoubleType, IntegerType, LongType, ShortType, DecimalType.ShortDecimal, DecimalType.IntDecimal, DecimalType.ByteDecimal, DecimalType.FloatDecimal, DecimalType.LongDecimal, new DecimalType(5, 2), new DecimalType(12, 2), new DecimalType(30, 10), CalendarIntervalType) val schema = RandomDataGenerator.randomSchema(random, numFields, types) val rows = mutable.ArrayBuffer.empty[Array[Any]] var i = 0 while (i < numRows) { val row = RandomDataGenerator.randomRow(random, schema) rows += row.toSeq.toArray i += 1 } val benchmark = new Benchmark("EvaluatePython.fromJava / EvaluatePython.makeFromJava", numRows) benchmark.addCase("Before - EvaluatePython.fromJava", 3) { _ => var i = 0 while (i < numRows) { EvaluatePython.fromJava(rows(i), schema) i += 1 } } benchmark.addCase("After - EvaluatePython.makeFromJava", 3) { _ => val fromJava = EvaluatePython.makeFromJava(schema) var i = 0 while (i < numRows) { fromJava(rows(i)) i += 1 } } benchmark.run() } ``` ``` EvaluatePython.fromJava / EvaluatePython.makeFromJava: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ Before - EvaluatePython.fromJava 1265 / 1346 0.8 1264.8 1.0X After - EvaluatePython.makeFromJava 571 / 649 1.8 570.8 2.2X ``` If the structure is nested, I think the advantage should be larger than this. ## How was this patch tested? Existing tests should cover this. Also, I manually checked if the values from before / after are actually same via `assert` when performing the benchmarks. Author: hyukjinkwon Closes #20172 from HyukjinKwon/type-dispatch-python-eval. 
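The shape of the change, stripped of Spark specifics: resolve the schema to a converter function once, then apply that function to every record, instead of re-dispatching on `(value, dataType)` per record. A self-contained sketch with made-up type names (not Spark's API):

```scala
object ConverterSketch {
  sealed trait SimpleType
  case object IntT extends SimpleType
  case object StringT extends SimpleType
  final case class ArrayT(element: SimpleType) extends SimpleType

  // Walk the schema once and return a reusable Any => Any converter.
  def makeConverter(dt: SimpleType): Any => Any = dt match {
    case IntT => (obj: Any) => obj match {
      case i: Int => i
      case l: Long => l.toInt
      case _ => null
    }
    case StringT => (obj: Any) => if (obj == null) null else obj.toString
    case ArrayT(element) =>
      val convertElement = makeConverter(element) // resolved once, not per record
      (obj: Any) => obj match {
        case s: Seq[_] => s.map(convertElement)
        case _ => null
      }
  }

  def main(args: Array[String]): Unit = {
    val convert = makeConverter(ArrayT(IntT))
    println(convert(Seq(1L, 2L, 3L))) // List(1, 2, 3)
  }
}
```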
(cherry picked from commit 8fdeb4b9946bd9be045abb919da2e531708b3bd4) Signed-off-by: Wenchen Fan Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/SparkSession.scala | 5 +- .../python/BatchEvalPythonExec.scala | 7 +- .../sql/execution/python/EvaluatePython.scala | 166 ++++++++++++------ 3 files changed, 118 insertions(+), 60 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 272eb844226d4..734573ba31f71 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -742,7 +742,10 @@ class SparkSession private( private[sql] def applySchemaToPythonRDD( rdd: RDD[Array[Any]], schema: StructType): DataFrame = { - val rowRdd = rdd.map(r => python.EvaluatePython.fromJava(r, schema).asInstanceOf[InternalRow]) + val rowRdd = rdd.mapPartitions { iter => + val fromJava = python.EvaluatePython.makeFromJava(schema) + iter.map(r => fromJava(r).asInstanceOf[InternalRow]) + } internalCreateDataFrame(rowRdd, schema) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala index 26ee25f633ea4..f4d83e8dc7c2b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala @@ -79,16 +79,19 @@ case class BatchEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], chi } else { StructType(udfs.map(u => StructField("", u.dataType, u.nullable))) } + + val fromJava = EvaluatePython.makeFromJava(resultType) + outputIterator.flatMap { pickedResult => val unpickledBatch = unpickle.loads(pickedResult) unpickledBatch.asInstanceOf[java.util.ArrayList[Any]].asScala }.map { result => if (udfs.length == 1) { // fast path for single UDF - mutableRow(0) = EvaluatePython.fromJava(result, resultType) + mutableRow(0) = fromJava(result) mutableRow } else { - EvaluatePython.fromJava(result, resultType).asInstanceOf[InternalRow] + fromJava(result).asInstanceOf[InternalRow] } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala index 9bbfa6018ba77..520afad287648 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala @@ -83,82 +83,134 @@ object EvaluatePython { } /** - * Converts `obj` to the type specified by the data type, or returns null if the type of obj is - * unexpected. Because Python doesn't enforce the type. + * Make a converter that converts `obj` to the type specified by the data type, or returns + * null if the type of obj is unexpected. Because Python doesn't enforce the type. 
*/ - def fromJava(obj: Any, dataType: DataType): Any = (obj, dataType) match { - case (null, _) => null - - case (c: Boolean, BooleanType) => c + def makeFromJava(dataType: DataType): Any => Any = dataType match { + case BooleanType => (obj: Any) => nullSafeConvert(obj) { + case b: Boolean => b + } - case (c: Byte, ByteType) => c - case (c: Short, ByteType) => c.toByte - case (c: Int, ByteType) => c.toByte - case (c: Long, ByteType) => c.toByte + case ByteType => (obj: Any) => nullSafeConvert(obj) { + case c: Byte => c + case c: Short => c.toByte + case c: Int => c.toByte + case c: Long => c.toByte + } - case (c: Byte, ShortType) => c.toShort - case (c: Short, ShortType) => c - case (c: Int, ShortType) => c.toShort - case (c: Long, ShortType) => c.toShort + case ShortType => (obj: Any) => nullSafeConvert(obj) { + case c: Byte => c.toShort + case c: Short => c + case c: Int => c.toShort + case c: Long => c.toShort + } - case (c: Byte, IntegerType) => c.toInt - case (c: Short, IntegerType) => c.toInt - case (c: Int, IntegerType) => c - case (c: Long, IntegerType) => c.toInt + case IntegerType => (obj: Any) => nullSafeConvert(obj) { + case c: Byte => c.toInt + case c: Short => c.toInt + case c: Int => c + case c: Long => c.toInt + } - case (c: Byte, LongType) => c.toLong - case (c: Short, LongType) => c.toLong - case (c: Int, LongType) => c.toLong - case (c: Long, LongType) => c + case LongType => (obj: Any) => nullSafeConvert(obj) { + case c: Byte => c.toLong + case c: Short => c.toLong + case c: Int => c.toLong + case c: Long => c + } - case (c: Float, FloatType) => c - case (c: Double, FloatType) => c.toFloat + case FloatType => (obj: Any) => nullSafeConvert(obj) { + case c: Float => c + case c: Double => c.toFloat + } - case (c: Float, DoubleType) => c.toDouble - case (c: Double, DoubleType) => c + case DoubleType => (obj: Any) => nullSafeConvert(obj) { + case c: Float => c.toDouble + case c: Double => c + } - case (c: java.math.BigDecimal, dt: DecimalType) => Decimal(c, dt.precision, dt.scale) + case dt: DecimalType => (obj: Any) => nullSafeConvert(obj) { + case c: java.math.BigDecimal => Decimal(c, dt.precision, dt.scale) + } - case (c: Int, DateType) => c + case DateType => (obj: Any) => nullSafeConvert(obj) { + case c: Int => c + } - case (c: Long, TimestampType) => c - // Py4J serializes values between MIN_INT and MAX_INT as Ints, not Longs - case (c: Int, TimestampType) => c.toLong + case TimestampType => (obj: Any) => nullSafeConvert(obj) { + case c: Long => c + // Py4J serializes values between MIN_INT and MAX_INT as Ints, not Longs + case c: Int => c.toLong + } - case (c, StringType) => UTF8String.fromString(c.toString) + case StringType => (obj: Any) => nullSafeConvert(obj) { + case _ => UTF8String.fromString(obj.toString) + } - case (c: String, BinaryType) => c.getBytes(StandardCharsets.UTF_8) - case (c, BinaryType) if c.getClass.isArray && c.getClass.getComponentType.getName == "byte" => c + case BinaryType => (obj: Any) => nullSafeConvert(obj) { + case c: String => c.getBytes(StandardCharsets.UTF_8) + case c if c.getClass.isArray && c.getClass.getComponentType.getName == "byte" => c + } - case (c: java.util.List[_], ArrayType(elementType, _)) => - new GenericArrayData(c.asScala.map { e => fromJava(e, elementType)}.toArray) + case ArrayType(elementType, _) => + val elementFromJava = makeFromJava(elementType) - case (c, ArrayType(elementType, _)) if c.getClass.isArray => - new GenericArrayData(c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType))) + (obj: Any) => 
nullSafeConvert(obj) { + case c: java.util.List[_] => + new GenericArrayData(c.asScala.map { e => elementFromJava(e) }.toArray) + case c if c.getClass.isArray => + new GenericArrayData(c.asInstanceOf[Array[_]].map(e => elementFromJava(e))) + } - case (javaMap: java.util.Map[_, _], MapType(keyType, valueType, _)) => - ArrayBasedMapData( - javaMap, - (key: Any) => fromJava(key, keyType), - (value: Any) => fromJava(value, valueType)) + case MapType(keyType, valueType, _) => + val keyFromJava = makeFromJava(keyType) + val valueFromJava = makeFromJava(valueType) + + (obj: Any) => nullSafeConvert(obj) { + case javaMap: java.util.Map[_, _] => + ArrayBasedMapData( + javaMap, + (key: Any) => keyFromJava(key), + (value: Any) => valueFromJava(value)) + } - case (c, StructType(fields)) if c.getClass.isArray => - val array = c.asInstanceOf[Array[_]] - if (array.length != fields.length) { - throw new IllegalStateException( - s"Input row doesn't have expected number of values required by the schema. " + - s"${fields.length} fields are required while ${array.length} values are provided." - ) + case StructType(fields) => + val fieldsFromJava = fields.map(f => makeFromJava(f.dataType)).toArray + + (obj: Any) => nullSafeConvert(obj) { + case c if c.getClass.isArray => + val array = c.asInstanceOf[Array[_]] + if (array.length != fields.length) { + throw new IllegalStateException( + s"Input row doesn't have expected number of values required by the schema. " + + s"${fields.length} fields are required while ${array.length} values are provided." + ) + } + + val row = new GenericInternalRow(fields.length) + var i = 0 + while (i < fields.length) { + row(i) = fieldsFromJava(i)(array(i)) + i += 1 + } + row } - new GenericInternalRow(array.zip(fields).map { - case (e, f) => fromJava(e, f.dataType) - }) - case (_, udt: UserDefinedType[_]) => fromJava(obj, udt.sqlType) + case udt: UserDefinedType[_] => makeFromJava(udt.sqlType) + + case other => (obj: Any) => nullSafeConvert(other)(PartialFunction.empty) + } - // all other unexpected type should be null, or we will have runtime exception - // TODO(davies): we could improve this by try to cast the object to expected type - case (c, _) => null + private def nullSafeConvert(input: Any)(f: PartialFunction[Any, Any]): Any = { + if (input == null) { + null + } else { + f.applyOrElse(input, { + // all other unexpected type should be null, or we will have runtime exception + // TODO(davies): we could improve this by try to cast the object to expected type + _: Any => null + }) + } } private val module = "pyspark.sql.types" From 6964dfe47b2090e542b26cd64e27420ec3eb1a3d Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 8 Jan 2018 16:04:03 +0800 Subject: [PATCH 038/566] [SPARK-22983] Don't push filters beneath aggregates with empty grouping expressions ## What changes were proposed in this pull request? The following SQL query should return zero rows, but in Spark it actually returns one row: ``` SELECT 1 from ( SELECT 1 AS z, MIN(a.x) FROM (select 1 as x) a WHERE false ) b where b.z != b.z ``` The problem stems from the `PushDownPredicate` rule: when this rule encounters a filter on top of an Aggregate operator, e.g. `Filter(Agg(...))`, it removes the original filter and adds a new filter onto Aggregate's child, e.g. `Agg(Filter(...))`. 
This is sometimes okay, but the case above is a counterexample: because there is no explicit `GROUP BY`, we are implicitly computing a global aggregate over the entire table so the original filter was not acting like a `HAVING` clause filtering the number of groups: if we push this filter then it fails to actually reduce the cardinality of the Aggregate output, leading to the wrong answer. In 2016 I fixed a similar problem involving invalid pushdowns of data-independent filters (filters which reference no columns of the filtered relation). There was additional discussion after my fix was merged which pointed out that my patch was an incomplete fix (see #15289), but it looks I must have either misunderstood the comment or forgot to follow up on the additional points raised there. This patch fixes the problem by choosing to never push down filters in cases where there are no grouping expressions. Since there are no grouping keys, the only columns are aggregate columns and we can't push filters defined over aggregate results, so this change won't cause us to miss out on any legitimate pushdown opportunities. ## How was this patch tested? New regression tests in `SQLQueryTestSuite` and `FilterPushdownSuite`. Author: Josh Rosen Closes #20180 from JoshRosen/SPARK-22983-dont-push-filters-beneath-aggs-with-empty-grouping-expressions. (cherry picked from commit 2c73d2a948bdde798aaf0f87c18846281deb05fd) Signed-off-by: gatorsmile --- .../spark/sql/catalyst/optimizer/Optimizer.scala | 3 ++- .../catalyst/optimizer/FilterPushdownSuite.scala | 13 +++++++++++++ .../test/resources/sql-tests/inputs/group-by.sql | 9 +++++++++ .../resources/sql-tests/results/group-by.sql.out | 16 +++++++++++++++- 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 0d4b02c6e7d8a..df0af8264a329 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -795,7 +795,8 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper { project.copy(child = Filter(replaceAlias(condition, aliasMap), grandChild)) case filter @ Filter(condition, aggregate: Aggregate) - if aggregate.aggregateExpressions.forall(_.deterministic) => + if aggregate.aggregateExpressions.forall(_.deterministic) + && aggregate.groupingExpressions.nonEmpty => // Find all the aliased expressions in the aggregate list that don't include any actual // AggregateExpression, and create a map from the alias to the expression val aliasMap = AttributeMap(aggregate.aggregateExpressions.collect { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala index 85a5e979f6021..82a10254d846d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala @@ -809,6 +809,19 @@ class FilterPushdownSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("aggregate: don't push filters if the aggregate has no grouping expressions") { + val originalQuery = LocalRelation.apply(testRelation.output, Seq.empty) + .select('a, 'b) + .groupBy()(count(1)) + 
.where(false) + + val optimized = Optimize.execute(originalQuery.analyze) + + val correctAnswer = originalQuery.analyze + + comparePlans(optimized, correctAnswer) + } + test("broadcast hint") { val originalQuery = ResolvedHint(testRelation) .where('a === 2L && 'b + Rand(10).as("rnd") === 3) diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql index 1e1384549a410..c5070b734d521 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql @@ -60,3 +60,12 @@ SELECT a, COUNT(1) FROM testData WHERE false GROUP BY a; -- Aggregate with empty input and empty GroupBy expressions. SELECT COUNT(1) FROM testData WHERE false; SELECT 1 FROM (SELECT COUNT(1) FROM testData WHERE false) t; + +-- Aggregate with empty GroupBy expressions and filter on top +SELECT 1 from ( + SELECT 1 AS z, + MIN(a.x) + FROM (select 1 as x) a + WHERE false +) b +where b.z != b.z diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out index 986bb01c13fe4..c1abc6dff754b 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 25 +-- Number of queries: 26 -- !query 0 @@ -227,3 +227,17 @@ SELECT 1 FROM (SELECT COUNT(1) FROM testData WHERE false) t struct<1:int> -- !query 24 output 1 + + +-- !query 25 +SELECT 1 from ( + SELECT 1 AS z, + MIN(a.x) + FROM (select 1 as x) a + WHERE false +) b +where b.z != b.z +-- !query 25 schema +struct<1:int> +-- !query 25 output + From 4a45f0a532216736f8874417c5cbd7912ca13db5 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Mon, 8 Jan 2018 19:41:41 +0800 Subject: [PATCH 039/566] [SPARK-21865][SQL] simplify the distribution semantic of Spark SQL ## What changes were proposed in this pull request? **The current shuffle planning logic** 1. Each operator specifies the distribution requirements for its children, via the `Distribution` interface. 2. Each operator specifies its output partitioning, via the `Partitioning` interface. 3. `Partitioning.satisfy` determines whether a `Partitioning` can satisfy a `Distribution`. 4. For each operator, check each child of it, add a shuffle node above the child if the child partitioning can not satisfy the required distribution. 5. For each operator, check if its children's output partitionings are compatible with each other, via the `Partitioning.compatibleWith`. 6. If the check in 5 failed, add a shuffle above each child. 7. try to eliminate the shuffles added in 6, via `Partitioning.guarantees`. This design has a major problem with the definition of "compatible". `Partitioning.compatibleWith` is not well defined, ideally a `Partitioning` can't know if it's compatible with other `Partitioning`, without more information from the operator. For example, `t1 join t2 on t1.a = t2.b`, `HashPartitioning(a, 10)` should be compatible with `HashPartitioning(b, 10)` under this case, but the partitioning itself doesn't know it. As a result, currently `Partitioning.compatibleWith` always return false except for literals, which make it almost useless. This also means, if an operator has distribution requirements for multiple children, Spark always add shuffle nodes to all the children(although some of them can be eliminated). 
However, there is no guarantee that the children's output partitionings are compatible with each other after adding these shuffles, we just assume that the operator will only specify `ClusteredDistribution` for multiple children. I think it's very hard to guarantee children co-partition for all kinds of operators, and we can not even give a clear definition about co-partition between distributions like `ClusteredDistribution(a,b)` and `ClusteredDistribution(c)`. I think we should drop the "compatible" concept in the distribution model, and let the operator achieve the co-partition requirement by special distribution requirements. **Proposed shuffle planning logic after this PR** (The first 4 are same as before) 1. Each operator specifies the distribution requirements for its children, via the `Distribution` interface. 2. Each operator specifies its output partitioning, via the `Partitioning` interface. 3. `Partitioning.satisfy` determines whether a `Partitioning` can satisfy a `Distribution`. 4. For each operator, check each child of it, add a shuffle node above the child if the child partitioning can not satisfy the required distribution. 5. For each operator, check if its children's output partitionings have the same number of partitions. 6. If the check in 5 failed, pick the max number of partitions from children's output partitionings, and add shuffle to child whose number of partitions doesn't equal to the max one. The new distribution model is very simple, we only have one kind of relationship, which is `Partitioning.satisfy`. For multiple children, Spark only guarantees they have the same number of partitions, and it's the operator's responsibility to leverage this guarantee to achieve more complicated requirements. For example, non-broadcast joins can use the newly added `HashPartitionedDistribution` to achieve co-partition. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #19080 from cloud-fan/exchange. (cherry picked from commit eb45b52e826ea9cea48629760db35ef87f91fea0) Signed-off-by: Wenchen Fan --- .../plans/physical/partitioning.scala | 286 +++++++----------- .../sql/catalyst/PartitioningSuite.scala | 55 ---- .../spark/sql/execution/SparkPlan.scala | 16 +- .../exchange/EnsureRequirements.scala | 120 +++----- .../joins/ShuffledHashJoinExec.scala | 2 +- .../execution/joins/SortMergeJoinExec.scala | 2 +- .../apache/spark/sql/execution/objects.scala | 2 +- .../spark/sql/execution/PlannerSuite.scala | 81 ++--- 8 files changed, 194 insertions(+), 370 deletions(-) delete mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala index e57c842ce2a36..0189bd73c56bf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala @@ -30,18 +30,43 @@ import org.apache.spark.sql.types.{DataType, IntegerType} * - Intra-partition ordering of data: In this case the distribution describes guarantees made * about how tuples are distributed within a single partition. */ -sealed trait Distribution +sealed trait Distribution { + /** + * The required number of partitions for this distribution. If it's None, then any number of + * partitions is allowed for this distribution. 
+ */ + def requiredNumPartitions: Option[Int] + + /** + * Creates a default partitioning for this distribution, which can satisfy this distribution while + * matching the given number of partitions. + */ + def createPartitioning(numPartitions: Int): Partitioning +} /** * Represents a distribution where no promises are made about co-location of data. */ -case object UnspecifiedDistribution extends Distribution +case object UnspecifiedDistribution extends Distribution { + override def requiredNumPartitions: Option[Int] = None + + override def createPartitioning(numPartitions: Int): Partitioning = { + throw new IllegalStateException("UnspecifiedDistribution does not have default partitioning.") + } +} /** * Represents a distribution that only has a single partition and all tuples of the dataset * are co-located. */ -case object AllTuples extends Distribution +case object AllTuples extends Distribution { + override def requiredNumPartitions: Option[Int] = Some(1) + + override def createPartitioning(numPartitions: Int): Partitioning = { + assert(numPartitions == 1, "The default partitioning of AllTuples can only have 1 partition.") + SinglePartition + } +} /** * Represents data where tuples that share the same values for the `clustering` @@ -51,12 +76,41 @@ case object AllTuples extends Distribution */ case class ClusteredDistribution( clustering: Seq[Expression], - numPartitions: Option[Int] = None) extends Distribution { + requiredNumPartitions: Option[Int] = None) extends Distribution { require( clustering != Nil, "The clustering expressions of a ClusteredDistribution should not be Nil. " + "An AllTuples should be used to represent a distribution that only has " + "a single partition.") + + override def createPartitioning(numPartitions: Int): Partitioning = { + assert(requiredNumPartitions.isEmpty || requiredNumPartitions.get == numPartitions, + s"This ClusteredDistribution requires ${requiredNumPartitions.get} partitions, but " + + s"the actual number of partitions is $numPartitions.") + HashPartitioning(clustering, numPartitions) + } +} + +/** + * Represents data where tuples have been clustered according to the hash of the given + * `expressions`. The hash function is defined as `HashPartitioning.partitionIdExpression`, so only + * [[HashPartitioning]] can satisfy this distribution. + * + * This is a strictly stronger guarantee than [[ClusteredDistribution]]. Given a tuple and the + * number of partitions, this distribution strictly requires which partition the tuple should be in. + */ +case class HashClusteredDistribution(expressions: Seq[Expression]) extends Distribution { + require( + expressions != Nil, + "The expressions for hash of a HashPartitionedDistribution should not be Nil. " + + "An AllTuples should be used to represent a distribution that only has " + + "a single partition.") + + override def requiredNumPartitions: Option[Int] = None + + override def createPartitioning(numPartitions: Int): Partitioning = { + HashPartitioning(expressions, numPartitions) + } } /** @@ -73,46 +127,31 @@ case class OrderedDistribution(ordering: Seq[SortOrder]) extends Distribution { "An AllTuples should be used to represent a distribution that only has " + "a single partition.") - // TODO: This is not really valid... 
- def clustering: Set[Expression] = ordering.map(_.child).toSet + override def requiredNumPartitions: Option[Int] = None + + override def createPartitioning(numPartitions: Int): Partitioning = { + RangePartitioning(ordering, numPartitions) + } } /** * Represents data where tuples are broadcasted to every node. It is quite common that the * entire set of tuples is transformed into different data structure. */ -case class BroadcastDistribution(mode: BroadcastMode) extends Distribution +case class BroadcastDistribution(mode: BroadcastMode) extends Distribution { + override def requiredNumPartitions: Option[Int] = Some(1) + + override def createPartitioning(numPartitions: Int): Partitioning = { + assert(numPartitions == 1, + "The default partitioning of BroadcastDistribution can only have 1 partition.") + BroadcastPartitioning(mode) + } +} /** - * Describes how an operator's output is split across partitions. The `compatibleWith`, - * `guarantees`, and `satisfies` methods describe relationships between child partitionings, - * target partitionings, and [[Distribution]]s. These relations are described more precisely in - * their individual method docs, but at a high level: - * - * - `satisfies` is a relationship between partitionings and distributions. - * - `compatibleWith` is relationships between an operator's child output partitionings. - * - `guarantees` is a relationship between a child's existing output partitioning and a target - * output partitioning. - * - * Diagrammatically: - * - * +--------------+ - * | Distribution | - * +--------------+ - * ^ - * | - * satisfies - * | - * +--------------+ +--------------+ - * | Child | | Target | - * +----| Partitioning |----guarantees--->| Partitioning | - * | +--------------+ +--------------+ - * | ^ - * | | - * | compatibleWith - * | | - * +------------+ - * + * Describes how an operator's output is split across partitions. It has 2 major properties: + * 1. number of partitions. + * 2. if it can satisfy a given distribution. */ sealed trait Partitioning { /** Returns the number of partitions that the data is split across */ @@ -123,113 +162,35 @@ sealed trait Partitioning { * to satisfy the partitioning scheme mandated by the `required` [[Distribution]], * i.e. the current dataset does not need to be re-partitioned for the `required` * Distribution (it is possible that tuples within a partition need to be reorganized). - */ - def satisfies(required: Distribution): Boolean - - /** - * Returns true iff we can say that the partitioning scheme of this [[Partitioning]] - * guarantees the same partitioning scheme described by `other`. - * - * Compatibility of partitionings is only checked for operators that have multiple children - * and that require a specific child output [[Distribution]], such as joins. - * - * Intuitively, partitionings are compatible if they route the same partitioning key to the same - * partition. For instance, two hash partitionings are only compatible if they produce the same - * number of output partitionings and hash records according to the same hash function and - * same partitioning key schema. - * - * Put another way, two partitionings are compatible with each other if they satisfy all of the - * same distribution guarantees. - */ - def compatibleWith(other: Partitioning): Boolean - - /** - * Returns true iff we can say that the partitioning scheme of this [[Partitioning]] guarantees - * the same partitioning scheme described by `other`. 
If a `A.guarantees(B)`, then repartitioning - * the child's output according to `B` will be unnecessary. `guarantees` is used as a performance - * optimization to allow the exchange planner to avoid redundant repartitionings. By default, - * a partitioning only guarantees partitionings that are equal to itself (i.e. the same number - * of partitions, same strategy (range or hash), etc). - * - * In order to enable more aggressive optimization, this strict equality check can be relaxed. - * For example, say that the planner needs to repartition all of an operator's children so that - * they satisfy the [[AllTuples]] distribution. One way to do this is to repartition all children - * to have the [[SinglePartition]] partitioning. If one of the operator's children already happens - * to be hash-partitioned with a single partition then we do not need to re-shuffle this child; - * this repartitioning can be avoided if a single-partition [[HashPartitioning]] `guarantees` - * [[SinglePartition]]. - * - * The SinglePartition example given above is not particularly interesting; guarantees' real - * value occurs for more advanced partitioning strategies. SPARK-7871 will introduce a notion - * of null-safe partitionings, under which partitionings can specify whether rows whose - * partitioning keys contain null values will be grouped into the same partition or whether they - * will have an unknown / random distribution. If a partitioning does not require nulls to be - * clustered then a partitioning which _does_ cluster nulls will guarantee the null clustered - * partitioning. The converse is not true, however: a partitioning which clusters nulls cannot - * be guaranteed by one which does not cluster them. Thus, in general `guarantees` is not a - * symmetric relation. * - * Another way to think about `guarantees`: if `A.guarantees(B)`, then any partitioning of rows - * produced by `A` could have also been produced by `B`. + * By default a [[Partitioning]] can satisfy [[UnspecifiedDistribution]], and [[AllTuples]] if + * the [[Partitioning]] only have one partition. Implementations can overwrite this method with + * special logic. */ - def guarantees(other: Partitioning): Boolean = this == other -} - -object Partitioning { - def allCompatible(partitionings: Seq[Partitioning]): Boolean = { - // Note: this assumes transitivity - partitionings.sliding(2).map { - case Seq(a) => true - case Seq(a, b) => - if (a.numPartitions != b.numPartitions) { - assert(!a.compatibleWith(b) && !b.compatibleWith(a)) - false - } else { - a.compatibleWith(b) && b.compatibleWith(a) - } - }.forall(_ == true) - } -} - -case class UnknownPartitioning(numPartitions: Int) extends Partitioning { - override def satisfies(required: Distribution): Boolean = required match { + def satisfies(required: Distribution): Boolean = required match { case UnspecifiedDistribution => true + case AllTuples => numPartitions == 1 case _ => false } - - override def compatibleWith(other: Partitioning): Boolean = false - - override def guarantees(other: Partitioning): Boolean = false } +case class UnknownPartitioning(numPartitions: Int) extends Partitioning + /** * Represents a partitioning where rows are distributed evenly across output partitions * by starting from a random target partition number and distributing rows in a round-robin * fashion. This partitioning is used when implementing the DataFrame.repartition() operator. 
*/ -case class RoundRobinPartitioning(numPartitions: Int) extends Partitioning { - override def satisfies(required: Distribution): Boolean = required match { - case UnspecifiedDistribution => true - case _ => false - } - - override def compatibleWith(other: Partitioning): Boolean = false - - override def guarantees(other: Partitioning): Boolean = false -} +case class RoundRobinPartitioning(numPartitions: Int) extends Partitioning case object SinglePartition extends Partitioning { val numPartitions = 1 override def satisfies(required: Distribution): Boolean = required match { case _: BroadcastDistribution => false - case ClusteredDistribution(_, desiredPartitions) => desiredPartitions.forall(_ == 1) + case ClusteredDistribution(_, Some(requiredNumPartitions)) => requiredNumPartitions == 1 case _ => true } - - override def compatibleWith(other: Partitioning): Boolean = other.numPartitions == 1 - - override def guarantees(other: Partitioning): Boolean = other.numPartitions == 1 } /** @@ -244,22 +205,19 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int) override def nullable: Boolean = false override def dataType: DataType = IntegerType - override def satisfies(required: Distribution): Boolean = required match { - case UnspecifiedDistribution => true - case ClusteredDistribution(requiredClustering, desiredPartitions) => - expressions.forall(x => requiredClustering.exists(_.semanticEquals(x))) && - desiredPartitions.forall(_ == numPartitions) // if desiredPartitions = None, returns true - case _ => false - } - - override def compatibleWith(other: Partitioning): Boolean = other match { - case o: HashPartitioning => this.semanticEquals(o) - case _ => false - } - - override def guarantees(other: Partitioning): Boolean = other match { - case o: HashPartitioning => this.semanticEquals(o) - case _ => false + override def satisfies(required: Distribution): Boolean = { + super.satisfies(required) || { + required match { + case h: HashClusteredDistribution => + expressions.length == h.expressions.length && expressions.zip(h.expressions).forall { + case (l, r) => l.semanticEquals(r) + } + case ClusteredDistribution(requiredClustering, requiredNumPartitions) => + expressions.forall(x => requiredClustering.exists(_.semanticEquals(x))) && + (requiredNumPartitions.isEmpty || requiredNumPartitions.get == numPartitions) + case _ => false + } + } } /** @@ -288,25 +246,18 @@ case class RangePartitioning(ordering: Seq[SortOrder], numPartitions: Int) override def nullable: Boolean = false override def dataType: DataType = IntegerType - override def satisfies(required: Distribution): Boolean = required match { - case UnspecifiedDistribution => true - case OrderedDistribution(requiredOrdering) => - val minSize = Seq(requiredOrdering.size, ordering.size).min - requiredOrdering.take(minSize) == ordering.take(minSize) - case ClusteredDistribution(requiredClustering, desiredPartitions) => - ordering.map(_.child).forall(x => requiredClustering.exists(_.semanticEquals(x))) && - desiredPartitions.forall(_ == numPartitions) // if desiredPartitions = None, returns true - case _ => false - } - - override def compatibleWith(other: Partitioning): Boolean = other match { - case o: RangePartitioning => this.semanticEquals(o) - case _ => false - } - - override def guarantees(other: Partitioning): Boolean = other match { - case o: RangePartitioning => this.semanticEquals(o) - case _ => false + override def satisfies(required: Distribution): Boolean = { + super.satisfies(required) || { + required match 
{ + case OrderedDistribution(requiredOrdering) => + val minSize = Seq(requiredOrdering.size, ordering.size).min + requiredOrdering.take(minSize) == ordering.take(minSize) + case ClusteredDistribution(requiredClustering, requiredNumPartitions) => + ordering.map(_.child).forall(x => requiredClustering.exists(_.semanticEquals(x))) && + (requiredNumPartitions.isEmpty || requiredNumPartitions.get == numPartitions) + case _ => false + } + } } } @@ -347,20 +298,6 @@ case class PartitioningCollection(partitionings: Seq[Partitioning]) override def satisfies(required: Distribution): Boolean = partitionings.exists(_.satisfies(required)) - /** - * Returns true if any `partitioning` of this collection is compatible with - * the given [[Partitioning]]. - */ - override def compatibleWith(other: Partitioning): Boolean = - partitionings.exists(_.compatibleWith(other)) - - /** - * Returns true if any `partitioning` of this collection guarantees - * the given [[Partitioning]]. - */ - override def guarantees(other: Partitioning): Boolean = - partitionings.exists(_.guarantees(other)) - override def toString: String = { partitionings.map(_.toString).mkString("(", " or ", ")") } @@ -377,9 +314,4 @@ case class BroadcastPartitioning(mode: BroadcastMode) extends Partitioning { case BroadcastDistribution(m) if m == mode => true case _ => false } - - override def compatibleWith(other: Partitioning): Boolean = other match { - case BroadcastPartitioning(m) if m == mode => true - case _ => false - } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala deleted file mode 100644 index 5b802ccc637dd..0000000000000 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.catalyst - -import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.expressions.{InterpretedMutableProjection, Literal} -import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, HashPartitioning} - -class PartitioningSuite extends SparkFunSuite { - test("HashPartitioning compatibility should be sensitive to expression ordering (SPARK-9785)") { - val expressions = Seq(Literal(2), Literal(3)) - // Consider two HashPartitionings that have the same _set_ of hash expressions but which are - // created with different orderings of those expressions: - val partitioningA = HashPartitioning(expressions, 100) - val partitioningB = HashPartitioning(expressions.reverse, 100) - // These partitionings are not considered equal: - assert(partitioningA != partitioningB) - // However, they both satisfy the same clustered distribution: - val distribution = ClusteredDistribution(expressions) - assert(partitioningA.satisfies(distribution)) - assert(partitioningB.satisfies(distribution)) - // These partitionings compute different hashcodes for the same input row: - def computeHashCode(partitioning: HashPartitioning): Int = { - val hashExprProj = new InterpretedMutableProjection(partitioning.expressions, Seq.empty) - hashExprProj.apply(InternalRow.empty).hashCode() - } - assert(computeHashCode(partitioningA) != computeHashCode(partitioningB)) - // Thus, these partitionings are incompatible: - assert(!partitioningA.compatibleWith(partitioningB)) - assert(!partitioningB.compatibleWith(partitioningA)) - assert(!partitioningA.guarantees(partitioningB)) - assert(!partitioningB.guarantees(partitioningA)) - - // Just to be sure that we haven't cheated by having these methods always return false, - // check that identical partitionings are still compatible with and guarantee each other: - assert(partitioningA === partitioningA) - assert(partitioningA.guarantees(partitioningA)) - assert(partitioningA.compatibleWith(partitioningA)) - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index 787c1cfbfb3d8..82300efc01632 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -94,7 +94,21 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ /** Specifies how data is partitioned across different nodes in the cluster. */ def outputPartitioning: Partitioning = UnknownPartitioning(0) // TODO: WRONG WIDTH! - /** Specifies any partition requirements on the input data for this operator. */ + /** + * Specifies the data distribution requirements of all the children for this operator. By default + * it's [[UnspecifiedDistribution]] for each child, which means each child can have any + * distribution. + * + * If an operator overwrites this method, and specifies distribution requirements(excluding + * [[UnspecifiedDistribution]] and [[BroadcastDistribution]]) for more than one child, Spark + * guarantees that the outputs of these children will have same number of partitions, so that the + * operator can safely zip partitions of these children's result RDDs. 
Some operators can leverage + * this guarantee to satisfy some interesting requirement, e.g., non-broadcast joins can specify + * HashClusteredDistribution(a,b) for its left child, and specify HashClusteredDistribution(c,d) + * for its right child, then it's guaranteed that left and right child are co-partitioned by + * a,b/c,d, which means tuples of same value are in the partitions of same index, e.g., + * (a=1,b=2) and (c=1,d=2) are both in the second partition of left and right child. + */ def requiredChildDistribution: Seq[Distribution] = Seq.fill(children.size)(UnspecifiedDistribution) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index c8e236be28b42..e3d28388c5470 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -46,23 +46,6 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { if (minNumPostShufflePartitions > 0) Some(minNumPostShufflePartitions) else None } - /** - * Given a required distribution, returns a partitioning that satisfies that distribution. - * @param requiredDistribution The distribution that is required by the operator - * @param numPartitions Used when the distribution doesn't require a specific number of partitions - */ - private def createPartitioning( - requiredDistribution: Distribution, - numPartitions: Int): Partitioning = { - requiredDistribution match { - case AllTuples => SinglePartition - case ClusteredDistribution(clustering, desiredPartitions) => - HashPartitioning(clustering, desiredPartitions.getOrElse(numPartitions)) - case OrderedDistribution(ordering) => RangePartitioning(ordering, numPartitions) - case dist => sys.error(s"Do not know how to satisfy distribution $dist") - } - } - /** * Adds [[ExchangeCoordinator]] to [[ShuffleExchangeExec]]s if adaptive query execution is enabled * and partitioning schemes of these [[ShuffleExchangeExec]]s support [[ExchangeCoordinator]]. @@ -88,8 +71,9 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { // shuffle data when we have more than one children because data generated by // these children may not be partitioned in the same way. // Please see the comment in withCoordinator for more details. - val supportsDistribution = - requiredChildDistributions.forall(_.isInstanceOf[ClusteredDistribution]) + val supportsDistribution = requiredChildDistributions.forall { dist => + dist.isInstanceOf[ClusteredDistribution] || dist.isInstanceOf[HashClusteredDistribution] + } children.length > 1 && supportsDistribution } @@ -142,8 +126,7 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { // // It will be great to introduce a new Partitioning to represent the post-shuffle // partitions when one post-shuffle partition includes multiple pre-shuffle partitions. 
- val targetPartitioning = - createPartitioning(distribution, defaultNumPreShufflePartitions) + val targetPartitioning = distribution.createPartitioning(defaultNumPreShufflePartitions) assert(targetPartitioning.isInstanceOf[HashPartitioning]) ShuffleExchangeExec(targetPartitioning, child, Some(coordinator)) } @@ -162,71 +145,56 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { assert(requiredChildDistributions.length == children.length) assert(requiredChildOrderings.length == children.length) - // Ensure that the operator's children satisfy their output distribution requirements: + // Ensure that the operator's children satisfy their output distribution requirements. children = children.zip(requiredChildDistributions).map { case (child, distribution) if child.outputPartitioning.satisfies(distribution) => child case (child, BroadcastDistribution(mode)) => BroadcastExchangeExec(mode, child) case (child, distribution) => - ShuffleExchangeExec(createPartitioning(distribution, defaultNumPreShufflePartitions), child) + val numPartitions = distribution.requiredNumPartitions + .getOrElse(defaultNumPreShufflePartitions) + ShuffleExchangeExec(distribution.createPartitioning(numPartitions), child) } - // If the operator has multiple children and specifies child output distributions (e.g. join), - // then the children's output partitionings must be compatible: - def requireCompatiblePartitioning(distribution: Distribution): Boolean = distribution match { - case UnspecifiedDistribution => false - case BroadcastDistribution(_) => false + // Get the indexes of children which have specified distribution requirements and need to have + // same number of partitions. + val childrenIndexes = requiredChildDistributions.zipWithIndex.filter { + case (UnspecifiedDistribution, _) => false + case (_: BroadcastDistribution, _) => false case _ => true - } - if (children.length > 1 - && requiredChildDistributions.exists(requireCompatiblePartitioning) - && !Partitioning.allCompatible(children.map(_.outputPartitioning))) { + }.map(_._2) - // First check if the existing partitions of the children all match. This means they are - // partitioned by the same partitioning into the same number of partitions. In that case, - // don't try to make them match `defaultPartitions`, just use the existing partitioning. - val maxChildrenNumPartitions = children.map(_.outputPartitioning.numPartitions).max - val useExistingPartitioning = children.zip(requiredChildDistributions).forall { - case (child, distribution) => - child.outputPartitioning.guarantees( - createPartitioning(distribution, maxChildrenNumPartitions)) + val childrenNumPartitions = + childrenIndexes.map(children(_).outputPartitioning.numPartitions).toSet + + if (childrenNumPartitions.size > 1) { + // Get the number of partitions which is explicitly required by the distributions. + val requiredNumPartitions = { + val numPartitionsSet = childrenIndexes.flatMap { + index => requiredChildDistributions(index).requiredNumPartitions + }.toSet + assert(numPartitionsSet.size <= 1, + s"$operator have incompatible requirements of the number of partitions for its children") + numPartitionsSet.headOption } - children = if (useExistingPartitioning) { - // We do not need to shuffle any child's output. - children - } else { - // We need to shuffle at least one child's output. - // Now, we will determine the number of partitions that will be used by created - // partitioning schemes. 
- val numPartitions = { - // Let's see if we need to shuffle all child's outputs when we use - // maxChildrenNumPartitions. - val shufflesAllChildren = children.zip(requiredChildDistributions).forall { - case (child, distribution) => - !child.outputPartitioning.guarantees( - createPartitioning(distribution, maxChildrenNumPartitions)) - } - // If we need to shuffle all children, we use defaultNumPreShufflePartitions as the - // number of partitions. Otherwise, we use maxChildrenNumPartitions. - if (shufflesAllChildren) defaultNumPreShufflePartitions else maxChildrenNumPartitions - } + val targetNumPartitions = requiredNumPartitions.getOrElse(childrenNumPartitions.max) - children.zip(requiredChildDistributions).map { - case (child, distribution) => - val targetPartitioning = createPartitioning(distribution, numPartitions) - if (child.outputPartitioning.guarantees(targetPartitioning)) { - child - } else { - child match { - // If child is an exchange, we replace it with - // a new one having targetPartitioning. - case ShuffleExchangeExec(_, c, _) => ShuffleExchangeExec(targetPartitioning, c) - case _ => ShuffleExchangeExec(targetPartitioning, child) - } + children = children.zip(requiredChildDistributions).zipWithIndex.map { + case ((child, distribution), index) if childrenIndexes.contains(index) => + if (child.outputPartitioning.numPartitions == targetNumPartitions) { + child + } else { + val defaultPartitioning = distribution.createPartitioning(targetNumPartitions) + child match { + // If child is an exchange, we replace it with a new one having defaultPartitioning. + case ShuffleExchangeExec(_, c, _) => ShuffleExchangeExec(defaultPartitioning, c) + case _ => ShuffleExchangeExec(defaultPartitioning, child) + } } - } + + case ((child, _), _) => child } } @@ -324,10 +292,10 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { } def apply(plan: SparkPlan): SparkPlan = plan.transformUp { - case operator @ ShuffleExchangeExec(partitioning, child, _) => - child.children match { - case ShuffleExchangeExec(childPartitioning, baseChild, _)::Nil => - if (childPartitioning.guarantees(partitioning)) child else operator + // TODO: remove this after we create a physical operator for `RepartitionByExpression`. 
+ case operator @ ShuffleExchangeExec(upper: HashPartitioning, child, _) => + child.outputPartitioning match { + case lower: HashPartitioning if upper.semanticEquals(lower) => child case _ => operator } case operator: SparkPlan => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala index 66e8031bb5191..897a4dae39f32 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala @@ -46,7 +46,7 @@ case class ShuffledHashJoinExec( "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe")) override def requiredChildDistribution: Seq[Distribution] = - ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil + HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala index 94405410cce90..2de2f30eb05d3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala @@ -78,7 +78,7 @@ case class SortMergeJoinExec( } override def requiredChildDistribution: Seq[Distribution] = - ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil + HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil override def outputOrdering: Seq[SortOrder] = joinType match { // For inner join, orders of both sides keys should be kept. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala index d1bd8a7076863..03d1bbf2ab882 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala @@ -456,7 +456,7 @@ case class CoGroupExec( right: SparkPlan) extends BinaryExecNode with ObjectProducerExec { override def requiredChildDistribution: Seq[Distribution] = - ClusteredDistribution(leftGroup) :: ClusteredDistribution(rightGroup) :: Nil + HashClusteredDistribution(leftGroup) :: HashClusteredDistribution(rightGroup) :: Nil override def requiredChildOrdering: Seq[Seq[SortOrder]] = leftGroup.map(SortOrder(_, Ascending)) :: rightGroup.map(SortOrder(_, Ascending)) :: Nil diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index b50642d275ba8..f8b26f5b28cc7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -260,11 +260,16 @@ class PlannerSuite extends SharedSQLContext { // do they satisfy the distribution requirements? As a result, we need at least four test cases. 
private def assertDistributionRequirementsAreSatisfied(outputPlan: SparkPlan): Unit = { - if (outputPlan.children.length > 1 - && outputPlan.requiredChildDistribution.toSet != Set(UnspecifiedDistribution)) { - val childPartitionings = outputPlan.children.map(_.outputPartitioning) - if (!Partitioning.allCompatible(childPartitionings)) { - fail(s"Partitionings are not compatible: $childPartitionings") + if (outputPlan.children.length > 1) { + val childPartitionings = outputPlan.children.zip(outputPlan.requiredChildDistribution) + .filter { + case (_, UnspecifiedDistribution) => false + case (_, _: BroadcastDistribution) => false + case _ => true + }.map(_._1.outputPartitioning) + + if (childPartitionings.map(_.numPartitions).toSet.size > 1) { + fail(s"Partitionings doesn't have same number of partitions: $childPartitionings") } } outputPlan.children.zip(outputPlan.requiredChildDistribution).foreach { @@ -274,40 +279,7 @@ class PlannerSuite extends SharedSQLContext { } } - test("EnsureRequirements with incompatible child partitionings which satisfy distribution") { - // Consider an operator that requires inputs that are clustered by two expressions (e.g. - // sort merge join where there are multiple columns in the equi-join condition) - val clusteringA = Literal(1) :: Nil - val clusteringB = Literal(2) :: Nil - val distribution = ClusteredDistribution(clusteringA ++ clusteringB) - // Say that the left and right inputs are each partitioned by _one_ of the two join columns: - val leftPartitioning = HashPartitioning(clusteringA, 1) - val rightPartitioning = HashPartitioning(clusteringB, 1) - // Individually, each input's partitioning satisfies the clustering distribution: - assert(leftPartitioning.satisfies(distribution)) - assert(rightPartitioning.satisfies(distribution)) - // However, these partitionings are not compatible with each other, so we still need to - // repartition both inputs prior to performing the join: - assert(!leftPartitioning.compatibleWith(rightPartitioning)) - assert(!rightPartitioning.compatibleWith(leftPartitioning)) - val inputPlan = DummySparkPlan( - children = Seq( - DummySparkPlan(outputPartitioning = leftPartitioning), - DummySparkPlan(outputPartitioning = rightPartitioning) - ), - requiredChildDistribution = Seq(distribution, distribution), - requiredChildOrdering = Seq(Seq.empty, Seq.empty) - ) - val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) - assertDistributionRequirementsAreSatisfied(outputPlan) - if (outputPlan.collect { case e: ShuffleExchangeExec => true }.isEmpty) { - fail(s"Exchange should have been added:\n$outputPlan") - } - } - test("EnsureRequirements with child partitionings with different numbers of output partitions") { - // This is similar to the previous test, except it checks that partitionings are not compatible - // unless they produce the same number of partitions. 
val clustering = Literal(1) :: Nil val distribution = ClusteredDistribution(clustering) val inputPlan = DummySparkPlan( @@ -386,18 +358,15 @@ class PlannerSuite extends SharedSQLContext { } } - test("EnsureRequirements eliminates Exchange if child has Exchange with same partitioning") { + test("EnsureRequirements eliminates Exchange if child has same partitioning") { val distribution = ClusteredDistribution(Literal(1) :: Nil) - val finalPartitioning = HashPartitioning(Literal(1) :: Nil, 5) - val childPartitioning = HashPartitioning(Literal(2) :: Nil, 5) - assert(!childPartitioning.satisfies(distribution)) - val inputPlan = ShuffleExchangeExec(finalPartitioning, - DummySparkPlan( - children = DummySparkPlan(outputPartitioning = childPartitioning) :: Nil, - requiredChildDistribution = Seq(distribution), - requiredChildOrdering = Seq(Seq.empty)), - None) + val partitioning = HashPartitioning(Literal(1) :: Nil, 5) + assert(partitioning.satisfies(distribution)) + val inputPlan = ShuffleExchangeExec( + partitioning, + DummySparkPlan(outputPartitioning = partitioning), + None) val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) assertDistributionRequirementsAreSatisfied(outputPlan) if (outputPlan.collect { case e: ShuffleExchangeExec => true }.size == 2) { @@ -407,17 +376,13 @@ class PlannerSuite extends SharedSQLContext { test("EnsureRequirements does not eliminate Exchange with different partitioning") { val distribution = ClusteredDistribution(Literal(1) :: Nil) - // Number of partitions differ - val finalPartitioning = HashPartitioning(Literal(1) :: Nil, 8) - val childPartitioning = HashPartitioning(Literal(2) :: Nil, 5) - assert(!childPartitioning.satisfies(distribution)) - val inputPlan = ShuffleExchangeExec(finalPartitioning, - DummySparkPlan( - children = DummySparkPlan(outputPartitioning = childPartitioning) :: Nil, - requiredChildDistribution = Seq(distribution), - requiredChildOrdering = Seq(Seq.empty)), - None) + val partitioning = HashPartitioning(Literal(2) :: Nil, 5) + assert(!partitioning.satisfies(distribution)) + val inputPlan = ShuffleExchangeExec( + partitioning, + DummySparkPlan(outputPartitioning = partitioning), + None) val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) assertDistributionRequirementsAreSatisfied(outputPlan) if (outputPlan.collect { case e: ShuffleExchangeExec => true }.size == 1) { From 06fd842e3a120fde1c137e4945bcb747fc71a322 Mon Sep 17 00:00:00 2001 From: Xianjin YE Date: Mon, 8 Jan 2018 23:49:07 +0800 Subject: [PATCH 040/566] [SPARK-22952][CORE] Deprecate stageAttemptId in favour of stageAttemptNumber ## What changes were proposed in this pull request? 1. Deprecate attemptId in StageInfo and add `def attemptNumber() = attemptId` 2. Replace usage of stageAttemptId with stageAttemptNumber ## How was this patch tested? I manually checked the compiler warning info Author: Xianjin YE Closes #20178 from advancedxy/SPARK-22952. 
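To make the deprecation pattern described above concrete, here is a minimal, self-contained sketch. The class `StageInfoLike` and the surrounding object are hypothetical stand-ins (not Spark's actual `StageInfo`); they only illustrate keeping the old `attemptId` field for compatibility while exposing `attemptNumber()` as the preferred accessor.

```scala
object StageAttemptNumberExample {
  // Hypothetical stand-in for StageInfo: the old constructor val stays readable for
  // compatibility, but it is marked deprecated so new code moves to attemptNumber().
  class StageInfoLike(
      val stageId: Int,
      @deprecated("Use attemptNumber instead", "2.3.0") val attemptId: Int) {
    def attemptNumber(): Int = attemptId
  }

  def main(args: Array[String]): Unit = {
    val info = new StageInfoLike(3, 1)
    // Prefer the new accessor; reading info.attemptId still compiles but emits
    // a deprecation warning at compile time.
    println(s"stage ${info.stageId}, attempt ${info.attemptNumber()}")
  }
}
```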
(cherry picked from commit 40b983c3b44b6771f07302ce87987fa4716b5ebf) Signed-off-by: Wenchen Fan --- .../apache/spark/scheduler/DAGScheduler.scala | 15 +++--- .../apache/spark/scheduler/StageInfo.scala | 4 +- .../spark/scheduler/StatsReportListener.scala | 2 +- .../spark/status/AppStatusListener.scala | 7 +-- .../org/apache/spark/status/LiveEntity.scala | 4 +- .../spark/ui/scope/RDDOperationGraph.scala | 2 +- .../org/apache/spark/util/JsonProtocol.scala | 2 +- .../spark/status/AppStatusListenerSuite.scala | 54 ++++++++++--------- .../execution/ui/SQLAppStatusListener.scala | 2 +- 9 files changed, 51 insertions(+), 41 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index c2498d4808e91..199937b8c27af 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -815,7 +815,8 @@ class DAGScheduler( private[scheduler] def handleBeginEvent(task: Task[_], taskInfo: TaskInfo) { // Note that there is a chance that this task is launched after the stage is cancelled. // In that case, we wouldn't have the stage anymore in stageIdToStage. - val stageAttemptId = stageIdToStage.get(task.stageId).map(_.latestInfo.attemptId).getOrElse(-1) + val stageAttemptId = + stageIdToStage.get(task.stageId).map(_.latestInfo.attemptNumber).getOrElse(-1) listenerBus.post(SparkListenerTaskStart(task.stageId, stageAttemptId, taskInfo)) } @@ -1050,7 +1051,7 @@ class DAGScheduler( val locs = taskIdToLocations(id) val part = stage.rdd.partitions(id) stage.pendingPartitions += id - new ShuffleMapTask(stage.id, stage.latestInfo.attemptId, + new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber, taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId), Option(sc.applicationId), sc.applicationAttemptId) } @@ -1060,7 +1061,7 @@ class DAGScheduler( val p: Int = stage.partitions(id) val part = stage.rdd.partitions(p) val locs = taskIdToLocations(id) - new ResultTask(stage.id, stage.latestInfo.attemptId, + new ResultTask(stage.id, stage.latestInfo.attemptNumber, taskBinary, part, locs, id, properties, serializedTaskMetrics, Option(jobId), Option(sc.applicationId), sc.applicationAttemptId) } @@ -1076,7 +1077,7 @@ class DAGScheduler( logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " + s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})") taskScheduler.submitTasks(new TaskSet( - tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties)) + tasks.toArray, stage.id, stage.latestInfo.attemptNumber, jobId, properties)) } else { // Because we posted SparkListenerStageSubmitted earlier, we should mark // the stage as completed here in case there are no tasks to run @@ -1245,7 +1246,7 @@ class DAGScheduler( val status = event.result.asInstanceOf[MapStatus] val execId = status.location.executorId logDebug("ShuffleMapTask finished on " + execId) - if (stageIdToStage(task.stageId).latestInfo.attemptId == task.stageAttemptId) { + if (stageIdToStage(task.stageId).latestInfo.attemptNumber == task.stageAttemptId) { // This task was for the currently running attempt of the stage. 
Since the task // completed successfully from the perspective of the TaskSetManager, mark it as // no longer pending (the TaskSetManager may consider the task complete even @@ -1324,10 +1325,10 @@ class DAGScheduler( val failedStage = stageIdToStage(task.stageId) val mapStage = shuffleIdToMapStage(shuffleId) - if (failedStage.latestInfo.attemptId != task.stageAttemptId) { + if (failedStage.latestInfo.attemptNumber != task.stageAttemptId) { logInfo(s"Ignoring fetch failure from $task as it's from $failedStage attempt" + s" ${task.stageAttemptId} and there is a more recent attempt for that stage " + - s"(attempt ID ${failedStage.latestInfo.attemptId}) running") + s"(attempt ${failedStage.latestInfo.attemptNumber}) running") } else { // It is likely that we receive multiple FetchFailed for a single stage (because we have // multiple tasks running concurrently on different executors). In that case, it is diff --git a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala index c513ed36d1680..903e25b7986f2 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala @@ -30,7 +30,7 @@ import org.apache.spark.storage.RDDInfo @DeveloperApi class StageInfo( val stageId: Int, - val attemptId: Int, + @deprecated("Use attemptNumber instead", "2.3.0") val attemptId: Int, val name: String, val numTasks: Int, val rddInfos: Seq[RDDInfo], @@ -56,6 +56,8 @@ class StageInfo( completionTime = Some(System.currentTimeMillis) } + def attemptNumber(): Int = attemptId + private[spark] def getStatusString: String = { if (completionTime.isDefined) { if (failureReason.isDefined) { diff --git a/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala b/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala index 3c8cab7504c17..3c7af4f6146fa 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala @@ -79,7 +79,7 @@ class StatsReportListener extends SparkListener with Logging { x => info.completionTime.getOrElse(System.currentTimeMillis()) - x ).getOrElse("-") - s"Stage(${info.stageId}, ${info.attemptId}); Name: '${info.name}'; " + + s"Stage(${info.stageId}, ${info.attemptNumber}); Name: '${info.name}'; " + s"Status: ${info.getStatusString}$failureReason; numTasks: ${info.numTasks}; " + s"Took: $timeTaken msec" } diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index 487a782e865e8..88b75ddd5993a 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -529,7 +529,8 @@ private[spark] class AppStatusListener( } override def onStageCompleted(event: SparkListenerStageCompleted): Unit = { - val maybeStage = Option(liveStages.remove((event.stageInfo.stageId, event.stageInfo.attemptId))) + val maybeStage = + Option(liveStages.remove((event.stageInfo.stageId, event.stageInfo.attemptNumber))) maybeStage.foreach { stage => val now = System.nanoTime() stage.info = event.stageInfo @@ -785,7 +786,7 @@ private[spark] class AppStatusListener( } private def getOrCreateStage(info: StageInfo): LiveStage = { - val stage = liveStages.computeIfAbsent((info.stageId, info.attemptId), + val stage = liveStages.computeIfAbsent((info.stageId, 
info.attemptNumber), new Function[(Int, Int), LiveStage]() { override def apply(key: (Int, Int)): LiveStage = new LiveStage() }) @@ -912,7 +913,7 @@ private[spark] class AppStatusListener( private def cleanupTasks(stage: LiveStage): Unit = { val countToDelete = calculateNumberToRemove(stage.savedTasks.get(), maxTasksPerStage).toInt if (countToDelete > 0) { - val stageKey = Array(stage.info.stageId, stage.info.attemptId) + val stageKey = Array(stage.info.stageId, stage.info.attemptNumber) val view = kvstore.view(classOf[TaskDataWrapper]).index("stage").first(stageKey) .last(stageKey) diff --git a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala index 52e83f250d34e..305c2fafa6aac 100644 --- a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala +++ b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala @@ -412,14 +412,14 @@ private class LiveStage extends LiveEntity { def executorSummary(executorId: String): LiveExecutorStageSummary = { executorSummaries.getOrElseUpdate(executorId, - new LiveExecutorStageSummary(info.stageId, info.attemptId, executorId)) + new LiveExecutorStageSummary(info.stageId, info.attemptNumber, executorId)) } def toApi(): v1.StageData = { new v1.StageData( status, info.stageId, - info.attemptId, + info.attemptNumber, info.numTasks, activeTasks, diff --git a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala index 827a8637b9bd2..948858224d724 100644 --- a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala +++ b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala @@ -116,7 +116,7 @@ private[spark] object RDDOperationGraph extends Logging { // Use a special prefix here to differentiate this cluster from other operation clusters val stageClusterId = STAGE_CLUSTER_PREFIX + stage.stageId val stageClusterName = s"Stage ${stage.stageId}" + - { if (stage.attemptId == 0) "" else s" (attempt ${stage.attemptId})" } + { if (stage.attemptNumber == 0) "" else s" (attempt ${stage.attemptNumber})" } val rootCluster = new RDDOperationCluster(stageClusterId, stageClusterName) var rootNodeCount = 0 diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index 5e60218c5740b..ff83301d631c4 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -263,7 +263,7 @@ private[spark] object JsonProtocol { val completionTime = stageInfo.completionTime.map(JInt(_)).getOrElse(JNothing) val failureReason = stageInfo.failureReason.map(JString(_)).getOrElse(JNothing) ("Stage ID" -> stageInfo.stageId) ~ - ("Stage Attempt ID" -> stageInfo.attemptId) ~ + ("Stage Attempt ID" -> stageInfo.attemptNumber) ~ ("Stage Name" -> stageInfo.name) ~ ("Number of Tasks" -> stageInfo.numTasks) ~ ("RDD Info" -> rddInfo) ~ diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index 997c7de8dd02b..b8c84e24c2c3f 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -195,7 +195,9 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val s1Tasks = createTasks(4, execIds) s1Tasks.foreach { task => - 
listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, + stages.head.attemptNumber, + task)) } assert(store.count(classOf[TaskDataWrapper]) === s1Tasks.size) @@ -213,10 +215,11 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { check[TaskDataWrapper](task.taskId) { wrapper => assert(wrapper.info.taskId === task.taskId) assert(wrapper.stageId === stages.head.stageId) - assert(wrapper.stageAttemptId === stages.head.attemptId) - assert(Arrays.equals(wrapper.stage, Array(stages.head.stageId, stages.head.attemptId))) + assert(wrapper.stageAttemptId === stages.head.attemptNumber) + assert(Arrays.equals(wrapper.stage, Array(stages.head.stageId, stages.head.attemptNumber))) - val runtime = Array[AnyRef](stages.head.stageId: JInteger, stages.head.attemptId: JInteger, + val runtime = Array[AnyRef](stages.head.stageId: JInteger, + stages.head.attemptNumber: JInteger, -1L: JLong) assert(Arrays.equals(wrapper.runtime, runtime)) @@ -237,7 +240,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { Some(1L), None, true, false, None) listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate( task.executorId, - Seq((task.taskId, stages.head.stageId, stages.head.attemptId, Seq(accum))))) + Seq((task.taskId, stages.head.stageId, stages.head.attemptNumber, Seq(accum))))) } check[StageDataWrapper](key(stages.head)) { stage => @@ -254,12 +257,12 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // Fail one of the tasks, re-start it. time += 1 s1Tasks.head.markFinished(TaskState.FAILED, time) - listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptNumber, "taskType", TaskResultLost, s1Tasks.head, null)) time += 1 val reattempt = newAttempt(s1Tasks.head, nextTaskId()) - listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptId, + listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptNumber, reattempt)) assert(store.count(classOf[TaskDataWrapper]) === s1Tasks.size + 1) @@ -289,7 +292,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val killed = s1Tasks.drop(1).head killed.finishTime = time killed.failed = true - listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptNumber, "taskType", TaskKilled("killed"), killed, null)) check[JobDataWrapper](1) { job => @@ -311,13 +314,13 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { time += 1 val denied = newAttempt(killed, nextTaskId()) val denyReason = TaskCommitDenied(1, 1, 1) - listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptId, + listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptNumber, denied)) time += 1 denied.finishTime = time denied.failed = true - listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptNumber, "taskType", denyReason, denied, null)) check[JobDataWrapper](1) { job => @@ -337,7 +340,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // Start a new attempt. 
val reattempt2 = newAttempt(denied, nextTaskId()) - listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptId, + listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptNumber, reattempt2)) // Succeed all tasks in stage 1. @@ -350,7 +353,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { time += 1 pending.foreach { task => task.markFinished(TaskState.FINISHED, time) - listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptNumber, "taskType", Success, task, s1Metrics)) } @@ -414,13 +417,15 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { time += 1 val s2Tasks = createTasks(4, execIds) s2Tasks.foreach { task => - listener.onTaskStart(SparkListenerTaskStart(stages.last.stageId, stages.last.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(stages.last.stageId, + stages.last.attemptNumber, + task)) } time += 1 s2Tasks.foreach { task => task.markFinished(TaskState.FAILED, time) - listener.onTaskEnd(SparkListenerTaskEnd(stages.last.stageId, stages.last.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(stages.last.stageId, stages.last.attemptNumber, "taskType", TaskResultLost, task, null)) } @@ -455,7 +460,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // - Re-submit stage 2, all tasks, and succeed them and the stage. val oldS2 = stages.last - val newS2 = new StageInfo(oldS2.stageId, oldS2.attemptId + 1, oldS2.name, oldS2.numTasks, + val newS2 = new StageInfo(oldS2.stageId, oldS2.attemptNumber + 1, oldS2.name, oldS2.numTasks, oldS2.rddInfos, oldS2.parentIds, oldS2.details, oldS2.taskMetrics) time += 1 @@ -466,14 +471,14 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val newS2Tasks = createTasks(4, execIds) newS2Tasks.foreach { task => - listener.onTaskStart(SparkListenerTaskStart(newS2.stageId, newS2.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(newS2.stageId, newS2.attemptNumber, task)) } time += 1 newS2Tasks.foreach { task => task.markFinished(TaskState.FINISHED, time) - listener.onTaskEnd(SparkListenerTaskEnd(newS2.stageId, newS2.attemptId, "taskType", Success, - task, null)) + listener.onTaskEnd(SparkListenerTaskEnd(newS2.stageId, newS2.attemptNumber, "taskType", + Success, task, null)) } time += 1 @@ -522,14 +527,15 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val j2s2Tasks = createTasks(4, execIds) j2s2Tasks.foreach { task => - listener.onTaskStart(SparkListenerTaskStart(j2Stages.last.stageId, j2Stages.last.attemptId, + listener.onTaskStart(SparkListenerTaskStart(j2Stages.last.stageId, + j2Stages.last.attemptNumber, task)) } time += 1 j2s2Tasks.foreach { task => task.markFinished(TaskState.FINISHED, time) - listener.onTaskEnd(SparkListenerTaskEnd(j2Stages.last.stageId, j2Stages.last.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(j2Stages.last.stageId, j2Stages.last.attemptNumber, "taskType", Success, task, null)) } @@ -919,13 +925,13 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { time += 1 val tasks = createTasks(2, Array("1")) tasks.foreach { task => - listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptNumber, task)) } assert(store.count(classOf[TaskDataWrapper]) === 2) // Start a 3rd task. 
The finished tasks should be deleted. createTasks(1, Array("1")).foreach { task => - listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptNumber, task)) } assert(store.count(classOf[TaskDataWrapper]) === 2) intercept[NoSuchElementException] { @@ -934,7 +940,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // Start a 4th task. The first task should be deleted, even if it's still running. createTasks(1, Array("1")).foreach { task => - listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptNumber, task)) } assert(store.count(classOf[TaskDataWrapper]) === 2) intercept[NoSuchElementException] { @@ -960,7 +966,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } } - private def key(stage: StageInfo): Array[Int] = Array(stage.stageId, stage.attemptId) + private def key(stage: StageInfo): Array[Int] = Array(stage.stageId, stage.attemptNumber) private def check[T: ClassTag](key: Any)(fn: T => Unit): Unit = { val value = store.read(classTag[T].runtimeClass, key).asInstanceOf[T] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index d8adbe7bee13e..73a105266e1c1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -99,7 +99,7 @@ class SQLAppStatusListener( // Reset the metrics tracking object for the new attempt. Option(stageMetrics.get(event.stageInfo.stageId)).foreach { metrics => metrics.taskMetrics.clear() - metrics.attemptId = event.stageInfo.attemptId + metrics.attemptId = event.stageInfo.attemptNumber } } From eecd83cb2d24907aba303095b052997471247500 Mon Sep 17 00:00:00 2001 From: foxish Date: Mon, 8 Jan 2018 13:01:45 -0800 Subject: [PATCH 041/566] [SPARK-22992][K8S] Remove assumption of the DNS domain ## What changes were proposed in this pull request? Remove the use of FQDN to access the driver because it assumes that it's set up in a DNS zone - `cluster.local` which is common but not ubiquitous Note that we already access the in-cluster API server through `kubernetes.default.svc`, so, by extension, this should work as well. The alternative is to introduce DNS zones for both of those addresses. ## How was this patch tested? Unit tests cc vanzin liyinan926 mridulm mccheah Author: foxish Closes #20187 from foxish/cluster.local. 
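As a rough illustration of the hostname change (a hypothetical helper, not the actual `DriverServiceBootstrapStep` code; the service and namespace names below are made up), the driver host is now derived as `<service>.<namespace>.svc` rather than hard-coding the `cluster.local` DNS zone.

```scala
object DriverHostnameExample {
  // Hypothetical helper mirroring the change: no ".cluster.local" suffix is assumed,
  // matching how the in-cluster API server is already reached via kubernetes.default.svc.
  def driverHostname(serviceName: String, namespace: String): String =
    s"$serviceName.$namespace.svc"

  def main(args: Array[String]): Unit = {
    // Before this patch the equivalent host would have ended in ".svc.cluster.local".
    println(driverHostname("my-app-driver-svc", "my-namespace"))
    // prints: my-app-driver-svc.my-namespace.svc
  }
}
```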
(cherry picked from commit eed82a0b211352215316ec70dc48aefc013ad0b2) Signed-off-by: Marcelo Vanzin --- .../deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala | 2 +- .../k8s/submit/steps/DriverServiceBootstrapStepSuite.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala index eb594e4f16ec0..34af7cde6c1a9 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala @@ -83,7 +83,7 @@ private[spark] class DriverServiceBootstrapStep( .build() val namespace = sparkConf.get(KUBERNETES_NAMESPACE) - val driverHostname = s"${driverService.getMetadata.getName}.$namespace.svc.cluster.local" + val driverHostname = s"${driverService.getMetadata.getName}.$namespace.svc" val resolvedSparkConf = driverSpec.driverSparkConf.clone() .set(DRIVER_HOST_KEY, driverHostname) .set("spark.driver.port", driverPort.toString) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala index 006ce2668f8a0..78c8c3ba1afbd 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala @@ -85,7 +85,7 @@ class DriverServiceBootstrapStepSuite extends SparkFunSuite with BeforeAndAfter val resolvedDriverSpec = configurationStep.configureDriver(baseDriverSpec) val expectedServiceName = SHORT_RESOURCE_NAME_PREFIX + DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX - val expectedHostName = s"$expectedServiceName.my-namespace.svc.cluster.local" + val expectedHostName = s"$expectedServiceName.my-namespace.svc" verifySparkConfHostNames(resolvedDriverSpec.driverSparkConf, expectedHostName) } @@ -120,7 +120,7 @@ class DriverServiceBootstrapStepSuite extends SparkFunSuite with BeforeAndAfter val driverService = resolvedDriverSpec.otherKubernetesResources.head.asInstanceOf[Service] val expectedServiceName = s"spark-10000${DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX}" assert(driverService.getMetadata.getName === expectedServiceName) - val expectedHostName = s"$expectedServiceName.my-namespace.svc.cluster.local" + val expectedHostName = s"$expectedServiceName.my-namespace.svc" verifySparkConfHostNames(resolvedDriverSpec.driverSparkConf, expectedHostName) } From 8032cf852fccd0ab8754f633affdc9ba8fc99e58 Mon Sep 17 00:00:00 2001 From: xubo245 <601450868@qq.com> Date: Tue, 9 Jan 2018 10:15:01 +0800 Subject: [PATCH 042/566] [SPARK-22972] Couldn't find corresponding Hive SerDe for data source provider org.apache.spark.sql.hive.orc ## What changes were proposed in this pull request? Fix the warning: Couldn't find corresponding Hive SerDe for data source provider org.apache.spark.sql.hive.orc. ## How was this patch tested? 
test("SPARK-22972: hive orc source") assert(HiveSerDe.sourceToSerDe("org.apache.spark.sql.hive.orc") .equals(HiveSerDe.sourceToSerDe("orc"))) Author: xubo245 <601450868@qq.com> Closes #20165 from xubo245/HiveSerDe. (cherry picked from commit 68ce792b5857f0291154f524ac651036db868bb9) Signed-off-by: gatorsmile --- .../apache/spark/sql/internal/HiveSerDe.scala | 1 + .../sql/hive/orc/HiveOrcSourceSuite.scala | 29 +++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala index b9515ec7bca2a..dac463641cfab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala @@ -73,6 +73,7 @@ object HiveSerDe { val key = source.toLowerCase(Locale.ROOT) match { case s if s.startsWith("org.apache.spark.sql.parquet") => "parquet" case s if s.startsWith("org.apache.spark.sql.orc") => "orc" + case s if s.startsWith("org.apache.spark.sql.hive.orc") => "orc" case s if s.equals("orcfile") => "orc" case s if s.equals("parquetfile") => "parquet" case s if s.equals("avrofile") => "avro" diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala index 17b7d8cfe127e..d556a030e2186 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala @@ -20,8 +20,10 @@ package org.apache.spark.sql.hive.orc import java.io.File import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.datasources.orc.OrcSuite import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.HiveSerDe import org.apache.spark.util.Utils class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton { @@ -62,6 +64,33 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton { """.stripMargin) } + test("SPARK-22972: hive orc source") { + val tableName = "normal_orc_as_source_hive" + withTable(tableName) { + sql( + s""" + |CREATE TABLE $tableName + |USING org.apache.spark.sql.hive.orc + |OPTIONS ( + | PATH '${new File(orcTableAsDir.getAbsolutePath).toURI}' + |) + """.stripMargin) + + val tableMetadata = spark.sessionState.catalog.getTableMetadata( + TableIdentifier(tableName)) + assert(tableMetadata.storage.inputFormat == + Option("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")) + assert(tableMetadata.storage.outputFormat == + Option("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")) + assert(tableMetadata.storage.serde == + Option("org.apache.hadoop.hive.ql.io.orc.OrcSerde")) + assert(HiveSerDe.sourceToSerDe("org.apache.spark.sql.hive.orc") + .equals(HiveSerDe.sourceToSerDe("orc"))) + assert(HiveSerDe.sourceToSerDe("org.apache.spark.sql.orc") + .equals(HiveSerDe.sourceToSerDe("orc"))) + } + } + test("SPARK-19459/SPARK-18220: read char/varchar column written by Hive") { val location = Utils.createTempDir() val uri = location.toURI From 850b9f39186665fd727737a98b29abe5236830db Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Tue, 9 Jan 2018 10:44:21 +0800 Subject: [PATCH 043/566] [SPARK-22990][CORE] Fix method isFairScheduler in JobsTab and StagesTab ## What changes were proposed in this pull request? 
In current implementation, the function `isFairScheduler` is always false, since it is comparing String with `SchedulingMode` Author: Wang Gengliang Closes #20186 from gengliangwang/isFairScheduler. (cherry picked from commit 849043ce1d28a976659278d29368da0799329db8) Signed-off-by: Wenchen Fan --- .../src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala | 8 ++++---- .../main/scala/org/apache/spark/ui/jobs/StagesTab.scala | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala index 99eab1b2a27d8..ff1b75e5c5065 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala @@ -34,10 +34,10 @@ private[ui] class JobsTab(parent: SparkUI, store: AppStatusStore) val killEnabled = parent.killEnabled def isFairScheduler: Boolean = { - store.environmentInfo().sparkProperties.toMap - .get("spark.scheduler.mode") - .map { mode => mode == SchedulingMode.FAIR } - .getOrElse(false) + store + .environmentInfo() + .sparkProperties + .contains(("spark.scheduler.mode", SchedulingMode.FAIR.toString)) } def getSparkUser: String = parent.getSparkUser diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala index be05a963f0e68..10b032084ce4f 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala @@ -37,10 +37,10 @@ private[ui] class StagesTab(val parent: SparkUI, val store: AppStatusStore) attachPage(new PoolPage(this)) def isFairScheduler: Boolean = { - store.environmentInfo().sparkProperties.toMap - .get("spark.scheduler.mode") - .map { mode => mode == SchedulingMode.FAIR } - .getOrElse(false) + store + .environmentInfo() + .sparkProperties + .contains(("spark.scheduler.mode", SchedulingMode.FAIR.toString)) } def handleKillRequest(request: HttpServletRequest): Unit = { From fd46a276c04d2a6617c56abe84be80a1be5d7f99 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 9 Jan 2018 11:49:10 +0800 Subject: [PATCH 044/566] [SPARK-22984] Fix incorrect bitmap copying and offset adjustment in GenerateUnsafeRowJoiner ## What changes were proposed in this pull request? This PR fixes a longstanding correctness bug in `GenerateUnsafeRowJoiner`. This class was introduced in https://github.com/apache/spark/pull/7821 (July 2015 / Spark 1.5.0+) and is used to combine pairs of UnsafeRows in TungstenAggregationIterator, CartesianProductExec, and AppendColumns. ### Bugs fixed by this patch 1. **Incorrect combining of null-tracking bitmaps**: when concatenating two UnsafeRows, the implementation "Concatenate the two bitsets together into a single one, taking padding into account". If one row has no columns then it has a bitset size of 0, but the code was incorrectly assuming that if the left row had a non-zero number of fields then the right row would also have at least one field, so it was copying invalid bytes and and treating them as part of the bitset. I'm not sure whether this bug was also present in the original implementation or whether it was introduced in https://github.com/apache/spark/pull/7892 (which fixed another bug in this code). 2. 
**Incorrect updating of data offsets for null variable-length fields**: after updating the bitsets and copying fixed-length and variable-length data, we need to perform adjustments to the offsets pointing the start of variable length fields's data. The existing code was _conditionally_ adding a fixed offset to correct for the new length of the combined row, but it is unsafe to do this if the variable-length field has a null value: we always represent nulls by storing `0` in the fixed-length slot, but this code was incorrectly incrementing those values. This bug was present since the original version of `GenerateUnsafeRowJoiner`. ### Why this bug remained latent for so long The PR which introduced `GenerateUnsafeRowJoiner` features several randomized tests, including tests of the cases where one side of the join has no fields and where string-valued fields are null. However, the existing assertions were too weak to uncover this bug: - If a null field has a non-zero value in its fixed-length data slot then this will not cause problems for field accesses because the null-tracking bitmap should still be correct and we will not try to use the incorrect offset for anything. - If the null tracking bitmap is corrupted by joining against a row with no fields then the corruption occurs in field numbers past the actual field numbers contained in the row. Thus valid `isNullAt()` calls will not read the incorrectly-set bits. The existing `GenerateUnsafeRowJoinerSuite` tests only exercised `.get()` and `isNullAt()`, but didn't actually check the UnsafeRows for bit-for-bit equality, preventing these bugs from failing assertions. It turns out that there was even a [GenerateUnsafeRowJoinerBitsetSuite](https://github.com/apache/spark/blob/03377d2522776267a07b7d6ae9bddf79a4e0f516/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala) but it looks like it also didn't catch this problem because it only tested the bitsets in an end-to-end fashion by accessing them through the `UnsafeRow` interface instead of actually comparing the bitsets' bytes. ### Impact of these bugs - This bug will cause `equals()` and `hashCode()` to be incorrect for these rows, which will be problematic in case`GenerateUnsafeRowJoiner`'s results are used as join or grouping keys. - Chained / repeated invocations of `GenerateUnsafeRowJoiner` may result in reads from invalid null bitmap positions causing fields to incorrectly become NULL (see the end-to-end example below). - It looks like this generally only happens in `CartesianProductExec`, which our query optimizer often avoids executing (usually we try to plan a `BroadcastNestedLoopJoin` instead). ### End-to-end test case demonstrating the problem The following query demonstrates how this bug may result in incorrect query results: ```sql set spark.sql.autoBroadcastJoinThreshold=-1; -- Needed to trigger CartesianProductExec create table a as select * from values 1; create table b as select * from values 2; SELECT t3.col1, t1.col1 FROM a t1 CROSS JOIN b t2 CROSS JOIN b t3 ``` This should return `(2, 1)` but instead was returning `(null, 1)`. Column pruning ends up trimming off all columns from `t2`, so when `t2` joins with another table this triggers the bitmap-copying bug. This incorrect bitmap is subsequently copied again when performing the final join, causing the final output to have an incorrectly-set null bit for the first field. ## How was this patch tested? 
Strengthened the assertions in existing tests in GenerateUnsafeRowJoinerSuite. Also verified that the end-to-end test case which uncovered this now passes. Author: Josh Rosen Closes #20181 from JoshRosen/SPARK-22984-fix-generate-unsaferow-joiner-bitmap-bugs. (cherry picked from commit f20131dd35939734fe16b0005a086aa72400893b) Signed-off-by: Wenchen Fan --- .../codegen/GenerateUnsafeRowJoiner.scala | 52 +++++++++- .../GenerateUnsafeRowJoinerSuite.scala | 95 ++++++++++++++++++- 2 files changed, 138 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala index be5f5a73b5d47..febf7b0c96c2a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala @@ -70,7 +70,7 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U // --------------------- copy bitset from row 1 and row 2 --------------------------- // val copyBitset = Seq.tabulate(outputBitsetWords) { i => - val bits = if (bitset1Remainder > 0) { + val bits = if (bitset1Remainder > 0 && bitset2Words != 0) { if (i < bitset1Words - 1) { s"$getLong(obj1, offset1 + ${i * 8})" } else if (i == bitset1Words - 1) { @@ -152,7 +152,9 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U } else { // Number of bytes to increase for the offset. Note that since in UnsafeRow we store the // offset in the upper 32 bit of the words, we can just shift the offset to the left by - // 32 and increment that amount in place. + // 32 and increment that amount in place. However, we need to handle the important special + // case of a null field, in which case the offset should be zero and should not have a + // shift added to it. val shift = if (i < schema1.size) { s"${(outputBitsetWords - bitset1Words + schema2.size) * 8}L" @@ -160,14 +162,55 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U s"(${(outputBitsetWords - bitset2Words + schema1.size) * 8}L + numBytesVariableRow1)" } val cursor = offset + outputBitsetWords * 8 + i * 8 - s"$putLong(buf, $cursor, $getLong(buf, $cursor) + ($shift << 32));\n" + // UnsafeRow is a little underspecified, so in what follows we'll treat UnsafeRowWriter's + // output as a de-facto specification for the internal layout of data. + // + // Null-valued fields will always have a data offset of 0 because + // UnsafeRowWriter.setNullAt(ordinal) sets the null bit and stores 0 to in field's + // position in the fixed-length section of the row. As a result, we must NOT add + // `shift` to the offset for null fields. + // + // We could perform a null-check here by inspecting the null-tracking bitmap, but doing + // so could be expensive and will add significant bloat to the generated code. Instead, + // we'll rely on the invariant "stored offset == 0 for variable-length data type implies + // that the field's value is null." + // + // To establish that this invariant holds, we'll prove that a non-null field can never + // have a stored offset of 0. There are two cases to consider: + // + // 1. 
The non-null field's data is of non-zero length: reading this field's value + // must read data from the variable-length section of the row, so the stored offset + // will actually be used in address calculation and must be correct. The offsets + // count bytes from the start of the UnsafeRow so these offsets will always be + // non-zero because the storage of the offsets themselves takes up space at the + // start of the row. + // 2. The non-null field's data is of zero length (i.e. its data is empty). In this + // case, we have to worry about the possibility that an arbitrary offset value was + // stored because we never actually read any bytes using this offset and therefore + // would not crash if it was incorrect. The variable-sized data writing paths in + // UnsafeRowWriter unconditionally calls setOffsetAndSize(ordinal, numBytes) with + // no special handling for the case where `numBytes == 0`. Internally, + // setOffsetAndSize computes the offset without taking the size into account. Thus + // the stored offset is the same non-zero offset that would be used if the field's + // dataSize was non-zero (and in (1) above we've shown that case behaves as we + // expect). + // + // Thus it is safe to perform `existingOffset != 0` checks here in the place of + // more expensive null-bit checks. + s""" + |existingOffset = $getLong(buf, $cursor); + |if (existingOffset != 0) { + | $putLong(buf, $cursor, existingOffset + ($shift << 32)); + |} + """.stripMargin } } val updateOffsets = ctx.splitExpressions( expressions = updateOffset, funcName = "copyBitsetFunc", - arguments = ("long", "numBytesVariableRow1") :: Nil) + arguments = ("long", "numBytesVariableRow1") :: Nil, + makeSplitFunction = (s: String) => "long existingOffset;\n" + s) // ------------------------ Finally, put everything together --------------------------- // val codeBody = s""" @@ -200,6 +243,7 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U | $copyFixedLengthRow2 | $copyVariableLengthRow1 | $copyVariableLengthRow2 + | long existingOffset; | $updateOffsets | | out.pointTo(buf, sizeInBytes); diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerSuite.scala index f203f25ad10d4..75c6beeb32150 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerSuite.scala @@ -22,8 +22,10 @@ import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.sql.RandomDataGenerator import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} -import org.apache.spark.sql.catalyst.expressions.UnsafeProjection +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.catalyst.expressions.{JoinedRow, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String /** * Test suite for [[GenerateUnsafeRowJoiner]]. 
@@ -45,6 +47,32 @@ class GenerateUnsafeRowJoinerSuite extends SparkFunSuite { testConcat(64, 64, fixed) } + test("rows with all empty strings") { + val schema = StructType(Seq( + StructField("f1", StringType), StructField("f2", StringType))) + val row: UnsafeRow = UnsafeProjection.create(schema).apply( + InternalRow(UTF8String.EMPTY_UTF8, UTF8String.EMPTY_UTF8)) + testConcat(schema, row, schema, row) + } + + test("rows with all empty int arrays") { + val schema = StructType(Seq( + StructField("f1", ArrayType(IntegerType)), StructField("f2", ArrayType(IntegerType)))) + val emptyIntArray = + ExpressionEncoder[Array[Int]]().resolveAndBind().toRow(Array.emptyIntArray).getArray(0) + val row: UnsafeRow = UnsafeProjection.create(schema).apply( + InternalRow(emptyIntArray, emptyIntArray)) + testConcat(schema, row, schema, row) + } + + test("alternating empty and non-empty strings") { + val schema = StructType(Seq( + StructField("f1", StringType), StructField("f2", StringType))) + val row: UnsafeRow = UnsafeProjection.create(schema).apply( + InternalRow(UTF8String.EMPTY_UTF8, UTF8String.fromString("foo"))) + testConcat(schema, row, schema, row) + } + test("randomized fix width types") { for (i <- 0 until 20) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed) @@ -94,27 +122,84 @@ class GenerateUnsafeRowJoinerSuite extends SparkFunSuite { val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply() val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow]) val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow]) + testConcat(schema1, row1, schema2, row2) + } + + private def testConcat( + schema1: StructType, + row1: UnsafeRow, + schema2: StructType, + row2: UnsafeRow) { // Run the joiner. val mergedSchema = StructType(schema1 ++ schema2) val concater = GenerateUnsafeRowJoiner.create(schema1, schema2) - val output = concater.join(row1, row2) + val output: UnsafeRow = concater.join(row1, row2) + + // We'll also compare to an UnsafeRow produced with JoinedRow + UnsafeProjection. This ensures + // that unused space in the row (e.g. leftover bits in the null-tracking bitmap) is written + // correctly. + val expectedOutput: UnsafeRow = { + val joinedRowProjection = UnsafeProjection.create(mergedSchema) + val joined = new JoinedRow() + joinedRowProjection.apply(joined.apply(row1, row2)) + } // Test everything equals ... 
for (i <- mergedSchema.indices) { + val dataType = mergedSchema(i).dataType if (i < schema1.size) { assert(output.isNullAt(i) === row1.isNullAt(i)) if (!output.isNullAt(i)) { - assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType)) + assert(output.get(i, dataType) === row1.get(i, dataType)) + assert(output.get(i, dataType) === expectedOutput.get(i, dataType)) } } else { assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size)) if (!output.isNullAt(i)) { - assert(output.get(i, mergedSchema(i).dataType) === - row2.get(i - schema1.size, mergedSchema(i).dataType)) + assert(output.get(i, dataType) === row2.get(i - schema1.size, dataType)) + assert(output.get(i, dataType) === expectedOutput.get(i, dataType)) } } } + + + assert( + expectedOutput.getSizeInBytes == output.getSizeInBytes, + "output isn't same size in bytes as slow path") + + // Compare the UnsafeRows byte-by-byte so that we can print more useful debug information in + // case this assertion fails: + val actualBytes = output.getBaseObject.asInstanceOf[Array[Byte]] + .take(output.getSizeInBytes) + val expectedBytes = expectedOutput.getBaseObject.asInstanceOf[Array[Byte]] + .take(expectedOutput.getSizeInBytes) + + val bitsetWidth = UnsafeRow.calculateBitSetWidthInBytes(expectedOutput.numFields()) + val actualBitset = actualBytes.take(bitsetWidth) + val expectedBitset = expectedBytes.take(bitsetWidth) + assert(actualBitset === expectedBitset, "bitsets were not equal") + + val fixedLengthSize = expectedOutput.numFields() * 8 + val actualFixedLength = actualBytes.slice(bitsetWidth, bitsetWidth + fixedLengthSize) + val expectedFixedLength = expectedBytes.slice(bitsetWidth, bitsetWidth + fixedLengthSize) + if (actualFixedLength !== expectedFixedLength) { + actualFixedLength.grouped(8) + .zip(expectedFixedLength.grouped(8)) + .zip(mergedSchema.fields.toIterator) + .foreach { + case ((actual, expected), field) => + assert(actual === expected, s"Fixed length sections are not equal for field $field") + } + fail("Fixed length sections were not equal") + } + + val variableLengthStart = bitsetWidth + fixedLengthSize + val actualVariableLength = actualBytes.drop(variableLengthStart) + val expectedVariableLength = expectedBytes.drop(variableLengthStart) + assert(actualVariableLength === expectedVariableLength, "fixed length sections were not equal") + + assert(output.hashCode() == expectedOutput.hashCode(), "hash codes were not equal") } } From 911a4dbe7da45dbb62ed08b0c99bb97ea4fc9e3e Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Mon, 8 Jan 2018 21:58:26 -0800 Subject: [PATCH 045/566] [SPARK-21292][DOCS] refreshtable example ## What changes were proposed in this pull request? doc update Author: Felix Cheung Closes #20198 from felixcheung/rrefreshdoc. (cherry picked from commit 8486ad419d8f1779e277ec71c39e1516673a83ab) Signed-off-by: Felix Cheung --- docs/sql-programming-guide.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 3ccaaf4d5b1fa..72f79d6909ecc 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -915,6 +915,14 @@ spark.catalog.refreshTable("my_table") +
+<div data-lang="r"  markdown="1"> + +{% highlight r %} +refreshTable("my_table") +{% endhighlight %} + +</div> +
{% highlight sql %} @@ -1498,10 +1506,10 @@ that these options will be deprecated in future release as more optimizations ar ## Broadcast Hint for SQL Queries The `BROADCAST` hint guides Spark to broadcast each specified table when joining them with another table or view. -When Spark deciding the join methods, the broadcast hash join (i.e., BHJ) is preferred, +When Spark deciding the join methods, the broadcast hash join (i.e., BHJ) is preferred, even if the statistics is above the configuration `spark.sql.autoBroadcastJoinThreshold`. When both sides of a join are specified, Spark broadcasts the one having the lower statistics. -Note Spark does not guarantee BHJ is always chosen, since not all cases (e.g. full outer join) +Note Spark does not guarantee BHJ is always chosen, since not all cases (e.g. full outer join) support BHJ. When the broadcast nested loop join is selected, we still respect the hint.
@@ -1780,7 +1788,7 @@ options. Note that, for DecimalType(38,0)*, the table above intentionally does not cover all other combinations of scales and precisions because currently we only infer decimal type like `BigInteger`/`BigInt`. For example, 1.1 is inferred as double type. - In PySpark, now we need Pandas 0.19.2 or upper if you want to use Pandas related functionalities, such as `toPandas`, `createDataFrame` from Pandas DataFrame, etc. - In PySpark, the behavior of timestamp values for Pandas related functionalities was changed to respect session timezone. If you want to use the old behavior, you need to set a configuration `spark.sql.execution.pandas.respectSessionTimeZone` to `False`. See [SPARK-22395](https://issues.apache.org/jira/browse/SPARK-22395) for details. - + - Since Spark 2.3, when either broadcast hash join or broadcast nested loop join is applicable, we prefer to broadcasting the table that is explicitly specified in a broadcast hint. For details, see the section [Broadcast Hint](#broadcast-hint-for-sql-queries) and [SPARK-22489](https://issues.apache.org/jira/browse/SPARK-22489). - Since Spark 2.3, when all inputs are binary, `functions.concat()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.concatBinaryAsString` to `true`. @@ -2167,7 +2175,7 @@ Not all the APIs of the Hive UDF/UDTF/UDAF are supported by Spark SQL. Below are Spark SQL currently does not support the reuse of aggregation. * `getWindowingEvaluator` (`GenericUDAFEvaluator`) is a function to optimize aggregation by evaluating an aggregate over a fixed window. - + ### Incompatible Hive UDF Below are the scenarios in which Hive and Spark generate different results: From a23c07ecb1dba0dbd52a0d2362d8d21e9cdd8b5a Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Mon, 8 Jan 2018 22:08:19 -0800 Subject: [PATCH 046/566] [SPARK-21293][SPARKR][DOCS] structured streaming doc update ## What changes were proposed in this pull request? doc update Author: Felix Cheung Closes #20197 from felixcheung/rwadoc. (cherry picked from commit 02214b094390e913f52e71d55c9bb8a81c9e7ef9) Signed-off-by: Felix Cheung --- R/pkg/vignettes/sparkr-vignettes.Rmd | 2 +- docs/sparkr.md | 2 +- .../structured-streaming-programming-guide.md | 32 +++++++++++++++++-- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 2e662424b25f2..feca617c2554c 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -1042,7 +1042,7 @@ unlink(modelPath) ## Structured Streaming -SparkR supports the Structured Streaming API (experimental). +SparkR supports the Structured Streaming API. You can check the Structured Streaming Programming Guide for [an introduction](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#programming-model) to its programming model and basic concepts. diff --git a/docs/sparkr.md b/docs/sparkr.md index 997ea60fb6cf0..6685b585a393a 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -596,7 +596,7 @@ The following example shows how to save/load a MLlib model by SparkR. # Structured Streaming -SparkR supports the Structured Streaming API (experimental). Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. 
For more information see the R API on the [Structured Streaming Programming Guide](structured-streaming-programming-guide.html) +SparkR supports the Structured Streaming API. Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. For more information see the R API on the [Structured Streaming Programming Guide](structured-streaming-programming-guide.html) # R Function Name Conflicts diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 31fcfabb9cacc..de13e281916db 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -827,8 +827,8 @@ df.isStreaming() {% endhighlight %}
-{% highlight bash %} -Not available. +{% highlight r %} +isStreaming(df) {% endhighlight %}
@@ -885,6 +885,19 @@ windowedCounts = words.groupBy( ).count() {% endhighlight %} + +
+{% highlight r %} +words <- ... # streaming DataFrame of schema { timestamp: Timestamp, word: String } + +# Group the data by window and word and compute the count of each group +windowedCounts <- count( + groupBy( + words, + window(words$timestamp, "10 minutes", "5 minutes"), + words$word)) +{% endhighlight %} +
@@ -959,6 +972,21 @@ windowedCounts = words \ .count() {% endhighlight %} + +
+{% highlight r %} +words <- ... # streaming DataFrame of schema { timestamp: Timestamp, word: String } + +# Group the data by window and word and compute the count of each group + +words <- withWatermark(words, "timestamp", "10 minutes") +windowedCounts <- count( + groupBy( + words, + window(words$timestamp, "10 minutes", "5 minutes"), + words$word)) +{% endhighlight %} +
From e79480e5de0826bd77ff2a76a38059010a55149c Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 9 Jan 2018 16:31:20 +0800 Subject: [PATCH 047/566] [SPARK-23000] Fix Flaky test suite DataSourceWithHiveMetastoreCatalogSuite in Spark 2.3 ## What changes were proposed in this pull request? https://amplab.cs.berkeley.edu/jenkins/job/spark-branch-2.3-test-sbt-hadoop-2.6/ The test suite DataSourceWithHiveMetastoreCatalogSuite of Branch 2.3 always failed in hadoop 2.6 The table `t` exists in `default`, but `runSQLHive` reported the table does not exist. Obviously, Hive client's default database is different. The fix is to clean the environment and use `DEFAULT` as the database. ``` org.apache.spark.sql.execution.QueryExecutionException: FAILED: SemanticException [Error 10001]: Line 1:14 Table not found 't' Stacktrace sbt.ForkMain$ForkError: org.apache.spark.sql.execution.QueryExecutionException: FAILED: SemanticException [Error 10001]: Line 1:14 Table not found 't' at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$runHive$1.apply(HiveClientImpl.scala:699) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$runHive$1.apply(HiveClientImpl.scala:683) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$withHiveState$1.apply(HiveClientImpl.scala:272) at org.apache.spark.sql.hive.client.HiveClientImpl.liftedTree1$1(HiveClientImpl.scala:210) at org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:209) at org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:255) at org.apache.spark.sql.hive.client.HiveClientImpl.runHive(HiveClientImpl.scala:683) at org.apache.spark.sql.hive.client.HiveClientImpl.runSqlHive(HiveClientImpl.scala:673) ``` ## How was this patch tested? N/A Author: gatorsmile Closes #20196 from gatorsmile/testFix. 
(cherry picked from commit 0959aa581a399279be3f94214bcdffc6a1b6d60a) Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/hive/client/HiveClientImpl.scala | 6 +++++- .../apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 7b7f4e0f10210..102f40bacc985 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -823,7 +823,8 @@ private[hive] class HiveClientImpl( } def reset(): Unit = withHiveState { - client.getAllTables("default").asScala.foreach { t => + try { + client.getAllTables("default").asScala.foreach { t => logDebug(s"Deleting table $t") val table = client.getTable("default", t) client.getIndexes("default", t, 255).asScala.foreach { index => @@ -837,6 +838,9 @@ private[hive] class HiveClientImpl( logDebug(s"Dropping Database: $db") client.dropDatabase(db, true, false, true) } + } finally { + runSqlHive("USE default") + } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala index 18137e7ea1d63..cf4ce83124d88 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala @@ -146,6 +146,11 @@ class DataSourceWithHiveMetastoreCatalogSuite 'id cast StringType as 'd2 ).coalesce(1) + override def beforeAll(): Unit = { + super.beforeAll() + sparkSession.metadataHive.reset() + } + Seq( "parquet" -> (( "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", From 47f975b4243e53c1905720d4e007ea18f4822906 Mon Sep 17 00:00:00 2001 From: Yinan Li Date: Tue, 9 Jan 2018 01:32:48 -0800 Subject: [PATCH 048/566] [SPARK-22998][K8S] Set missing value for SPARK_MOUNTED_CLASSPATH in the executors ## What changes were proposed in this pull request? The environment variable `SPARK_MOUNTED_CLASSPATH` is referenced in the executor's Dockerfile, where its value is added to the classpath of the executor. However, the scheduler backend code missed setting it when creating the executor pods. This PR fixes it. ## How was this patch tested? Unit tested. vanzin Can you help take a look? Thanks! foxish Author: Yinan Li Closes #20193 from liyinan926/master. 
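As a sketch of the effect (not the full pod builder), the executor environment now carries the previously missing entry. The directory value below mirrors the default expected in the updated test and is otherwise illustrative.

```scala
// Illustrative only: the jars download dir comes from the JARS_DOWNLOAD_LOCATION
// setting; the executor image appends $SPARK_MOUNTED_CLASSPATH to the executor's
// classpath, which is why the scheduler backend must set it on the pod.
val executorJarsDownloadDir = "/var/spark-data/spark-jars"
val mountedClasspathEnv = "SPARK_MOUNTED_CLASSPATH" -> s"$executorJarsDownloadDir/*"
// yields ("SPARK_MOUNTED_CLASSPATH", "/var/spark-data/spark-jars/*")
```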
(cherry picked from commit 6a4206ff04746481d7c8e307dfd0d31ff1402555) Signed-off-by: Felix Cheung --- .../spark/scheduler/cluster/k8s/ExecutorPodFactory.scala | 5 ++++- .../scheduler/cluster/k8s/ExecutorPodFactorySuite.scala | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala index 066d7e9f70ca5..bcacb3934d36a 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala @@ -94,6 +94,8 @@ private[spark] class ExecutorPodFactory( private val executorCores = sparkConf.getDouble("spark.executor.cores", 1) private val executorLimitCores = sparkConf.get(KUBERNETES_EXECUTOR_LIMIT_CORES) + private val executorJarsDownloadDir = sparkConf.get(JARS_DOWNLOAD_LOCATION) + /** * Configure and construct an executor pod with the given parameters. */ @@ -145,7 +147,8 @@ private[spark] class ExecutorPodFactory( (ENV_EXECUTOR_CORES, math.ceil(executorCores).toInt.toString), (ENV_EXECUTOR_MEMORY, executorMemoryString), (ENV_APPLICATION_ID, applicationId), - (ENV_EXECUTOR_ID, executorId)) ++ executorEnvs) + (ENV_EXECUTOR_ID, executorId), + (ENV_MOUNTED_CLASSPATH, s"$executorJarsDownloadDir/*")) ++ executorEnvs) .map(env => new EnvVarBuilder() .withName(env._1) .withValue(env._2) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala index 884da8aabd880..7cfbe54c95390 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala @@ -197,7 +197,8 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef ENV_EXECUTOR_CORES -> "1", ENV_EXECUTOR_MEMORY -> "1g", ENV_APPLICATION_ID -> "dummy", - ENV_EXECUTOR_POD_IP -> null) ++ additionalEnvVars + ENV_EXECUTOR_POD_IP -> null, + ENV_MOUNTED_CLASSPATH -> "/var/spark-data/spark-jars/*") ++ additionalEnvVars assert(executor.getSpec.getContainers.size() === 1) assert(executor.getSpec.getContainers.get(0).getEnv.size() === defaultEnvs.size) From 60f6b994505e3f82091a04eed2dc0a9e8bd523ce Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 9 Jan 2018 21:48:14 +0800 Subject: [PATCH 049/566] [SPARK-16060][SQL] Support Vectorized ORC Reader ## What changes were proposed in this pull request? This PR adds an ORC columnar-batch reader to native `OrcFileFormat`. Since both Spark `ColumnarBatch` and ORC `RowBatch` are used together, it is faster than the current Spark implementation. This replaces the prior PR, #17924. Also, this PR adds `OrcReadBenchmark` to show the performance improvement. ## How was this patch tested? Pass the existing test cases. Author: Dongjoon Hyun Closes #19943 from dongjoon-hyun/SPARK-16060. 
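As a usage note, a minimal sketch of exercising the new columnar path from a Spark session. The data path is hypothetical; the flag is the `spark.sql.orc.enableVectorizedReader` conf added below (default true), and it only takes effect for the native ORC source when the result schema consists of atomic types (see `supportBatch` in the `OrcFileFormat` change).

```scala
// Assumes an active SparkSession `spark` and a hypothetical ORC path.
spark.conf.set("spark.sql.orc.enableVectorizedReader", "true")

val df = spark.read.format("orc").load("/tmp/orc-benchmark-data")
df.selectExpr("sum(id)").show()   // scans through OrcColumnarBatchReader when eligible
```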
(cherry picked from commit f44ba910f58083458e1133502e193a9d6f2bf766) Signed-off-by: Wenchen Fan --- .../apache/spark/sql/internal/SQLConf.scala | 7 + .../orc/OrcColumnarBatchReader.java | 523 ++++++++++++++++++ .../datasources/orc/OrcFileFormat.scala | 75 ++- .../execution/datasources/orc/OrcUtils.scala | 7 +- .../spark/sql/hive/orc/OrcReadBenchmark.scala | 435 +++++++++++++++ 5 files changed, 1022 insertions(+), 25 deletions(-) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 5c61f10bb71ad..74949db883f7a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -386,6 +386,11 @@ object SQLConf { .checkValues(Set("hive", "native")) .createWithDefault("native") + val ORC_VECTORIZED_READER_ENABLED = buildConf("spark.sql.orc.enableVectorizedReader") + .doc("Enables vectorized orc decoding.") + .booleanConf + .createWithDefault(true) + val ORC_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.orc.filterPushdown") .doc("When true, enable filter pushdown for ORC files.") .booleanConf @@ -1183,6 +1188,8 @@ class SQLConf extends Serializable with Logging { def orcCompressionCodec: String = getConf(ORC_COMPRESSION) + def orcVectorizedReaderEnabled: Boolean = getConf(ORC_VECTORIZED_READER_ENABLED) + def parquetCompressionCodec: String = getConf(PARQUET_COMPRESSION) def parquetVectorizedReaderEnabled: Boolean = getConf(PARQUET_VECTORIZED_READER_ENABLED) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java new file mode 100644 index 0000000000000..5c28d0e6e507a --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java @@ -0,0 +1,523 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.orc; + +import java.io.IOException; +import java.util.stream.IntStream; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.RecordReader; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.orc.OrcConf; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.TypeDescription; +import org.apache.orc.mapred.OrcInputFormat; +import org.apache.orc.storage.common.type.HiveDecimal; +import org.apache.orc.storage.ql.exec.vector.*; +import org.apache.orc.storage.serde2.io.HiveDecimalWritable; + +import org.apache.spark.memory.MemoryMode; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.execution.vectorized.ColumnVectorUtils; +import org.apache.spark.sql.execution.vectorized.OffHeapColumnVector; +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector; +import org.apache.spark.sql.execution.vectorized.WritableColumnVector; +import org.apache.spark.sql.types.*; +import org.apache.spark.sql.vectorized.ColumnarBatch; + + +/** + * To support vectorization in WholeStageCodeGen, this reader returns ColumnarBatch. + * After creating, `initialize` and `initBatch` should be called sequentially. + */ +public class OrcColumnarBatchReader extends RecordReader { + + /** + * The default size of batch. We use this value for both ORC and Spark consistently + * because they have different default values like the following. + * + * - ORC's VectorizedRowBatch.DEFAULT_SIZE = 1024 + * - Spark's ColumnarBatch.DEFAULT_BATCH_SIZE = 4 * 1024 + */ + public static final int DEFAULT_SIZE = 4 * 1024; + + // ORC File Reader + private Reader reader; + + // Vectorized ORC Row Batch + private VectorizedRowBatch batch; + + /** + * The column IDs of the physical ORC file schema which are required by this reader. + * -1 means this required column doesn't exist in the ORC file. + */ + private int[] requestedColIds; + + // Record reader from ORC row batch. + private org.apache.orc.RecordReader recordReader; + + private StructField[] requiredFields; + + // The result columnar batch for vectorized execution by whole-stage codegen. + private ColumnarBatch columnarBatch; + + // Writable column vectors of the result columnar batch. + private WritableColumnVector[] columnVectors; + + /** + * The memory mode of the columnarBatch + */ + private final MemoryMode MEMORY_MODE; + + public OrcColumnarBatchReader(boolean useOffHeap) { + MEMORY_MODE = useOffHeap ? MemoryMode.OFF_HEAP : MemoryMode.ON_HEAP; + } + + + @Override + public Void getCurrentKey() throws IOException, InterruptedException { + return null; + } + + @Override + public ColumnarBatch getCurrentValue() throws IOException, InterruptedException { + return columnarBatch; + } + + @Override + public float getProgress() throws IOException, InterruptedException { + return recordReader.getProgress(); + } + + @Override + public boolean nextKeyValue() throws IOException, InterruptedException { + return nextBatch(); + } + + @Override + public void close() throws IOException { + if (columnarBatch != null) { + columnarBatch.close(); + columnarBatch = null; + } + if (recordReader != null) { + recordReader.close(); + recordReader = null; + } + } + + /** + * Initialize ORC file reader and batch record reader. + * Please note that `initBatch` is needed to be called after this. 
+ */ + @Override + public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) + throws IOException, InterruptedException { + FileSplit fileSplit = (FileSplit)inputSplit; + Configuration conf = taskAttemptContext.getConfiguration(); + reader = OrcFile.createReader( + fileSplit.getPath(), + OrcFile.readerOptions(conf) + .maxLength(OrcConf.MAX_FILE_LENGTH.getLong(conf)) + .filesystem(fileSplit.getPath().getFileSystem(conf))); + + Reader.Options options = + OrcInputFormat.buildOptions(conf, reader, fileSplit.getStart(), fileSplit.getLength()); + recordReader = reader.rows(options); + } + + /** + * Initialize columnar batch by setting required schema and partition information. + * With this information, this creates ColumnarBatch with the full schema. + */ + public void initBatch( + TypeDescription orcSchema, + int[] requestedColIds, + StructField[] requiredFields, + StructType partitionSchema, + InternalRow partitionValues) { + batch = orcSchema.createRowBatch(DEFAULT_SIZE); + assert(!batch.selectedInUse); // `selectedInUse` should be initialized with `false`. + + this.requiredFields = requiredFields; + this.requestedColIds = requestedColIds; + assert(requiredFields.length == requestedColIds.length); + + StructType resultSchema = new StructType(requiredFields); + for (StructField f : partitionSchema.fields()) { + resultSchema = resultSchema.add(f); + } + + int capacity = DEFAULT_SIZE; + if (MEMORY_MODE == MemoryMode.OFF_HEAP) { + columnVectors = OffHeapColumnVector.allocateColumns(capacity, resultSchema); + } else { + columnVectors = OnHeapColumnVector.allocateColumns(capacity, resultSchema); + } + columnarBatch = new ColumnarBatch(resultSchema, columnVectors, capacity); + + if (partitionValues.numFields() > 0) { + int partitionIdx = requiredFields.length; + for (int i = 0; i < partitionValues.numFields(); i++) { + ColumnVectorUtils.populate(columnVectors[i + partitionIdx], partitionValues, i); + columnVectors[i + partitionIdx].setIsConstant(); + } + } + + // Initialize the missing columns once. + for (int i = 0; i < requiredFields.length; i++) { + if (requestedColIds[i] == -1) { + columnVectors[i].putNulls(0, columnarBatch.capacity()); + columnVectors[i].setIsConstant(); + } + } + } + + /** + * Return true if there exists more data in the next batch. If exists, prepare the next batch + * by copying from ORC VectorizedRowBatch columns to Spark ColumnarBatch columns. 
+ */ + private boolean nextBatch() throws IOException { + for (WritableColumnVector vector : columnVectors) { + vector.reset(); + } + columnarBatch.setNumRows(0); + + recordReader.nextBatch(batch); + int batchSize = batch.size; + if (batchSize == 0) { + return false; + } + columnarBatch.setNumRows(batchSize); + for (int i = 0; i < requiredFields.length; i++) { + StructField field = requiredFields[i]; + WritableColumnVector toColumn = columnVectors[i]; + + if (requestedColIds[i] >= 0) { + ColumnVector fromColumn = batch.cols[requestedColIds[i]]; + + if (fromColumn.isRepeating) { + putRepeatingValues(batchSize, field, fromColumn, toColumn); + } else if (fromColumn.noNulls) { + putNonNullValues(batchSize, field, fromColumn, toColumn); + } else { + putValues(batchSize, field, fromColumn, toColumn); + } + } + } + return true; + } + + private void putRepeatingValues( + int batchSize, + StructField field, + ColumnVector fromColumn, + WritableColumnVector toColumn) { + if (fromColumn.isNull[0]) { + toColumn.putNulls(0, batchSize); + } else { + DataType type = field.dataType(); + if (type instanceof BooleanType) { + toColumn.putBooleans(0, batchSize, ((LongColumnVector)fromColumn).vector[0] == 1); + } else if (type instanceof ByteType) { + toColumn.putBytes(0, batchSize, (byte)((LongColumnVector)fromColumn).vector[0]); + } else if (type instanceof ShortType) { + toColumn.putShorts(0, batchSize, (short)((LongColumnVector)fromColumn).vector[0]); + } else if (type instanceof IntegerType || type instanceof DateType) { + toColumn.putInts(0, batchSize, (int)((LongColumnVector)fromColumn).vector[0]); + } else if (type instanceof LongType) { + toColumn.putLongs(0, batchSize, ((LongColumnVector)fromColumn).vector[0]); + } else if (type instanceof TimestampType) { + toColumn.putLongs(0, batchSize, + fromTimestampColumnVector((TimestampColumnVector)fromColumn, 0)); + } else if (type instanceof FloatType) { + toColumn.putFloats(0, batchSize, (float)((DoubleColumnVector)fromColumn).vector[0]); + } else if (type instanceof DoubleType) { + toColumn.putDoubles(0, batchSize, ((DoubleColumnVector)fromColumn).vector[0]); + } else if (type instanceof StringType || type instanceof BinaryType) { + BytesColumnVector data = (BytesColumnVector)fromColumn; + WritableColumnVector arrayData = toColumn.getChildColumn(0); + int size = data.vector[0].length; + arrayData.reserve(size); + arrayData.putBytes(0, size, data.vector[0], 0); + for (int index = 0; index < batchSize; index++) { + toColumn.putArray(index, 0, size); + } + } else if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType)type; + putDecimalWritables( + toColumn, + batchSize, + decimalType.precision(), + decimalType.scale(), + ((DecimalColumnVector)fromColumn).vector[0]); + } else { + throw new UnsupportedOperationException("Unsupported Data Type: " + type); + } + } + } + + private void putNonNullValues( + int batchSize, + StructField field, + ColumnVector fromColumn, + WritableColumnVector toColumn) { + DataType type = field.dataType(); + if (type instanceof BooleanType) { + long[] data = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + toColumn.putBoolean(index, data[index] == 1); + } + } else if (type instanceof ByteType) { + long[] data = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + toColumn.putByte(index, (byte)data[index]); + } + } else if (type instanceof ShortType) { + long[] data = ((LongColumnVector)fromColumn).vector; + for (int index = 0; 
index < batchSize; index++) { + toColumn.putShort(index, (short)data[index]); + } + } else if (type instanceof IntegerType || type instanceof DateType) { + long[] data = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + toColumn.putInt(index, (int)data[index]); + } + } else if (type instanceof LongType) { + toColumn.putLongs(0, batchSize, ((LongColumnVector)fromColumn).vector, 0); + } else if (type instanceof TimestampType) { + TimestampColumnVector data = ((TimestampColumnVector)fromColumn); + for (int index = 0; index < batchSize; index++) { + toColumn.putLong(index, fromTimestampColumnVector(data, index)); + } + } else if (type instanceof FloatType) { + double[] data = ((DoubleColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + toColumn.putFloat(index, (float)data[index]); + } + } else if (type instanceof DoubleType) { + toColumn.putDoubles(0, batchSize, ((DoubleColumnVector)fromColumn).vector, 0); + } else if (type instanceof StringType || type instanceof BinaryType) { + BytesColumnVector data = ((BytesColumnVector)fromColumn); + WritableColumnVector arrayData = toColumn.getChildColumn(0); + int totalNumBytes = IntStream.of(data.length).sum(); + arrayData.reserve(totalNumBytes); + for (int index = 0, pos = 0; index < batchSize; pos += data.length[index], index++) { + arrayData.putBytes(pos, data.length[index], data.vector[index], data.start[index]); + toColumn.putArray(index, pos, data.length[index]); + } + } else if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType)type; + DecimalColumnVector data = ((DecimalColumnVector)fromColumn); + if (decimalType.precision() > Decimal.MAX_LONG_DIGITS()) { + WritableColumnVector arrayData = toColumn.getChildColumn(0); + arrayData.reserve(batchSize * 16); + } + for (int index = 0; index < batchSize; index++) { + putDecimalWritable( + toColumn, + index, + decimalType.precision(), + decimalType.scale(), + data.vector[index]); + } + } else { + throw new UnsupportedOperationException("Unsupported Data Type: " + type); + } + } + + private void putValues( + int batchSize, + StructField field, + ColumnVector fromColumn, + WritableColumnVector toColumn) { + DataType type = field.dataType(); + if (type instanceof BooleanType) { + long[] vector = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putBoolean(index, vector[index] == 1); + } + } + } else if (type instanceof ByteType) { + long[] vector = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putByte(index, (byte)vector[index]); + } + } + } else if (type instanceof ShortType) { + long[] vector = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putShort(index, (short)vector[index]); + } + } + } else if (type instanceof IntegerType || type instanceof DateType) { + long[] vector = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putInt(index, (int)vector[index]); + } + } + } else if (type instanceof LongType) { + long[] vector = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { 
+ if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putLong(index, vector[index]); + } + } + } else if (type instanceof TimestampType) { + TimestampColumnVector vector = ((TimestampColumnVector)fromColumn); + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putLong(index, fromTimestampColumnVector(vector, index)); + } + } + } else if (type instanceof FloatType) { + double[] vector = ((DoubleColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putFloat(index, (float)vector[index]); + } + } + } else if (type instanceof DoubleType) { + double[] vector = ((DoubleColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putDouble(index, vector[index]); + } + } + } else if (type instanceof StringType || type instanceof BinaryType) { + BytesColumnVector vector = (BytesColumnVector)fromColumn; + WritableColumnVector arrayData = toColumn.getChildColumn(0); + int totalNumBytes = IntStream.of(vector.length).sum(); + arrayData.reserve(totalNumBytes); + for (int index = 0, pos = 0; index < batchSize; pos += vector.length[index], index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + arrayData.putBytes(pos, vector.length[index], vector.vector[index], vector.start[index]); + toColumn.putArray(index, pos, vector.length[index]); + } + } + } else if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType)type; + HiveDecimalWritable[] vector = ((DecimalColumnVector)fromColumn).vector; + if (decimalType.precision() > Decimal.MAX_LONG_DIGITS()) { + WritableColumnVector arrayData = toColumn.getChildColumn(0); + arrayData.reserve(batchSize * 16); + } + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + putDecimalWritable( + toColumn, + index, + decimalType.precision(), + decimalType.scale(), + vector[index]); + } + } + } else { + throw new UnsupportedOperationException("Unsupported Data Type: " + type); + } + } + + /** + * Returns the number of micros since epoch from an element of TimestampColumnVector. + */ + private static long fromTimestampColumnVector(TimestampColumnVector vector, int index) { + return vector.time[index] * 1000L + vector.nanos[index] / 1000L; + } + + /** + * Put a `HiveDecimalWritable` to a `WritableColumnVector`. + */ + private static void putDecimalWritable( + WritableColumnVector toColumn, + int index, + int precision, + int scale, + HiveDecimalWritable decimalWritable) { + HiveDecimal decimal = decimalWritable.getHiveDecimal(); + Decimal value = + Decimal.apply(decimal.bigDecimalValue(), decimal.precision(), decimal.scale()); + value.changePrecision(precision, scale); + + if (precision <= Decimal.MAX_INT_DIGITS()) { + toColumn.putInt(index, (int) value.toUnscaledLong()); + } else if (precision <= Decimal.MAX_LONG_DIGITS()) { + toColumn.putLong(index, value.toUnscaledLong()); + } else { + byte[] bytes = value.toJavaBigDecimal().unscaledValue().toByteArray(); + WritableColumnVector arrayData = toColumn.getChildColumn(0); + arrayData.putBytes(index * 16, bytes.length, bytes, 0); + toColumn.putArray(index, index * 16, bytes.length); + } + } + + /** + * Put `HiveDecimalWritable`s to a `WritableColumnVector`. 
+ */ + private static void putDecimalWritables( + WritableColumnVector toColumn, + int size, + int precision, + int scale, + HiveDecimalWritable decimalWritable) { + HiveDecimal decimal = decimalWritable.getHiveDecimal(); + Decimal value = + Decimal.apply(decimal.bigDecimalValue(), decimal.precision(), decimal.scale()); + value.changePrecision(precision, scale); + + if (precision <= Decimal.MAX_INT_DIGITS()) { + toColumn.putInts(0, size, (int) value.toUnscaledLong()); + } else if (precision <= Decimal.MAX_LONG_DIGITS()) { + toColumn.putLongs(0, size, value.toUnscaledLong()); + } else { + byte[] bytes = value.toJavaBigDecimal().unscaledValue().toByteArray(); + WritableColumnVector arrayData = toColumn.getChildColumn(0); + arrayData.reserve(bytes.length); + arrayData.putBytes(0, bytes.length, bytes, 0); + for (int index = 0; index < size; index++) { + toColumn.putArray(index, 0, bytes.length); + } + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index f7471cd7debce..b8bacfa1838ae 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -118,6 +118,13 @@ class OrcFileFormat } } + override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = { + val conf = sparkSession.sessionState.conf + conf.orcVectorizedReaderEnabled && conf.wholeStageEnabled && + schema.length <= conf.wholeStageMaxNumFields && + schema.forall(_.dataType.isInstanceOf[AtomicType]) + } + override def isSplitable( sparkSession: SparkSession, options: Map[String, String], @@ -139,6 +146,11 @@ class OrcFileFormat } } + val resultSchema = StructType(requiredSchema.fields ++ partitionSchema.fields) + val sqlConf = sparkSession.sessionState.conf + val enableOffHeapColumnVector = sqlConf.offHeapColumnVectorEnabled + val enableVectorizedReader = supportBatch(sparkSession, resultSchema) + val broadcastedConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) val isCaseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis @@ -146,8 +158,14 @@ class OrcFileFormat (file: PartitionedFile) => { val conf = broadcastedConf.value.value + val filePath = new Path(new URI(file.filePath)) + + val fs = filePath.getFileSystem(conf) + val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) + val reader = OrcFile.createReader(filePath, readerOptions) + val requestedColIdsOrEmptyFile = OrcUtils.requestedColumnIds( - isCaseSensitive, dataSchema, requiredSchema, new Path(new URI(file.filePath)), conf) + isCaseSensitive, dataSchema, requiredSchema, reader, conf) if (requestedColIdsOrEmptyFile.isEmpty) { Iterator.empty @@ -155,29 +173,46 @@ class OrcFileFormat val requestedColIds = requestedColIdsOrEmptyFile.get assert(requestedColIds.length == requiredSchema.length, "[BUG] requested column IDs do not match required schema") - conf.set(OrcConf.INCLUDE_COLUMNS.getAttribute, + val taskConf = new Configuration(conf) + taskConf.set(OrcConf.INCLUDE_COLUMNS.getAttribute, requestedColIds.filter(_ != -1).sorted.mkString(",")) - val fileSplit = - new FileSplit(new Path(new URI(file.filePath)), file.start, file.length, Array.empty) + val fileSplit = new FileSplit(filePath, file.start, file.length, Array.empty) val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) - val 
taskAttemptContext = new TaskAttemptContextImpl(conf, attemptId) - - val orcRecordReader = new OrcInputFormat[OrcStruct] - .createRecordReader(fileSplit, taskAttemptContext) - val iter = new RecordReaderIterator[OrcStruct](orcRecordReader) - Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => iter.close())) - - val fullSchema = requiredSchema.toAttributes ++ partitionSchema.toAttributes - val unsafeProjection = GenerateUnsafeProjection.generate(fullSchema, fullSchema) - val deserializer = new OrcDeserializer(dataSchema, requiredSchema, requestedColIds) - - if (partitionSchema.length == 0) { - iter.map(value => unsafeProjection(deserializer.deserialize(value))) + val taskAttemptContext = new TaskAttemptContextImpl(taskConf, attemptId) + + val taskContext = Option(TaskContext.get()) + if (enableVectorizedReader) { + val batchReader = + new OrcColumnarBatchReader(enableOffHeapColumnVector && taskContext.isDefined) + batchReader.initialize(fileSplit, taskAttemptContext) + batchReader.initBatch( + reader.getSchema, + requestedColIds, + requiredSchema.fields, + partitionSchema, + file.partitionValues) + + val iter = new RecordReaderIterator(batchReader) + Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => iter.close())) + iter.asInstanceOf[Iterator[InternalRow]] } else { - val joinedRow = new JoinedRow() - iter.map(value => - unsafeProjection(joinedRow(deserializer.deserialize(value), file.partitionValues))) + val orcRecordReader = new OrcInputFormat[OrcStruct] + .createRecordReader(fileSplit, taskAttemptContext) + val iter = new RecordReaderIterator[OrcStruct](orcRecordReader) + Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => iter.close())) + + val fullSchema = requiredSchema.toAttributes ++ partitionSchema.toAttributes + val unsafeProjection = GenerateUnsafeProjection.generate(fullSchema, fullSchema) + val deserializer = new OrcDeserializer(dataSchema, requiredSchema, requestedColIds) + + if (partitionSchema.length == 0) { + iter.map(value => unsafeProjection(deserializer.deserialize(value))) + } else { + val joinedRow = new JoinedRow() + iter.map(value => + unsafeProjection(joinedRow(deserializer.deserialize(value), file.partitionValues))) + } } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala index b03ee06d04a16..13a23996f4ade 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala @@ -21,7 +21,7 @@ import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} -import org.apache.orc.{OrcFile, TypeDescription} +import org.apache.orc.{OrcFile, Reader, TypeDescription} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging @@ -80,11 +80,8 @@ object OrcUtils extends Logging { isCaseSensitive: Boolean, dataSchema: StructType, requiredSchema: StructType, - file: Path, + reader: Reader, conf: Configuration): Option[Array[Int]] = { - val fs = file.getFileSystem(conf) - val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) - val reader = OrcFile.createReader(file, readerOptions) val orcFieldNames = reader.getSchema.getFieldNames.asScala if (orcFieldNames.isEmpty) { // SPARK-8501: Some old empty ORC files always have an empty schema stored in their footer. 
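Note: the new vectorized path above is only taken when `supportBatch` returns true (ORC vectorized reader and whole-stage codegen enabled, few enough columns, all atomic types). As an illustrative sketch of exercising it — the input path is a placeholder; the format class and conf key mirror the `OrcReadBenchmark` added in the next file:

```scala
// Sketch only: read ORC through the native data source with the vectorized
// reader enabled. Mirrors the usage in OrcReadBenchmark further down.
import org.apache.spark.sql.internal.SQLConf

spark.conf.set(SQLConf.ORC_VECTORIZED_READER_ENABLED.key, "true")

val nativeOrcFormat =
  classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName
spark.read.format(nativeOrcFormat).load("/tmp/orc-data").show()
```
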
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala new file mode 100644 index 0000000000000..37ed846acd1eb --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala @@ -0,0 +1,435 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.orc + +import java.io.File + +import scala.util.{Random, Try} + +import org.apache.spark.SparkConf +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ +import org.apache.spark.util.{Benchmark, Utils} + + +/** + * Benchmark to measure ORC read performance. + * + * This is in `sql/hive` module in order to compare `sql/core` and `sql/hive` ORC data sources. + */ +// scalastyle:off line.size.limit +object OrcReadBenchmark { + val conf = new SparkConf() + conf.set("orc.compression", "snappy") + + private val spark = SparkSession.builder() + .master("local[1]") + .appName("OrcReadBenchmark") + .config(conf) + .getOrCreate() + + // Set default configs. Individual cases will change them if necessary. 
+ spark.conf.set(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key, "true") + + def withTempPath(f: File => Unit): Unit = { + val path = Utils.createTempDir() + path.delete() + try f(path) finally Utils.deleteRecursively(path) + } + + def withTempTable(tableNames: String*)(f: => Unit): Unit = { + try f finally tableNames.foreach(spark.catalog.dropTempView) + } + + def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { + val (keys, values) = pairs.unzip + val currentValues = keys.map(key => Try(spark.conf.get(key)).toOption) + (keys, values).zipped.foreach(spark.conf.set) + try f finally { + keys.zip(currentValues).foreach { + case (key, Some(value)) => spark.conf.set(key, value) + case (key, None) => spark.conf.unset(key) + } + } + } + + private val NATIVE_ORC_FORMAT = classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName + private val HIVE_ORC_FORMAT = classOf[org.apache.spark.sql.hive.orc.OrcFileFormat].getCanonicalName + + private def prepareTable(dir: File, df: DataFrame, partition: Option[String] = None): Unit = { + val dirORC = dir.getCanonicalPath + + if (partition.isDefined) { + df.write.partitionBy(partition.get).orc(dirORC) + } else { + df.write.orc(dirORC) + } + + spark.read.format(NATIVE_ORC_FORMAT).load(dirORC).createOrReplaceTempView("nativeOrcTable") + spark.read.format(HIVE_ORC_FORMAT).load(dirORC).createOrReplaceTempView("hiveOrcTable") + } + + def numericScanBenchmark(values: Int, dataType: DataType): Unit = { + val sqlBenchmark = new Benchmark(s"SQL Single ${dataType.sql} Column Scan", values) + + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + import spark.implicits._ + spark.range(values).map(_ => Random.nextLong).createOrReplaceTempView("t1") + + prepareTable(dir, spark.sql(s"SELECT CAST(value as ${dataType.sql}) id FROM t1")) + + sqlBenchmark.addCase("Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + } + + sqlBenchmark.addCase("Native ORC Vectorized") { _ => + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + + sqlBenchmark.addCase("Hive built-in ORC") { _ => + spark.sql("SELECT sum(id) FROM hiveOrcTable").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + SQL Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1192 / 1221 13.2 75.8 1.0X + Native ORC Vectorized 161 / 170 97.5 10.3 7.4X + Hive built-in ORC 1399 / 1413 11.2 89.0 0.9X + + SQL Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1287 / 1333 12.2 81.8 1.0X + Native ORC Vectorized 164 / 172 95.6 10.5 7.8X + Hive built-in ORC 1629 / 1650 9.7 103.6 0.8X + + SQL Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1304 / 1388 12.1 82.9 1.0X + Native ORC Vectorized 227 / 240 69.3 14.4 5.7X + Hive built-in ORC 1866 / 1867 8.4 118.6 0.7X + + SQL Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 
1331 / 1357 11.8 84.6 1.0X + Native ORC Vectorized 289 / 297 54.4 18.4 4.6X + Hive built-in ORC 1922 / 1929 8.2 122.2 0.7X + + SQL Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1410 / 1428 11.2 89.7 1.0X + Native ORC Vectorized 328 / 335 48.0 20.8 4.3X + Hive built-in ORC 1929 / 2012 8.2 122.6 0.7X + + SQL Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1467 / 1485 10.7 93.3 1.0X + Native ORC Vectorized 402 / 411 39.1 25.6 3.6X + Hive built-in ORC 2023 / 2042 7.8 128.6 0.7X + */ + sqlBenchmark.run() + } + } + } + + def intStringScanBenchmark(values: Int): Unit = { + val benchmark = new Benchmark("Int and String Scan", values) + + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + import spark.implicits._ + spark.range(values).map(_ => Random.nextLong).createOrReplaceTempView("t1") + + prepareTable( + dir, + spark.sql("SELECT CAST(value AS INT) AS c1, CAST(value as STRING) AS c2 FROM t1")) + + benchmark.addCase("Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(c1), sum(length(c2)) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Native ORC Vectorized") { _ => + spark.sql("SELECT sum(c1), sum(length(c2)) FROM nativeOrcTable").collect() + } + + benchmark.addCase("Hive built-in ORC") { _ => + spark.sql("SELECT sum(c1), sum(length(c2)) FROM hiveOrcTable").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + Int and String Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 2729 / 2744 3.8 260.2 1.0X + Native ORC Vectorized 1318 / 1344 8.0 125.7 2.1X + Hive built-in ORC 3731 / 3782 2.8 355.8 0.7X + */ + benchmark.run() + } + } + } + + def partitionTableScanBenchmark(values: Int): Unit = { + val benchmark = new Benchmark("Partitioned Table", values) + + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + import spark.implicits._ + spark.range(values).map(_ => Random.nextLong).createOrReplaceTempView("t1") + + prepareTable(dir, spark.sql("SELECT value % 2 AS p, value AS id FROM t1"), Some("p")) + + benchmark.addCase("Read data column - Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Read data column - Native ORC Vectorized") { _ => + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + + benchmark.addCase("Read data column - Hive built-in ORC") { _ => + spark.sql("SELECT sum(id) FROM hiveOrcTable").collect() + } + + benchmark.addCase("Read partition column - Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(p) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Read partition column - Native ORC Vectorized") { _ => + spark.sql("SELECT sum(p) FROM nativeOrcTable").collect() + } + + benchmark.addCase("Read partition column - Hive built-in ORC") { _ => + spark.sql("SELECT sum(p) FROM hiveOrcTable").collect() + } + + benchmark.addCase("Read both columns - 
Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(p), sum(id) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Read both columns - Native ORC Vectorized") { _ => + spark.sql("SELECT sum(p), sum(id) FROM nativeOrcTable").collect() + } + + benchmark.addCase("Read both columns - Hive built-in ORC") { _ => + spark.sql("SELECT sum(p), sum(id) FROM hiveOrcTable").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + Partitioned Table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Read data column - Native ORC MR 1531 / 1536 10.3 97.4 1.0X + Read data column - Native ORC Vectorized 295 / 298 53.3 18.8 5.2X + Read data column - Hive built-in ORC 2125 / 2126 7.4 135.1 0.7X + Read partition column - Native ORC MR 1049 / 1062 15.0 66.7 1.5X + Read partition column - Native ORC Vectorized 54 / 57 290.1 3.4 28.2X + Read partition column - Hive built-in ORC 1282 / 1291 12.3 81.5 1.2X + Read both columns - Native ORC MR 1594 / 1598 9.9 101.3 1.0X + Read both columns - Native ORC Vectorized 332 / 336 47.4 21.1 4.6X + Read both columns - Hive built-in ORC 2145 / 2187 7.3 136.4 0.7X + */ + benchmark.run() + } + } + } + + def repeatedStringScanBenchmark(values: Int): Unit = { + val benchmark = new Benchmark("Repeated String", values) + + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + spark.range(values).createOrReplaceTempView("t1") + + prepareTable(dir, spark.sql("SELECT CAST((id % 200) + 10000 as STRING) AS c1 FROM t1")) + + benchmark.addCase("Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(length(c1)) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Native ORC Vectorized") { _ => + spark.sql("SELECT sum(length(c1)) FROM nativeOrcTable").collect() + } + + benchmark.addCase("Hive built-in ORC") { _ => + spark.sql("SELECT sum(length(c1)) FROM hiveOrcTable").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + Repeated String: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1325 / 1328 7.9 126.4 1.0X + Native ORC Vectorized 320 / 330 32.8 30.5 4.1X + Hive built-in ORC 1971 / 1972 5.3 188.0 0.7X + */ + benchmark.run() + } + } + } + + def stringWithNullsScanBenchmark(values: Int, fractionOfNulls: Double): Unit = { + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + spark.range(values).createOrReplaceTempView("t1") + + prepareTable( + dir, + spark.sql( + s"SELECT IF(RAND(1) < $fractionOfNulls, NULL, CAST(id as STRING)) AS c1, " + + s"IF(RAND(2) < $fractionOfNulls, NULL, CAST(id as STRING)) AS c2 FROM t1")) + + val benchmark = new Benchmark(s"String with Nulls Scan ($fractionOfNulls%)", values) + + benchmark.addCase("Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT SUM(LENGTH(c2)) FROM nativeOrcTable " + + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() + } + } + + benchmark.addCase("Native ORC Vectorized") { _ => + spark.sql("SELECT SUM(LENGTH(c2)) FROM nativeOrcTable " + + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() + } + + 
benchmark.addCase("Hive built-in ORC") { _ => + spark.sql("SELECT SUM(LENGTH(c2)) FROM hiveOrcTable " + + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + String with Nulls Scan (0.0%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 2553 / 2554 4.1 243.4 1.0X + Native ORC Vectorized 953 / 954 11.0 90.9 2.7X + Hive built-in ORC 3875 / 3898 2.7 369.6 0.7X + + String with Nulls Scan (0.5%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 2389 / 2408 4.4 227.8 1.0X + Native ORC Vectorized 1208 / 1209 8.7 115.2 2.0X + Hive built-in ORC 2940 / 2952 3.6 280.4 0.8X + + String with Nulls Scan (0.95%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1295 / 1311 8.1 123.5 1.0X + Native ORC Vectorized 449 / 457 23.4 42.8 2.9X + Hive built-in ORC 1649 / 1660 6.4 157.3 0.8X + */ + benchmark.run() + } + } + } + + def columnsBenchmark(values: Int, width: Int): Unit = { + val sqlBenchmark = new Benchmark(s"SQL Single Column Scan from $width columns", values) + + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + import spark.implicits._ + val middle = width / 2 + val selectExpr = (1 to width).map(i => s"value as c$i") + spark.range(values).map(_ => Random.nextLong).toDF() + .selectExpr(selectExpr: _*).createOrReplaceTempView("t1") + + prepareTable(dir, spark.sql("SELECT * FROM t1")) + + sqlBenchmark.addCase("Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql(s"SELECT sum(c$middle) FROM nativeOrcTable").collect() + } + } + + sqlBenchmark.addCase("Native ORC Vectorized") { _ => + spark.sql(s"SELECT sum(c$middle) FROM nativeOrcTable").collect() + } + + sqlBenchmark.addCase("Hive built-in ORC") { _ => + spark.sql(s"SELECT sum(c$middle) FROM hiveOrcTable").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + SQL Single Column Scan from 100 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1103 / 1124 1.0 1052.0 1.0X + Native ORC Vectorized 92 / 100 11.4 87.9 12.0X + Hive built-in ORC 383 / 390 2.7 365.4 2.9X + + SQL Single Column Scan from 200 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 2245 / 2250 0.5 2141.0 1.0X + Native ORC Vectorized 157 / 165 6.7 150.2 14.3X + Hive built-in ORC 587 / 593 1.8 559.4 3.8X + + SQL Single Column Scan from 300 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 3343 / 3350 0.3 3188.3 1.0X + Native ORC Vectorized 265 / 280 3.9 253.2 12.6X + Hive built-in ORC 828 / 842 1.3 789.8 4.0X + */ + sqlBenchmark.run() + } + } + } + + def main(args: Array[String]): Unit = { + Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType).foreach { dataType => + 
numericScanBenchmark(1024 * 1024 * 15, dataType) + } + intStringScanBenchmark(1024 * 1024 * 10) + partitionTableScanBenchmark(1024 * 1024 * 15) + repeatedStringScanBenchmark(1024 * 1024 * 10) + for (fractionOfNulls <- List(0.0, 0.50, 0.95)) { + stringWithNullsScanBenchmark(1024 * 1024 * 10, fractionOfNulls) + } + columnsBenchmark(1024 * 1024 * 1, 100) + columnsBenchmark(1024 * 1024 * 1, 200) + columnsBenchmark(1024 * 1024 * 1, 300) + } +} +// scalastyle:on line.size.limit From be5991902e7bdfab181036603e42ae15036f3506 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Tue, 9 Jan 2018 21:58:55 +0800 Subject: [PATCH 050/566] [SPARK-22981][SQL] Fix incorrect results of Casting Struct to String ## What changes were proposed in this pull request? This pr fixed the issue when casting structs into strings; ``` scala> val df = Seq(((1, "a"), 0), ((2, "b"), 0)).toDF("a", "b") scala> df.write.saveAsTable("t") scala> sql("SELECT CAST(a AS STRING) FROM t").show +-------------------+ | a| +-------------------+ |[0,1,1800000001,61]| |[0,2,1800000001,62]| +-------------------+ ``` This pr modified the result into; ``` +------+ | a| +------+ |[1, a]| |[2, b]| +------+ ``` ## How was this patch tested? Added tests in `CastSuite`. Author: Takeshi Yamamuro Closes #20176 from maropu/SPARK-22981. (cherry picked from commit 2250cb75b99d257e698fe5418a51d8cddb4d5104) Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/expressions/Cast.scala | 71 +++++++++++++++++++ .../sql/catalyst/expressions/CastSuite.scala | 16 +++++ 2 files changed, 87 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index f2de4c8e30bec..f21aa1e9e3135 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -259,6 +259,29 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String builder.append("]") builder.build() }) + case StructType(fields) => + buildCast[InternalRow](_, row => { + val builder = new UTF8StringBuilder + builder.append("[") + if (row.numFields > 0) { + val st = fields.map(_.dataType) + val toUTF8StringFuncs = st.map(castToString) + if (!row.isNullAt(0)) { + builder.append(toUTF8StringFuncs(0)(row.get(0, st(0))).asInstanceOf[UTF8String]) + } + var i = 1 + while (i < row.numFields) { + builder.append(",") + if (!row.isNullAt(i)) { + builder.append(" ") + builder.append(toUTF8StringFuncs(i)(row.get(i, st(i))).asInstanceOf[UTF8String]) + } + i += 1 + } + } + builder.append("]") + builder.build() + }) case _ => buildCast[Any](_, o => UTF8String.fromString(o.toString)) } @@ -732,6 +755,41 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String """.stripMargin } + private def writeStructToStringBuilder( + st: Seq[DataType], + row: String, + buffer: String, + ctx: CodegenContext): String = { + val structToStringCode = st.zipWithIndex.map { case (ft, i) => + val fieldToStringCode = castToStringCode(ft, ctx) + val field = ctx.freshName("field") + val fieldStr = ctx.freshName("fieldStr") + s""" + |${if (i != 0) s"""$buffer.append(",");""" else ""} + |if (!$row.isNullAt($i)) { + | ${if (i != 0) s"""$buffer.append(" ");""" else ""} + | + | // Append $i field into the string buffer + | ${ctx.javaType(ft)} $field = ${ctx.getValue(row, ft, s"$i")}; + | UTF8String $fieldStr = null; + | ${fieldToStringCode(field, fieldStr, 
null /* resultIsNull won't be used */)} + | $buffer.append($fieldStr); + |} + """.stripMargin + } + + val writeStructCode = ctx.splitExpressions( + expressions = structToStringCode, + funcName = "fieldToString", + arguments = ("InternalRow", row) :: (classOf[UTF8StringBuilder].getName, buffer) :: Nil) + + s""" + |$buffer.append("["); + |$writeStructCode + |$buffer.append("]"); + """.stripMargin + } + private[this] def castToStringCode(from: DataType, ctx: CodegenContext): CastFunction = { from match { case BinaryType => @@ -765,6 +823,19 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String |$evPrim = $buffer.build(); """.stripMargin } + case StructType(fields) => + (c, evPrim, evNull) => { + val row = ctx.freshName("row") + val buffer = ctx.freshName("buffer") + val bufferClass = classOf[UTF8StringBuilder].getName + val writeStructCode = writeStructToStringBuilder(fields.map(_.dataType), row, buffer, ctx) + s""" + |InternalRow $row = $c; + |$bufferClass $buffer = new $bufferClass(); + |$writeStructCode + |$evPrim = $buffer.build(); + """.stripMargin + } case _ => (c, evPrim, evNull) => s"$evPrim = UTF8String.fromString(String.valueOf($c));" } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index 1445bb8a97d40..5b25bdf907c3a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -906,4 +906,20 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { StringType) checkEvaluation(ret5, "[1 -> [1, 2, 3], 2 -> [4, 5, 6]]") } + + test("SPARK-22981 Cast struct to string") { + val ret1 = cast(Literal.create((1, "a", 0.1)), StringType) + checkEvaluation(ret1, "[1, a, 0.1]") + val ret2 = cast(Literal.create(Tuple3[Int, String, String](1, null, "a")), StringType) + checkEvaluation(ret2, "[1,, a]") + val ret3 = cast(Literal.create( + (Date.valueOf("2014-12-03"), Timestamp.valueOf("2014-12-03 15:05:00"))), StringType) + checkEvaluation(ret3, "[2014-12-03, 2014-12-03 15:05:00]") + val ret4 = cast(Literal.create(((1, "a"), 5, 0.1)), StringType) + checkEvaluation(ret4, "[[1, a], 5, 0.1]") + val ret5 = cast(Literal.create((Seq(1, 2, 3), "a", 0.1)), StringType) + checkEvaluation(ret5, "[[1, 2, 3], a, 0.1]") + val ret6 = cast(Literal.create((1, Map(1 -> "a", 2 -> "b", 3 -> "c"))), StringType) + checkEvaluation(ret6, "[1, [1 -> a, 2 -> b, 3 -> c]]") + } } From 44763d93c0d923977c114d63586abfc1b68ad7fc Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Mon, 8 Jan 2018 13:24:08 -0800 Subject: [PATCH 051/566] [SPARK-22912] v2 data source support in MicroBatchExecution ## What changes were proposed in this pull request? Support for v2 data sources in microbatch streaming. ## How was this patch tested? A very basic new unit test on the toy v2 implementation of rate source. Once we have a v1 source fully migrated to v2, we'll need to do more detailed compatibility testing. Author: Jose Torres Closes #20097 from jose-torres/v2-impl. 
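As an illustrative sketch (assuming the `ratev2` short name and option names registered in this patch, with the console sink still going through the v1 `Sink` path in `MicroBatchExecution`), a micro-batch query against the toy v2 rate source reads the same as a v1 stream:

```scala
// Minimal sketch: micro-batch query against the toy v2 rate source.
// "ratev2", "numPartitions" and "rowsPerSecond" are the names used in this patch.
val stream = spark.readStream
  .format("ratev2")
  .option("numPartitions", "2")
  .option("rowsPerSecond", "10")
  .load()

val query = stream.writeStream
  .format("console")
  .outputMode("append")
  .start()

query.awaitTermination()
```
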
--- ...pache.spark.sql.sources.DataSourceRegister | 1 + .../datasources/v2/DataSourceV2Relation.scala | 10 ++ .../streaming/MicroBatchExecution.scala | 112 ++++++++++++++---- .../streaming/ProgressReporter.scala | 6 +- .../streaming/RateSourceProvider.scala | 10 +- .../execution/streaming/StreamExecution.scala | 4 +- .../streaming/StreamingRelation.scala | 4 +- .../continuous/ContinuousExecution.scala | 4 +- .../ContinuousRateStreamSource.scala | 17 +-- .../sources/RateStreamSourceV2.scala | 31 ++++- .../sql/streaming/DataStreamReader.scala | 25 +++- .../sql/streaming/StreamingQueryManager.scala | 24 ++-- .../streaming/RateSourceV2Suite.scala | 68 +++++++++-- .../spark/sql/streaming/StreamTest.scala | 2 +- .../continuous/ContinuousSuite.scala | 2 +- 15 files changed, 241 insertions(+), 79 deletions(-) diff --git a/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister index 6cdfe2fae5642..0259c774bbf4a 100644 --- a/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister +++ b/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ -7,3 +7,4 @@ org.apache.spark.sql.execution.datasources.text.TextFileFormat org.apache.spark.sql.execution.streaming.ConsoleSinkProvider org.apache.spark.sql.execution.streaming.TextSocketSourceProvider org.apache.spark.sql.execution.streaming.RateSourceProvider +org.apache.spark.sql.execution.streaming.sources.RateSourceProviderV2 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala index 7eb99a645001a..cba20dd902007 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala @@ -35,6 +35,16 @@ case class DataSourceV2Relation( } } +/** + * A specialization of DataSourceV2Relation with the streaming bit set to true. Otherwise identical + * to the non-streaming relation. 
+ */ +class StreamingDataSourceV2Relation( + fullOutput: Seq[AttributeReference], + reader: DataSourceV2Reader) extends DataSourceV2Relation(fullOutput, reader) { + override def isStreaming: Boolean = true +} + object DataSourceV2Relation { def apply(reader: DataSourceV2Reader): DataSourceV2Relation = { new DataSourceV2Relation(reader.readSchema().toAttributes, reader) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index 9a7a13fcc5806..42240eeb58d4b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -17,6 +17,9 @@ package org.apache.spark.sql.execution.streaming +import java.util.Optional + +import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, Map => MutableMap} import org.apache.spark.sql.{Dataset, SparkSession} @@ -24,7 +27,10 @@ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp} import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.execution.SQLExecution -import org.apache.spark.sql.sources.v2.streaming.MicroBatchReadSupport +import org.apache.spark.sql.execution.datasources.v2.{StreamingDataSourceV2Relation, WriteToDataSourceV2} +import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.streaming.{MicroBatchReadSupport, MicroBatchWriteSupport} +import org.apache.spark.sql.sources.v2.streaming.reader.{MicroBatchReader, Offset => OffsetV2} import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime, Trigger} import org.apache.spark.util.{Clock, Utils} @@ -33,10 +39,11 @@ class MicroBatchExecution( name: String, checkpointRoot: String, analyzedPlan: LogicalPlan, - sink: Sink, + sink: BaseStreamingSink, trigger: Trigger, triggerClock: Clock, outputMode: OutputMode, + extraOptions: Map[String, String], deleteCheckpointOnStop: Boolean) extends StreamExecution( sparkSession, name, checkpointRoot, analyzedPlan, sink, @@ -57,6 +64,13 @@ class MicroBatchExecution( var nextSourceId = 0L val toExecutionRelationMap = MutableMap[StreamingRelation, StreamingExecutionRelation]() val v2ToExecutionRelationMap = MutableMap[StreamingRelationV2, StreamingExecutionRelation]() + // We transform each distinct streaming relation into a StreamingExecutionRelation, keeping a + // map as we go to ensure each identical relation gets the same StreamingExecutionRelation + // object. For each microbatch, the StreamingExecutionRelation will be replaced with a logical + // plan for the data within that batch. + // Note that we have to use the previous `output` as attributes in StreamingExecutionRelation, + // since the existing logical plan has already used those attributes. The per-microbatch + // transformation is responsible for replacing attributes with their final values. 
val _logicalPlan = analyzedPlan.transform { case streamingRelation@StreamingRelation(dataSource, _, output) => toExecutionRelationMap.getOrElseUpdate(streamingRelation, { @@ -64,19 +78,26 @@ class MicroBatchExecution( val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" val source = dataSource.createSource(metadataPath) nextSourceId += 1 - // We still need to use the previous `output` instead of `source.schema` as attributes in - // "df.logicalPlan" has already used attributes of the previous `output`. StreamingExecutionRelation(source, output)(sparkSession) }) - case s @ StreamingRelationV2(v2DataSource, _, _, output, v1DataSource) - if !v2DataSource.isInstanceOf[MicroBatchReadSupport] => + case s @ StreamingRelationV2(source: MicroBatchReadSupport, _, options, output, _) => + v2ToExecutionRelationMap.getOrElseUpdate(s, { + // Materialize source to avoid creating it in every batch + val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" + val reader = source.createMicroBatchReader( + Optional.empty(), // user specified schema + metadataPath, + new DataSourceV2Options(options.asJava)) + nextSourceId += 1 + StreamingExecutionRelation(reader, output)(sparkSession) + }) + case s @ StreamingRelationV2(_, _, _, output, v1Relation) => v2ToExecutionRelationMap.getOrElseUpdate(s, { // Materialize source to avoid creating it in every batch val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" - val source = v1DataSource.createSource(metadataPath) + assert(v1Relation.isDefined, "v2 execution didn't match but v1 was unavailable") + val source = v1Relation.get.dataSource.createSource(metadataPath) nextSourceId += 1 - // We still need to use the previous `output` instead of `source.schema` as attributes in - // "df.logicalPlan" has already used attributes of the previous `output`. StreamingExecutionRelation(source, output)(sparkSession) }) } @@ -192,7 +213,8 @@ class MicroBatchExecution( source.getBatch(start, end) } case nonV1Tuple => - throw new IllegalStateException(s"Unexpected V2 source in $nonV1Tuple") + // The V2 API does not have the same edge case requiring getBatch to be called + // here, so we do nothing here. } currentBatchId = latestCommittedBatchId + 1 committedOffsets ++= availableOffsets @@ -236,14 +258,27 @@ class MicroBatchExecution( val hasNewData = { awaitProgressLock.lock() try { - val latestOffsets: Map[Source, Option[Offset]] = uniqueSources.map { + // Generate a map from each unique source to the next available offset. + val latestOffsets: Map[BaseStreamingSource, Option[Offset]] = uniqueSources.map { case s: Source => updateStatusMessage(s"Getting offsets from $s") reportTimeTaken("getOffset") { (s, s.getOffset) } + case s: MicroBatchReader => + updateStatusMessage(s"Getting offsets from $s") + reportTimeTaken("getOffset") { + // Once v1 streaming source execution is gone, we can refactor this away. + // For now, we set the range here to get the source to infer the available end offset, + // get that offset, and then set the range again when we later execute. 
+ s.setOffsetRange( + toJava(availableOffsets.get(s).map(off => s.deserializeOffset(off.json))), + Optional.empty()) + + (s, Some(s.getEndOffset)) + } }.toMap - availableOffsets ++= latestOffsets.filter { case (s, o) => o.nonEmpty }.mapValues(_.get) + availableOffsets ++= latestOffsets.filter { case (_, o) => o.nonEmpty }.mapValues(_.get) if (dataAvailable) { true @@ -317,6 +352,8 @@ class MicroBatchExecution( if (prevBatchOff.isDefined) { prevBatchOff.get.toStreamProgress(sources).foreach { case (src: Source, off) => src.commit(off) + case (reader: MicroBatchReader, off) => + reader.commit(reader.deserializeOffset(off.json)) } } else { throw new IllegalStateException(s"batch $currentBatchId doesn't exist") @@ -357,7 +394,16 @@ class MicroBatchExecution( s"DataFrame returned by getBatch from $source did not have isStreaming=true\n" + s"${batch.queryExecution.logical}") logDebug(s"Retrieving data from $source: $current -> $available") - Some(source -> batch) + Some(source -> batch.logicalPlan) + case (reader: MicroBatchReader, available) + if committedOffsets.get(reader).map(_ != available).getOrElse(true) => + val current = committedOffsets.get(reader).map(off => reader.deserializeOffset(off.json)) + reader.setOffsetRange( + toJava(current), + Optional.of(available.asInstanceOf[OffsetV2])) + logDebug(s"Retrieving data from $reader: $current -> $available") + Some(reader -> + new StreamingDataSourceV2Relation(reader.readSchema().toAttributes, reader)) case _ => None } } @@ -365,15 +411,14 @@ class MicroBatchExecution( // A list of attributes that will need to be updated. val replacements = new ArrayBuffer[(Attribute, Attribute)] // Replace sources in the logical plan with data that has arrived since the last batch. - val withNewSources = logicalPlan transform { + val newBatchesPlan = logicalPlan transform { case StreamingExecutionRelation(source, output) => - newData.get(source).map { data => - val newPlan = data.logicalPlan - assert(output.size == newPlan.output.size, + newData.get(source).map { dataPlan => + assert(output.size == dataPlan.output.size, s"Invalid batch: ${Utils.truncatedString(output, ",")} != " + - s"${Utils.truncatedString(newPlan.output, ",")}") - replacements ++= output.zip(newPlan.output) - newPlan + s"${Utils.truncatedString(dataPlan.output, ",")}") + replacements ++= output.zip(dataPlan.output) + dataPlan }.getOrElse { LocalRelation(output, isStreaming = true) } @@ -381,7 +426,7 @@ class MicroBatchExecution( // Rewire the plan to use the new attributes that were returned by the source. 
val replacementMap = AttributeMap(replacements) - val triggerLogicalPlan = withNewSources transformAllExpressions { + val newAttributePlan = newBatchesPlan transformAllExpressions { case a: Attribute if replacementMap.contains(a) => replacementMap(a).withMetadata(a.metadata) case ct: CurrentTimestamp => @@ -392,6 +437,20 @@ class MicroBatchExecution( cd.dataType, cd.timeZoneId) } + val triggerLogicalPlan = sink match { + case _: Sink => newAttributePlan + case s: MicroBatchWriteSupport => + val writer = s.createMicroBatchWriter( + s"$runId", + currentBatchId, + newAttributePlan.schema, + outputMode, + new DataSourceV2Options(extraOptions.asJava)) + assert(writer.isPresent, "microbatch writer must always be present") + WriteToDataSourceV2(writer.get, newAttributePlan) + case _ => throw new IllegalArgumentException(s"unknown sink type for $sink") + } + reportTimeTaken("queryPlanning") { lastExecution = new IncrementalExecution( sparkSessionToRunBatch, @@ -409,7 +468,12 @@ class MicroBatchExecution( reportTimeTaken("addBatch") { SQLExecution.withNewExecutionId(sparkSessionToRunBatch, lastExecution) { - sink.addBatch(currentBatchId, nextBatch) + sink match { + case s: Sink => s.addBatch(currentBatchId, nextBatch) + case s: MicroBatchWriteSupport => + // This doesn't accumulate any data - it just forces execution of the microbatch writer. + nextBatch.collect() + } } } @@ -421,4 +485,8 @@ class MicroBatchExecution( awaitProgressLock.unlock() } } + + private def toJava(scalaOption: Option[OffsetV2]): Optional[OffsetV2] = { + Optional.ofNullable(scalaOption.orNull) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala index 1c9043613cb69..d1e5be9c12762 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala @@ -53,7 +53,7 @@ trait ProgressReporter extends Logging { protected def triggerClock: Clock protected def logicalPlan: LogicalPlan protected def lastExecution: QueryExecution - protected def newData: Map[BaseStreamingSource, DataFrame] + protected def newData: Map[BaseStreamingSource, LogicalPlan] protected def availableOffsets: StreamProgress protected def committedOffsets: StreamProgress protected def sources: Seq[BaseStreamingSource] @@ -225,8 +225,8 @@ trait ProgressReporter extends Logging { // // 3. For each source, we sum the metrics of the associated execution plan leaves. 
// - val logicalPlanLeafToSource = newData.flatMap { case (source, df) => - df.logicalPlan.collectLeaves().map { leaf => leaf -> source } + val logicalPlanLeafToSource = newData.flatMap { case (source, logicalPlan) => + logicalPlan.collectLeaves().map { leaf => leaf -> source } } val allLogicalPlanLeaves = lastExecution.logical.collectLeaves() // includes non-streaming val allExecPlanLeaves = lastExecution.executedPlan.collectLeaves() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala index d02cf882b61ac..66eb0169ac1ec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala @@ -29,12 +29,12 @@ import org.apache.spark.network.util.JavaUtils import org.apache.spark.sql.{AnalysisException, DataFrame, SQLContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} -import org.apache.spark.sql.execution.streaming.continuous.ContinuousRateStreamReader -import org.apache.spark.sql.execution.streaming.sources.RateStreamV2Reader +import org.apache.spark.sql.execution.streaming.continuous.RateStreamContinuousReader +import org.apache.spark.sql.execution.streaming.sources.RateStreamMicroBatchReader import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} import org.apache.spark.sql.sources.v2._ -import org.apache.spark.sql.sources.v2.streaming.ContinuousReadSupport -import org.apache.spark.sql.sources.v2.streaming.reader.ContinuousReader +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, MicroBatchReadSupport} +import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, MicroBatchReader} import org.apache.spark.sql.types._ import org.apache.spark.util.{ManualClock, SystemClock} @@ -112,7 +112,7 @@ class RateSourceProvider extends StreamSourceProvider with DataSourceRegister schema: Optional[StructType], checkpointLocation: String, options: DataSourceV2Options): ContinuousReader = { - new ContinuousRateStreamReader(options) + new RateStreamContinuousReader(options) } override def shortName(): String = "rate" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 3e76bf7b7ca8f..24a8b000df0c1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -163,7 +163,7 @@ abstract class StreamExecution( var lastExecution: IncrementalExecution = _ /** Holds the most recent input data for each source. */ - protected var newData: Map[BaseStreamingSource, DataFrame] = _ + protected var newData: Map[BaseStreamingSource, LogicalPlan] = _ @volatile protected var streamDeathCause: StreamingQueryException = null @@ -418,7 +418,7 @@ abstract class StreamExecution( * Blocks the current thread until processing for data from the given `source` has reached at * least the given `Offset`. This method is intended for use primarily when writing tests. 
*/ - private[sql] def awaitOffset(source: Source, newOffset: Offset): Unit = { + private[sql] def awaitOffset(source: BaseStreamingSource, newOffset: Offset): Unit = { assertAwaitThread() def notDone = { val localCommittedOffsets = committedOffsets diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala index a9d50e3a112e7..a0ee683a895d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala @@ -61,7 +61,7 @@ case class StreamingRelation(dataSource: DataSource, sourceName: String, output: * [[org.apache.spark.sql.catalyst.plans.logical.LogicalPlan]]. */ case class StreamingExecutionRelation( - source: Source, + source: BaseStreamingSource, output: Seq[Attribute])(session: SparkSession) extends LeafNode { @@ -92,7 +92,7 @@ case class StreamingRelationV2( sourceName: String, extraOptions: Map[String, String], output: Seq[Attribute], - v1DataSource: DataSource)(session: SparkSession) + v1Relation: Option[StreamingRelation])(session: SparkSession) extends LeafNode { override def isStreaming: Boolean = true override def toString: String = sourceName diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index 2843ab13bde2b..9657b5e26d770 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SQLExecution -import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, WriteToDataSourceV2} +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, StreamingDataSourceV2Relation, WriteToDataSourceV2} import org.apache.spark.sql.execution.streaming.{ContinuousExecutionRelation, StreamingRelationV2, _} import org.apache.spark.sql.sources.v2.DataSourceV2Options import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, ContinuousWriteSupport} @@ -174,7 +174,7 @@ class ContinuousExecution( val loggedOffset = offsets.offsets(0) val realOffset = loggedOffset.map(off => reader.deserializeOffset(off.json)) reader.setOffset(java.util.Optional.ofNullable(realOffset.orNull)) - DataSourceV2Relation(newOutput, reader) + new StreamingDataSourceV2Relation(newOutput, reader) } // Rewire the plan to use the new attributes that were returned by the source. 
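Note: the continuous path above is selected by trigger type. A hedged sketch of a query that routes through `ContinuousExecution` via `Trigger.Continuous`, which the `StreamingQueryManager` change later in this patch matches against `ContinuousWriteSupport` sinks (whether the console sink implements continuous writes at this point in the series is an assumption):

```scala
// Sketch only: continuous-mode query over the rate source (ContinuousReadSupport).
// Trigger.Continuous selects ContinuousExecution instead of MicroBatchExecution.
import org.apache.spark.sql.streaming.Trigger

val query = spark.readStream
  .format("rate")
  .option("rowsPerSecond", "5")
  .load()
  .writeStream
  .format("console")
  .trigger(Trigger.Continuous("1 second"))
  .start()
```
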
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala index c9aa78a5a2e28..b4b21e7d2052f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala @@ -32,10 +32,10 @@ import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousDataReader, ContinuousReader, Offset, PartitionOffset} import org.apache.spark.sql.types.{LongType, StructField, StructType, TimestampType} -case class ContinuousRateStreamPartitionOffset( +case class RateStreamPartitionOffset( partition: Int, currentValue: Long, currentTimeMs: Long) extends PartitionOffset -class ContinuousRateStreamReader(options: DataSourceV2Options) +class RateStreamContinuousReader(options: DataSourceV2Options) extends ContinuousReader { implicit val defaultFormats: DefaultFormats = DefaultFormats @@ -48,7 +48,7 @@ class ContinuousRateStreamReader(options: DataSourceV2Options) override def mergeOffsets(offsets: Array[PartitionOffset]): Offset = { assert(offsets.length == numPartitions) val tuples = offsets.map { - case ContinuousRateStreamPartitionOffset(i, currVal, nextRead) => + case RateStreamPartitionOffset(i, currVal, nextRead) => (i, ValueRunTimeMsPair(currVal, nextRead)) } RateStreamOffset(Map(tuples: _*)) @@ -86,7 +86,7 @@ class ContinuousRateStreamReader(options: DataSourceV2Options) val start = partitionStartMap(i) // Have each partition advance by numPartitions each row, with starting points staggered // by their partition index. 
- RateStreamReadTask( + RateStreamContinuousReadTask( start.value, start.runTimeMs, i, @@ -101,7 +101,7 @@ class ContinuousRateStreamReader(options: DataSourceV2Options) } -case class RateStreamReadTask( +case class RateStreamContinuousReadTask( startValue: Long, startTimeMs: Long, partitionIndex: Int, @@ -109,10 +109,11 @@ case class RateStreamReadTask( rowsPerSecond: Double) extends ReadTask[Row] { override def createDataReader(): DataReader[Row] = - new RateStreamDataReader(startValue, startTimeMs, partitionIndex, increment, rowsPerSecond) + new RateStreamContinuousDataReader( + startValue, startTimeMs, partitionIndex, increment, rowsPerSecond) } -class RateStreamDataReader( +class RateStreamContinuousDataReader( startValue: Long, startTimeMs: Long, partitionIndex: Int, @@ -151,5 +152,5 @@ class RateStreamDataReader( override def close(): Unit = {} override def getOffset(): PartitionOffset = - ContinuousRateStreamPartitionOffset(partitionIndex, currentValue, nextReadTime) + RateStreamPartitionOffset(partitionIndex, currentValue, nextReadTime) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala index 97bada08bcd2b..c0ed12cec25ef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala @@ -28,17 +28,38 @@ import org.json4s.jackson.Serialization import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.streaming.{RateStreamOffset, ValueRunTimeMsPair} -import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.DataSourceRegister +import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} import org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.sources.v2.streaming.MicroBatchReadSupport import org.apache.spark.sql.sources.v2.streaming.reader.{MicroBatchReader, Offset} import org.apache.spark.sql.types.{LongType, StructField, StructType, TimestampType} -import org.apache.spark.util.SystemClock +import org.apache.spark.util.{ManualClock, SystemClock} -class RateStreamV2Reader(options: DataSourceV2Options) +/** + * This is a temporary register as we build out v2 migration. Microbatch read support should + * be implemented in the same register as v1. + */ +class RateSourceProviderV2 extends DataSourceV2 with MicroBatchReadSupport with DataSourceRegister { + override def createMicroBatchReader( + schema: Optional[StructType], + checkpointLocation: String, + options: DataSourceV2Options): MicroBatchReader = { + new RateStreamMicroBatchReader(options) + } + + override def shortName(): String = "ratev2" +} + +class RateStreamMicroBatchReader(options: DataSourceV2Options) extends MicroBatchReader { implicit val defaultFormats: DefaultFormats = DefaultFormats - val clock = new SystemClock + val clock = { + // The option to use a manual clock is provided only for unit testing purposes. 
+ if (options.get("useManualClock").orElse("false").toBoolean) new ManualClock + else new SystemClock + } private val numPartitions = options.get(RateStreamSourceV2.NUM_PARTITIONS).orElse("5").toInt @@ -111,7 +132,7 @@ class RateStreamV2Reader(options: DataSourceV2Options) val packedRows = mutable.ListBuffer[(Long, Long)]() var outVal = startVal + numPartitions - var outTimeMs = startTimeMs + msPerPartitionBetweenRows + var outTimeMs = startTimeMs while (outVal <= endVal) { packedRows.append((outTimeMs, outVal)) outVal += numPartitions diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 2e92beecf2c17..52f2e2639cd86 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.streaming -import java.util.Locale +import java.util.{Locale, Optional} import scala.collection.JavaConverters._ @@ -27,8 +27,9 @@ import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, SparkSession import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.{StreamingRelation, StreamingRelationV2} +import org.apache.spark.sql.sources.StreamSourceProvider import org.apache.spark.sql.sources.v2.DataSourceV2Options -import org.apache.spark.sql.sources.v2.streaming.ContinuousReadSupport +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, MicroBatchReadSupport} import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -166,19 +167,31 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo userSpecifiedSchema = userSpecifiedSchema, className = source, options = extraOptions.toMap) + val v1Relation = ds match { + case _: StreamSourceProvider => Some(StreamingRelation(v1DataSource)) + case _ => None + } ds match { + case s: MicroBatchReadSupport => + val tempReader = s.createMicroBatchReader( + Optional.ofNullable(userSpecifiedSchema.orNull), + Utils.createTempDir(namePrefix = s"temporaryReader").getCanonicalPath, + options) + Dataset.ofRows( + sparkSession, + StreamingRelationV2( + s, source, extraOptions.toMap, + tempReader.readSchema().toAttributes, v1Relation)(sparkSession)) case s: ContinuousReadSupport => val tempReader = s.createContinuousReader( - java.util.Optional.ofNullable(userSpecifiedSchema.orNull), + Optional.ofNullable(userSpecifiedSchema.orNull), Utils.createTempDir(namePrefix = s"temporaryReader").getCanonicalPath, options) - // Generate the V1 node to catch errors thrown within generation. - StreamingRelation(v1DataSource) Dataset.ofRows( sparkSession, StreamingRelationV2( s, source, extraOptions.toMap, - tempReader.readSchema().toAttributes, v1DataSource)(sparkSession)) + tempReader.readSchema().toAttributes, v1Relation)(sparkSession)) case _ => // Code path for data source v1. 
Dataset.ofRows(sparkSession, StreamingRelation(v1DataSource)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index b508f4406138f..4b27e0d4ef47b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -29,10 +29,10 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, DataFrame, SparkSession} import org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution +import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, ContinuousTrigger} import org.apache.spark.sql.execution.streaming.state.StateStoreCoordinatorRef import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.streaming.ContinuousWriteSupport +import org.apache.spark.sql.sources.v2.streaming.{ContinuousWriteSupport, MicroBatchWriteSupport} import org.apache.spark.util.{Clock, SystemClock, Utils} /** @@ -240,31 +240,35 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo "is not supported in streaming DataFrames/Datasets and will be disabled.") } - sink match { - case v1Sink: Sink => - new StreamingQueryWrapper(new MicroBatchExecution( + (sink, trigger) match { + case (v2Sink: ContinuousWriteSupport, trigger: ContinuousTrigger) => + UnsupportedOperationChecker.checkForContinuous(analyzedPlan, outputMode) + new StreamingQueryWrapper(new ContinuousExecution( sparkSession, userSpecifiedName.orNull, checkpointLocation, analyzedPlan, - v1Sink, + v2Sink, trigger, triggerClock, outputMode, + extraOptions, deleteCheckpointOnStop)) - case v2Sink: ContinuousWriteSupport => - UnsupportedOperationChecker.checkForContinuous(analyzedPlan, outputMode) - new StreamingQueryWrapper(new ContinuousExecution( + case (_: MicroBatchWriteSupport, _) | (_: Sink, _) => + new StreamingQueryWrapper(new MicroBatchExecution( sparkSession, userSpecifiedName.orNull, checkpointLocation, analyzedPlan, - v2Sink, + sink, trigger, triggerClock, outputMode, extraOptions, deleteCheckpointOnStop)) + case (_: ContinuousWriteSupport, t) if !t.isInstanceOf[ContinuousTrigger] => + throw new AnalysisException( + "Sink only supports continuous writes, but a continuous trigger was not specified.") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala index e11705a227f48..85085d43061bd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala @@ -18,20 +18,64 @@ package org.apache.spark.sql.execution.streaming import java.util.Optional +import java.util.concurrent.TimeUnit import scala.collection.JavaConverters._ import org.apache.spark.sql.Row import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.continuous._ -import org.apache.spark.sql.execution.streaming.sources.{RateStreamBatchTask, RateStreamSourceV2, RateStreamV2Reader} +import org.apache.spark.sql.execution.streaming.sources.{RateStreamBatchTask, RateStreamMicroBatchReader, 
RateStreamSourceV2} import org.apache.spark.sql.sources.v2.DataSourceV2Options -import org.apache.spark.sql.sources.v2.streaming.ContinuousReadSupport +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, MicroBatchReadSupport} import org.apache.spark.sql.streaming.StreamTest +import org.apache.spark.util.ManualClock class RateSourceV2Suite extends StreamTest { + import testImplicits._ + + case class AdvanceRateManualClock(seconds: Long) extends AddData { + override def addData(query: Option[StreamExecution]): (BaseStreamingSource, Offset) = { + assert(query.nonEmpty) + val rateSource = query.get.logicalPlan.collect { + case StreamingExecutionRelation(source: RateStreamMicroBatchReader, _) => source + }.head + rateSource.clock.asInstanceOf[ManualClock].advance(TimeUnit.SECONDS.toMillis(seconds)) + rateSource.setOffsetRange(Optional.empty(), Optional.empty()) + (rateSource, rateSource.getEndOffset()) + } + } + + test("microbatch in registry") { + DataSource.lookupDataSource("ratev2", spark.sqlContext.conf).newInstance() match { + case ds: MicroBatchReadSupport => + val reader = ds.createMicroBatchReader(Optional.empty(), "", DataSourceV2Options.empty()) + assert(reader.isInstanceOf[RateStreamMicroBatchReader]) + case _ => + throw new IllegalStateException("Could not find v2 read support for rate") + } + } + + test("basic microbatch execution") { + val input = spark.readStream + .format("rateV2") + .option("numPartitions", "1") + .option("rowsPerSecond", "10") + .option("useManualClock", "true") + .load() + testStream(input, useV2Sink = true)( + AdvanceRateManualClock(seconds = 1), + CheckLastBatch((0 until 10).map(v => new java.sql.Timestamp(v * 100L) -> v): _*), + StopStream, + StartStream(), + // Advance 2 seconds because creating a new RateSource will also create a new ManualClock + AdvanceRateManualClock(seconds = 2), + CheckLastBatch((10 until 20).map(v => new java.sql.Timestamp(v * 100L) -> v): _*) + ) + } + test("microbatch - numPartitions propagated") { - val reader = new RateStreamV2Reader( + val reader = new RateStreamMicroBatchReader( new DataSourceV2Options(Map("numPartitions" -> "11", "rowsPerSecond" -> "33").asJava)) reader.setOffsetRange(Optional.empty(), Optional.empty()) val tasks = reader.createReadTasks() @@ -39,7 +83,7 @@ class RateSourceV2Suite extends StreamTest { } test("microbatch - set offset") { - val reader = new RateStreamV2Reader(DataSourceV2Options.empty()) + val reader = new RateStreamMicroBatchReader(DataSourceV2Options.empty()) val startOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(0, 1000)))) val endOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(0, 2000)))) reader.setOffsetRange(Optional.of(startOffset), Optional.of(endOffset)) @@ -48,7 +92,7 @@ class RateSourceV2Suite extends StreamTest { } test("microbatch - infer offsets") { - val reader = new RateStreamV2Reader( + val reader = new RateStreamMicroBatchReader( new DataSourceV2Options(Map("numPartitions" -> "1", "rowsPerSecond" -> "100").asJava)) reader.clock.waitTillTime(reader.clock.getTimeMillis() + 100) reader.setOffsetRange(Optional.empty(), Optional.empty()) @@ -69,7 +113,7 @@ class RateSourceV2Suite extends StreamTest { } test("microbatch - predetermined batch size") { - val reader = new RateStreamV2Reader( + val reader = new RateStreamMicroBatchReader( new DataSourceV2Options(Map("numPartitions" -> "1", "rowsPerSecond" -> "20").asJava)) val startOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(0, 1000)))) val endOffset = RateStreamOffset(Map((0, 
ValueRunTimeMsPair(20, 2000)))) @@ -80,7 +124,7 @@ class RateSourceV2Suite extends StreamTest { } test("microbatch - data read") { - val reader = new RateStreamV2Reader( + val reader = new RateStreamMicroBatchReader( new DataSourceV2Options(Map("numPartitions" -> "11", "rowsPerSecond" -> "33").asJava)) val startOffset = RateStreamSourceV2.createInitialOffset(11, reader.creationTimeMs) val endOffset = RateStreamOffset(startOffset.partitionToValueAndRunTimeMs.toSeq.map { @@ -107,14 +151,14 @@ class RateSourceV2Suite extends StreamTest { DataSource.lookupDataSource("rate", spark.sqlContext.conf).newInstance() match { case ds: ContinuousReadSupport => val reader = ds.createContinuousReader(Optional.empty(), "", DataSourceV2Options.empty()) - assert(reader.isInstanceOf[ContinuousRateStreamReader]) + assert(reader.isInstanceOf[RateStreamContinuousReader]) case _ => throw new IllegalStateException("Could not find v2 read support for rate") } } test("continuous data") { - val reader = new ContinuousRateStreamReader( + val reader = new RateStreamContinuousReader( new DataSourceV2Options(Map("numPartitions" -> "2", "rowsPerSecond" -> "20").asJava)) reader.setOffset(Optional.empty()) val tasks = reader.createReadTasks() @@ -122,17 +166,17 @@ class RateSourceV2Suite extends StreamTest { val data = scala.collection.mutable.ListBuffer[Row]() tasks.asScala.foreach { - case t: RateStreamReadTask => + case t: RateStreamContinuousReadTask => val startTimeMs = reader.getStartOffset() .asInstanceOf[RateStreamOffset] .partitionToValueAndRunTimeMs(t.partitionIndex) .runTimeMs - val r = t.createDataReader().asInstanceOf[RateStreamDataReader] + val r = t.createDataReader().asInstanceOf[RateStreamContinuousDataReader] for (rowIndex <- 0 to 9) { r.next() data.append(r.get()) assert(r.getOffset() == - ContinuousRateStreamPartitionOffset( + RateStreamPartitionOffset( t.partitionIndex, t.partitionIndex + rowIndex * 2, startTimeMs + (rowIndex + 1) * 100)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index 4b7f0fbe97d4e..d46461fa9bf6d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -105,7 +105,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be * the active query, and then return the source object the data was added, as well as the * offset of added data. */ - def addData(query: Option[StreamExecution]): (Source, Offset) + def addData(query: Option[StreamExecution]): (BaseStreamingSource, Offset) } /** A trait that can be extended when testing a source. 
*/ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala index eda0d8ad48313..9562c10feafe9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala @@ -61,7 +61,7 @@ class ContinuousSuiteBase extends StreamTest { case s: ContinuousExecution => assert(numTriggers >= 2, "must wait for at least 2 triggers to ensure query is initialized") val reader = s.lastExecution.executedPlan.collectFirst { - case DataSourceV2ScanExec(_, r: ContinuousRateStreamReader) => r + case DataSourceV2ScanExec(_, r: RateStreamContinuousReader) => r }.get val deltaMs = numTriggers * 1000 + 300 From df047bdbaf7688cf5bcd8f3842711e551cc49057 Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Wed, 10 Jan 2018 10:15:27 +0800 Subject: [PATCH 052/566] [SPARK-23005][CORE] Improve RDD.take on small number of partitions ## What changes were proposed in this pull request? In current implementation of RDD.take, we overestimate the number of partitions we need to try by 50%: `(1.5 * num * partsScanned / buf.size).toInt` However, when the number is small, the result of `.toInt` is not what we want. E.g, 2.9 will become 2, which should be 3. Use Math.ceil to fix the problem. Also clean up the code in RDD.scala. ## How was this patch tested? Unit test Author: Wang Gengliang Closes #20200 from gengliangwang/Take. (cherry picked from commit 96ba217a06fbe1dad703447d7058cb7841653861) Signed-off-by: Wenchen Fan --- .../main/scala/org/apache/spark/rdd/RDD.scala | 27 +++++++++---------- .../spark/sql/execution/SparkPlan.scala | 5 ++-- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 8798dfc925362..7859781e98223 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -150,7 +150,7 @@ abstract class RDD[T: ClassTag]( val id: Int = sc.newRddId() /** A friendly name for this RDD */ - @transient var name: String = null + @transient var name: String = _ /** Assign a name to this RDD */ def setName(_name: String): this.type = { @@ -224,8 +224,8 @@ abstract class RDD[T: ClassTag]( // Our dependencies and partitions will be gotten by calling subclass's methods below, and will // be overwritten when we're checkpointed - private var dependencies_ : Seq[Dependency[_]] = null - @transient private var partitions_ : Array[Partition] = null + private var dependencies_ : Seq[Dependency[_]] = _ + @transient private var partitions_ : Array[Partition] = _ /** An Option holding our checkpoint RDD, if we are checkpointed */ private def checkpointRDD: Option[CheckpointRDD[T]] = checkpointData.flatMap(_.checkpointRDD) @@ -297,7 +297,7 @@ abstract class RDD[T: ClassTag]( private[spark] def getNarrowAncestors: Seq[RDD[_]] = { val ancestors = new mutable.HashSet[RDD[_]] - def visit(rdd: RDD[_]) { + def visit(rdd: RDD[_]): Unit = { val narrowDependencies = rdd.dependencies.filter(_.isInstanceOf[NarrowDependency[_]]) val narrowParents = narrowDependencies.map(_.rdd) val narrowParentsNotVisited = narrowParents.filterNot(ancestors.contains) @@ -449,7 +449,7 @@ abstract class RDD[T: ClassTag]( if (shuffle) { /** Distributes elements evenly across output partitions, starting from a random partition. 
*/ val distributePartition = (index: Int, items: Iterator[T]) => { - var position = (new Random(hashing.byteswap32(index))).nextInt(numPartitions) + var position = new Random(hashing.byteswap32(index)).nextInt(numPartitions) items.map { t => // Note that the hash code of the key will just be the key itself. The HashPartitioner // will mod it with the number of total partitions. @@ -951,7 +951,7 @@ abstract class RDD[T: ClassTag]( def collectPartition(p: Int): Array[T] = { sc.runJob(this, (iter: Iterator[T]) => iter.toArray, Seq(p)).head } - (0 until partitions.length).iterator.flatMap(i => collectPartition(i)) + partitions.indices.iterator.flatMap(i => collectPartition(i)) } /** @@ -1338,6 +1338,7 @@ abstract class RDD[T: ClassTag]( // The number of partitions to try in this iteration. It is ok for this number to be // greater than totalParts because we actually cap it at totalParts in runJob. var numPartsToTry = 1L + val left = num - buf.size if (partsScanned > 0) { // If we didn't find any rows after the previous iteration, quadruple and retry. // Otherwise, interpolate the number of partitions we need to try, but overestimate @@ -1345,13 +1346,12 @@ abstract class RDD[T: ClassTag]( if (buf.isEmpty) { numPartsToTry = partsScanned * scaleUpFactor } else { - // the left side of max is >=1 whenever partsScanned >= 2 - numPartsToTry = Math.max((1.5 * num * partsScanned / buf.size).toInt - partsScanned, 1) + // As left > 0, numPartsToTry is always >= 1 + numPartsToTry = Math.ceil(1.5 * left * partsScanned / buf.size).toInt numPartsToTry = Math.min(numPartsToTry, partsScanned * scaleUpFactor) } } - val left = num - buf.size val p = partsScanned.until(math.min(partsScanned + numPartsToTry, totalParts).toInt) val res = sc.runJob(this, (it: Iterator[T]) => it.take(left).toArray, p) @@ -1677,8 +1677,7 @@ abstract class RDD[T: ClassTag]( // an RDD and its parent in every batch, in which case the parent may never be checkpointed // and its lineage never truncated, leading to OOMs in the long run (SPARK-6847). private val checkpointAllMarkedAncestors = - Option(sc.getLocalProperty(RDD.CHECKPOINT_ALL_MARKED_ANCESTORS)) - .map(_.toBoolean).getOrElse(false) + Option(sc.getLocalProperty(RDD.CHECKPOINT_ALL_MARKED_ANCESTORS)).exists(_.toBoolean) /** Returns the first parent RDD */ protected[spark] def firstParent[U: ClassTag]: RDD[U] = { @@ -1686,7 +1685,7 @@ abstract class RDD[T: ClassTag]( } /** Returns the jth parent RDD: e.g. rdd.parent[T](0) is equivalent to rdd.firstParent[T] */ - protected[spark] def parent[U: ClassTag](j: Int) = { + protected[spark] def parent[U: ClassTag](j: Int): RDD[U] = { dependencies(j).rdd.asInstanceOf[RDD[U]] } @@ -1754,7 +1753,7 @@ abstract class RDD[T: ClassTag]( * collected. Subclasses of RDD may override this method for implementing their own cleaning * logic. See [[org.apache.spark.rdd.UnionRDD]] for an example. 
*/ - protected def clearDependencies() { + protected def clearDependencies(): Unit = { dependencies_ = null } @@ -1790,7 +1789,7 @@ abstract class RDD[T: ClassTag]( val lastDepStrings = debugString(lastDep.rdd, prefix, lastDep.isInstanceOf[ShuffleDependency[_, _, _]], true) - (frontDepStrings ++ lastDepStrings) + frontDepStrings ++ lastDepStrings } } // The first RDD in the dependency stack has no parents, so no need for a +- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index 82300efc01632..398758a3331b4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -351,8 +351,9 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ if (buf.isEmpty) { numPartsToTry = partsScanned * limitScaleUpFactor } else { - // the left side of max is >=1 whenever partsScanned >= 2 - numPartsToTry = Math.max((1.5 * n * partsScanned / buf.size).toInt - partsScanned, 1) + val left = n - buf.size + // As left > 0, numPartsToTry is always >= 1 + numPartsToTry = Math.ceil(1.5 * left * partsScanned / buf.size).toInt numPartsToTry = Math.min(numPartsToTry, partsScanned * limitScaleUpFactor) } } From 45f5c3cee9f3547ee0d526a5a8df38ae3f5e2796 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 10 Jan 2018 10:20:34 +0800 Subject: [PATCH 053/566] [MINOR] fix a typo in BroadcastJoinSuite ## What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` should be `BroadcastHashJoinExec` ## How was this patch tested? N/A Author: Wenchen Fan Closes #20202 from cloud-fan/typo. (cherry picked from commit 6f169ca9e1444fe8fd1ab6f3fbf0a8be1670f1b5) Signed-off-by: Wenchen Fan --- .../apache/spark/sql/execution/joins/BroadcastJoinSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala index 6da46ea3480b3..0bcd54e1fceab 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala @@ -318,7 +318,7 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils { case b: BroadcastNestedLoopJoinExec => assert(b.getClass.getSimpleName === joinMethod) assert(b.buildSide === buildSide) - case b: BroadcastNestedLoopJoinExec => + case b: BroadcastHashJoinExec => assert(b.getClass.getSimpleName === joinMethod) assert(b.buildSide === buildSide) case w: WholeStageCodegenExec => From 20a8c88671da73ab5d0e7048fe0424022ec16853 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 10 Jan 2018 14:00:07 +0900 Subject: [PATCH 054/566] [SPARK-23018][PYTHON] Fix createDataFrame from Pandas timestamp series assignment ## What changes were proposed in this pull request? This fixes createDataFrame from Pandas to only assign modified timestamp series back to a copied version of the Pandas DataFrame. Previously, if the Pandas DataFrame was only a reference (e.g. a slice of another) each series will still get assigned back to the reference even if it is not a modified timestamp column. This caused the following warning "SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame." ## How was this patch tested? 
existing tests Author: Bryan Cutler Closes #20213 from BryanCutler/pyspark-createDataFrame-copy-slice-warn-SPARK-23018. (cherry picked from commit 7bcc2666810cefc85dfa0d6679ac7a0de9e23154) Signed-off-by: Takuya UESHIN --- python/pyspark/sql/session.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 6052fa9e84096..3e4574729a631 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -459,21 +459,23 @@ def _convert_from_pandas(self, pdf, schema, timezone): # TODO: handle nested timestamps, such as ArrayType(TimestampType())? if isinstance(field.dataType, TimestampType): s = _check_series_convert_timestamps_tz_local(pdf[field.name], timezone) - if not copied and s is not pdf[field.name]: - # Copy once if the series is modified to prevent the original Pandas - # DataFrame from being updated - pdf = pdf.copy() - copied = True - pdf[field.name] = s + if s is not pdf[field.name]: + if not copied: + # Copy once if the series is modified to prevent the original + # Pandas DataFrame from being updated + pdf = pdf.copy() + copied = True + pdf[field.name] = s else: for column, series in pdf.iteritems(): - s = _check_series_convert_timestamps_tz_local(pdf[column], timezone) - if not copied and s is not pdf[column]: - # Copy once if the series is modified to prevent the original Pandas - # DataFrame from being updated - pdf = pdf.copy() - copied = True - pdf[column] = s + s = _check_series_convert_timestamps_tz_local(series, timezone) + if s is not series: + if not copied: + # Copy once if the series is modified to prevent the original + # Pandas DataFrame from being updated + pdf = pdf.copy() + copied = True + pdf[column] = s # Convert pandas.DataFrame to list of numpy records np_records = pdf.to_records(index=False) From 162c5beccf6dfc1c8a4b7e2892cefd96a87ec900 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 10 Jan 2018 15:01:11 +0800 Subject: [PATCH 055/566] [SPARK-22982] Remove unsafe asynchronous close() call from FileDownloadChannel ## What changes were proposed in this pull request? This patch fixes a severe asynchronous IO bug in Spark's Netty-based file transfer code. At a high-level, the problem is that an unsafe asynchronous `close()` of a pipe's source channel creates a race condition where file transfer code closes a file descriptor then attempts to read from it. If the closed file descriptor's number has been reused by an `open()` call then this invalid read may cause unrelated file operations to return incorrect results. **One manifestation of this problem is incorrect query results.** For a high-level overview of how file download works, take a look at the control flow in `NettyRpcEnv.openChannel()`: this code creates a pipe to buffer results, then submits an asynchronous stream request to a lower-level TransportClient. The callback passes received data to the sink end of the pipe. The source end of the pipe is passed back to the caller of `openChannel()`. Thus `openChannel()` returns immediately and callers interact with the returned pipe source channel. Because the underlying stream request is asynchronous, errors may occur after `openChannel()` has returned and after that method's caller has started to `read()` from the returned channel. For example, if a client requests an invalid stream from a remote server then the "stream does not exist" error may not be received from the remote server until after `openChannel()` has returned. 
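To make the pipe flow above concrete, here is a minimal standalone Scala sketch using plain `java.nio.channels.Pipe` rather than Spark's `NettyRpcEnv` or `FileDownloadChannel` classes (all names here are illustrative, not Spark APIs): one thread stands in for the stream callback and writes into the sink end, while the thread holding the source end does all the reads and is the only one that closes the source, in its own `finally` block.

```scala
import java.nio.ByteBuffer
import java.nio.channels.Pipe

object PipeFlowSketch {
  def main(args: Array[String]): Unit = {
    val pipe = Pipe.open()

    // Stand-in for the RPC callback thread: it writes received data into the sink
    // and closes only the sink when the stream ends, which unblocks the reader with
    // an EOF instead of closing the source channel out from under it.
    val writer = new Thread(new Runnable {
      override def run(): Unit = {
        val sink = pipe.sink()
        try {
          sink.write(ByteBuffer.wrap("some downloaded bytes".getBytes("UTF-8")))
        } finally {
          sink.close()
        }
      }
    })
    writer.start()

    // Stand-in for the application thread that was handed the source end: it owns
    // every read() and the close(), so no other thread races with its file descriptor.
    val source = pipe.source()
    val buf = ByteBuffer.allocate(1024)
    try {
      while (source.read(buf) != -1) { buf.clear() }
    } finally {
      source.close()
    }
    writer.join()
  }
}
```

Closing only the sink from the writer side delivers EOF to a blocked read, which is the behavior the fix below relies on instead of an asynchronous `source.close()`.
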
In order to be able to propagate the "stream does not exist" error to the file-fetching application thread, this code wraps the pipe's source channel in a special `FileDownloadChannel` which adds an `setError(t: Throwable)` method, then calls this `setError()` method in the FileDownloadCallback's `onFailure` method. It is possible for `FileDownloadChannel`'s `read()` and `setError()` methods to be called concurrently from different threads: the `setError()` method is called from within the Netty RPC system's stream callback handlers, while the `read()` methods are called from higher-level application code performing remote stream reads. The problem lies in `setError()`: the existing code closed the wrapped pipe source channel. Because `read()` and `setError()` occur in different threads, this means it is possible for one thread to be calling `source.read()` while another asynchronously calls `source.close()`. Java's IO libraries do not guarantee that this will be safe and, in fact, it's possible for these operations to interleave in such a way that a lower-level `read()` system call occurs right after a `close()` call. In the best-case, this fails as a read of a closed file descriptor; in the worst-case, the file descriptor number has been re-used by an intervening `open()` operation and the read corrupts the result of an unrelated file IO operation being performed by a different thread. The solution here is to remove the `stream.close()` call in `onError()`: the thread that is performing the `read()` calls is responsible for closing the stream in a `finally` block, so there's no need to close it here. If that thread is blocked in a `read()` then it will become unblocked when the sink end of the pipe is closed in `FileDownloadCallback.onFailure()`. After making this change, we also need to refine the `read()` method to always check for a `setError()` result, even if the underlying channel `read()` call has succeeded. This patch also makes a slight cleanup to a dodgy-looking `catch e: Exception` block to use a safer `try-finally` error handling idiom. This bug was introduced in SPARK-11956 / #9941 and is present in Spark 1.6.0+. ## How was this patch tested? This fix was tested manually against a workload which non-deterministically hit this bug. Author: Josh Rosen Closes #20179 from JoshRosen/SPARK-22982-fix-unsafe-async-io-in-file-download-channel. 
(cherry picked from commit edf0a48c2ec696b92ed6a96dcee6eeb1a046b20b) Signed-off-by: Wenchen Fan --- .../apache/spark/rpc/netty/NettyRpcEnv.scala | 37 +++++++++++-------- .../shuffle/IndexShuffleBlockResolver.scala | 21 +++++++++-- 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala index f951591e02a5c..a2936d6ad539c 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala @@ -332,16 +332,14 @@ private[netty] class NettyRpcEnv( val pipe = Pipe.open() val source = new FileDownloadChannel(pipe.source()) - try { + Utils.tryWithSafeFinallyAndFailureCallbacks(block = { val client = downloadClient(parsedUri.getHost(), parsedUri.getPort()) val callback = new FileDownloadCallback(pipe.sink(), source, client) client.stream(parsedUri.getPath(), callback) - } catch { - case e: Exception => - pipe.sink().close() - source.close() - throw e - } + })(catchBlock = { + pipe.sink().close() + source.close() + }) source } @@ -370,24 +368,33 @@ private[netty] class NettyRpcEnv( fileDownloadFactory.createClient(host, port) } - private class FileDownloadChannel(source: ReadableByteChannel) extends ReadableByteChannel { + private class FileDownloadChannel(source: Pipe.SourceChannel) extends ReadableByteChannel { @volatile private var error: Throwable = _ def setError(e: Throwable): Unit = { + // This setError callback is invoked by internal RPC threads in order to propagate remote + // exceptions to application-level threads which are reading from this channel. When an + // RPC error occurs, the RPC system will call setError() and then will close the + // Pipe.SinkChannel corresponding to the other end of the `source` pipe. Closing of the pipe + // sink will cause `source.read()` operations to return EOF, unblocking the application-level + // reading thread. Thus there is no need to actually call `source.close()` here in the + // onError() callback and, in fact, calling it here would be dangerous because the close() + // would be asynchronous with respect to the read() call and could trigger race-conditions + // that lead to data corruption. See the PR for SPARK-22982 for more details on this topic. error = e - source.close() } override def read(dst: ByteBuffer): Int = { Try(source.read(dst)) match { + // See the documentation above in setError(): if an RPC error has occurred then setError() + // will be called to propagate the RPC error and then `source`'s corresponding + // Pipe.SinkChannel will be closed, unblocking this read. 
In that case, we want to propagate + // the remote RPC exception (and not any exceptions triggered by the pipe close, such as + // ChannelClosedException), hence this `error != null` check: + case _ if error != null => throw error case Success(bytesRead) => bytesRead - case Failure(readErr) => - if (error != null) { - throw error - } else { - throw readErr - } + case Failure(readErr) => throw readErr } } diff --git a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala index 15540485170d0..266ee42e39cca 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala @@ -18,8 +18,8 @@ package org.apache.spark.shuffle import java.io._ - -import com.google.common.io.ByteStreams +import java.nio.channels.Channels +import java.nio.file.Files import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.internal.Logging @@ -196,11 +196,24 @@ private[spark] class IndexShuffleBlockResolver( // find out the consolidated file, then the offset within that from our index val indexFile = getIndexFile(blockId.shuffleId, blockId.mapId) - val in = new DataInputStream(new FileInputStream(indexFile)) + // SPARK-22982: if this FileInputStream's position is seeked forward by another piece of code + // which is incorrectly using our file descriptor then this code will fetch the wrong offsets + // (which may cause a reducer to be sent a different reducer's data). The explicit position + // checks added here were a useful debugging aid during SPARK-22982 and may help prevent this + // class of issue from re-occurring in the future which is why they are left here even though + // SPARK-22982 is fixed. + val channel = Files.newByteChannel(indexFile.toPath) + channel.position(blockId.reduceId * 8) + val in = new DataInputStream(Channels.newInputStream(channel)) try { - ByteStreams.skipFully(in, blockId.reduceId * 8) val offset = in.readLong() val nextOffset = in.readLong() + val actualPosition = channel.position() + val expectedPosition = blockId.reduceId * 8 + 16 + if (actualPosition != expectedPosition) { + throw new Exception(s"SPARK-22982: Incorrect channel position after index file reads: " + + s"expected $expectedPosition but actual position was $actualPosition.") + } new FileSegmentManagedBuffer( transportConf, getDataFile(blockId.shuffleId, blockId.mapId), From ecc24ec7fcf1225e5030a1c1224dca9a99138830 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 10 Jan 2018 15:16:27 +0800 Subject: [PATCH 056/566] [SPARK-16060][SQL][FOLLOW-UP] add a wrapper solution for vectorized orc reader ## What changes were proposed in this pull request? This is mostly from https://github.com/apache/spark/pull/13775 The wrapper solution is pretty good for string/binary type, as the ORC column vector doesn't keep bytes in a continuous memory region, and has a significant overhead when copying the data to Spark columnar batch. For other cases, the wrapper solution is almost same with the current solution. I think we can treat the wrapper solution as a baseline and keep improving the writing to Spark solution. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #20205 from cloud-fan/orc. 
(cherry picked from commit eaac60a1e20e29084b7151ffca964cfaa5ba99d1) Signed-off-by: Wenchen Fan --- .../apache/spark/sql/internal/SQLConf.scala | 7 + .../datasources/orc/OrcColumnVector.java | 251 ++++++++++++++++++ .../orc/OrcColumnarBatchReader.java | 106 ++++++-- .../datasources/orc/OrcFileFormat.scala | 6 +- .../spark/sql/hive/orc/OrcReadBenchmark.scala | 236 ++++++++++------ 5 files changed, 490 insertions(+), 116 deletions(-) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 74949db883f7a..36e802a9faa6f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -391,6 +391,13 @@ object SQLConf { .booleanConf .createWithDefault(true) + val ORC_COPY_BATCH_TO_SPARK = buildConf("spark.sql.orc.copyBatchToSpark") + .doc("Whether or not to copy the ORC columnar batch to Spark columnar batch in the " + + "vectorized ORC reader.") + .internal() + .booleanConf + .createWithDefault(false) + val ORC_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.orc.filterPushdown") .doc("When true, enable filter pushdown for ORC files.") .booleanConf diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java new file mode 100644 index 0000000000000..f94c55d860304 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.orc; + +import java.math.BigDecimal; + +import org.apache.orc.storage.ql.exec.vector.*; + +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.Decimal; +import org.apache.spark.sql.types.TimestampType; +import org.apache.spark.unsafe.types.UTF8String; + +/** + * A column vector class wrapping Hive's ColumnVector. Because Spark ColumnarBatch only accepts + * Spark's vectorized.ColumnVector, this column vector is used to adapt Hive ColumnVector with + * Spark ColumnarVector. 
+ */ +public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVector { + private ColumnVector baseData; + private LongColumnVector longData; + private DoubleColumnVector doubleData; + private BytesColumnVector bytesData; + private DecimalColumnVector decimalData; + private TimestampColumnVector timestampData; + final private boolean isTimestamp; + + private int batchSize; + + OrcColumnVector(DataType type, ColumnVector vector) { + super(type); + + if (type instanceof TimestampType) { + isTimestamp = true; + } else { + isTimestamp = false; + } + + baseData = vector; + if (vector instanceof LongColumnVector) { + longData = (LongColumnVector) vector; + } else if (vector instanceof DoubleColumnVector) { + doubleData = (DoubleColumnVector) vector; + } else if (vector instanceof BytesColumnVector) { + bytesData = (BytesColumnVector) vector; + } else if (vector instanceof DecimalColumnVector) { + decimalData = (DecimalColumnVector) vector; + } else if (vector instanceof TimestampColumnVector) { + timestampData = (TimestampColumnVector) vector; + } else { + throw new UnsupportedOperationException(); + } + } + + public void setBatchSize(int batchSize) { + this.batchSize = batchSize; + } + + @Override + public void close() { + + } + + @Override + public int numNulls() { + if (baseData.isRepeating) { + if (baseData.isNull[0]) { + return batchSize; + } else { + return 0; + } + } else if (baseData.noNulls) { + return 0; + } else { + int count = 0; + for (int i = 0; i < batchSize; i++) { + if (baseData.isNull[i]) count++; + } + return count; + } + } + + /* A helper method to get the row index in a column. */ + private int getRowIndex(int rowId) { + return baseData.isRepeating ? 0 : rowId; + } + + @Override + public boolean isNullAt(int rowId) { + return baseData.isNull[getRowIndex(rowId)]; + } + + @Override + public boolean getBoolean(int rowId) { + return longData.vector[getRowIndex(rowId)] == 1; + } + + @Override + public boolean[] getBooleans(int rowId, int count) { + boolean[] res = new boolean[count]; + for (int i = 0; i < count; i++) { + res[i] = getBoolean(rowId + i); + } + return res; + } + + @Override + public byte getByte(int rowId) { + return (byte) longData.vector[getRowIndex(rowId)]; + } + + @Override + public byte[] getBytes(int rowId, int count) { + byte[] res = new byte[count]; + for (int i = 0; i < count; i++) { + res[i] = getByte(rowId + i); + } + return res; + } + + @Override + public short getShort(int rowId) { + return (short) longData.vector[getRowIndex(rowId)]; + } + + @Override + public short[] getShorts(int rowId, int count) { + short[] res = new short[count]; + for (int i = 0; i < count; i++) { + res[i] = getShort(rowId + i); + } + return res; + } + + @Override + public int getInt(int rowId) { + return (int) longData.vector[getRowIndex(rowId)]; + } + + @Override + public int[] getInts(int rowId, int count) { + int[] res = new int[count]; + for (int i = 0; i < count; i++) { + res[i] = getInt(rowId + i); + } + return res; + } + + @Override + public long getLong(int rowId) { + int index = getRowIndex(rowId); + if (isTimestamp) { + return timestampData.time[index] * 1000 + timestampData.nanos[index] / 1000; + } else { + return longData.vector[index]; + } + } + + @Override + public long[] getLongs(int rowId, int count) { + long[] res = new long[count]; + for (int i = 0; i < count; i++) { + res[i] = getLong(rowId + i); + } + return res; + } + + @Override + public float getFloat(int rowId) { + return (float) doubleData.vector[getRowIndex(rowId)]; + } + + @Override + 
public float[] getFloats(int rowId, int count) { + float[] res = new float[count]; + for (int i = 0; i < count; i++) { + res[i] = getFloat(rowId + i); + } + return res; + } + + @Override + public double getDouble(int rowId) { + return doubleData.vector[getRowIndex(rowId)]; + } + + @Override + public double[] getDoubles(int rowId, int count) { + double[] res = new double[count]; + for (int i = 0; i < count; i++) { + res[i] = getDouble(rowId + i); + } + return res; + } + + @Override + public int getArrayLength(int rowId) { + throw new UnsupportedOperationException(); + } + + @Override + public int getArrayOffset(int rowId) { + throw new UnsupportedOperationException(); + } + + @Override + public Decimal getDecimal(int rowId, int precision, int scale) { + BigDecimal data = decimalData.vector[getRowIndex(rowId)].getHiveDecimal().bigDecimalValue(); + return Decimal.apply(data, precision, scale); + } + + @Override + public UTF8String getUTF8String(int rowId) { + int index = getRowIndex(rowId); + BytesColumnVector col = bytesData; + return UTF8String.fromBytes(col.vector[index], col.start[index], col.length[index]); + } + + @Override + public byte[] getBinary(int rowId) { + int index = getRowIndex(rowId); + byte[] binary = new byte[bytesData.length[index]]; + System.arraycopy(bytesData.vector[index], bytesData.start[index], binary, 0, binary.length); + return binary; + } + + @Override + public org.apache.spark.sql.vectorized.ColumnVector arrayData() { + throw new UnsupportedOperationException(); + } + + @Override + public org.apache.spark.sql.vectorized.ColumnVector getChildColumn(int ordinal) { + throw new UnsupportedOperationException(); + } +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java index 5c28d0e6e507a..36fdf2bdf84d2 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java @@ -51,13 +51,13 @@ public class OrcColumnarBatchReader extends RecordReader { /** - * The default size of batch. We use this value for both ORC and Spark consistently - * because they have different default values like the following. + * The default size of batch. We use this value for ORC reader to make it consistent with Spark's + * columnar batch, because their default batch sizes are different like the following: * * - ORC's VectorizedRowBatch.DEFAULT_SIZE = 1024 * - Spark's ColumnarBatch.DEFAULT_BATCH_SIZE = 4 * 1024 */ - public static final int DEFAULT_SIZE = 4 * 1024; + private static final int DEFAULT_SIZE = 4 * 1024; // ORC File Reader private Reader reader; @@ -82,13 +82,18 @@ public class OrcColumnarBatchReader extends RecordReader { // Writable column vectors of the result columnar batch. private WritableColumnVector[] columnVectors; - /** - * The memory mode of the columnarBatch - */ + // The wrapped ORC column vectors. It should be null if `copyToSpark` is true. + private org.apache.spark.sql.vectorized.ColumnVector[] orcVectorWrappers; + + // The memory mode of the columnarBatch private final MemoryMode MEMORY_MODE; - public OrcColumnarBatchReader(boolean useOffHeap) { + // Whether or not to copy the ORC columnar batch to Spark columnar batch. + private final boolean copyToSpark; + + public OrcColumnarBatchReader(boolean useOffHeap, boolean copyToSpark) { MEMORY_MODE = useOffHeap ? 
MemoryMode.OFF_HEAP : MemoryMode.ON_HEAP; + this.copyToSpark = copyToSpark; } @@ -167,27 +172,61 @@ public void initBatch( } int capacity = DEFAULT_SIZE; - if (MEMORY_MODE == MemoryMode.OFF_HEAP) { - columnVectors = OffHeapColumnVector.allocateColumns(capacity, resultSchema); - } else { - columnVectors = OnHeapColumnVector.allocateColumns(capacity, resultSchema); - } - columnarBatch = new ColumnarBatch(resultSchema, columnVectors, capacity); - if (partitionValues.numFields() > 0) { - int partitionIdx = requiredFields.length; - for (int i = 0; i < partitionValues.numFields(); i++) { - ColumnVectorUtils.populate(columnVectors[i + partitionIdx], partitionValues, i); - columnVectors[i + partitionIdx].setIsConstant(); + if (copyToSpark) { + if (MEMORY_MODE == MemoryMode.OFF_HEAP) { + columnVectors = OffHeapColumnVector.allocateColumns(capacity, resultSchema); + } else { + columnVectors = OnHeapColumnVector.allocateColumns(capacity, resultSchema); } - } - // Initialize the missing columns once. - for (int i = 0; i < requiredFields.length; i++) { - if (requestedColIds[i] == -1) { - columnVectors[i].putNulls(0, columnarBatch.capacity()); - columnVectors[i].setIsConstant(); + // Initialize the missing columns once. + for (int i = 0; i < requiredFields.length; i++) { + if (requestedColIds[i] == -1) { + columnVectors[i].putNulls(0, capacity); + columnVectors[i].setIsConstant(); + } + } + + if (partitionValues.numFields() > 0) { + int partitionIdx = requiredFields.length; + for (int i = 0; i < partitionValues.numFields(); i++) { + ColumnVectorUtils.populate(columnVectors[i + partitionIdx], partitionValues, i); + columnVectors[i + partitionIdx].setIsConstant(); + } + } + + columnarBatch = new ColumnarBatch(resultSchema, columnVectors, capacity); + } else { + // Just wrap the ORC column vector instead of copying it to Spark column vector. + orcVectorWrappers = new org.apache.spark.sql.vectorized.ColumnVector[resultSchema.length()]; + + for (int i = 0; i < requiredFields.length; i++) { + DataType dt = requiredFields[i].dataType(); + int colId = requestedColIds[i]; + // Initialize the missing columns once. + if (colId == -1) { + OnHeapColumnVector missingCol = new OnHeapColumnVector(capacity, dt); + missingCol.putNulls(0, capacity); + missingCol.setIsConstant(); + orcVectorWrappers[i] = missingCol; + } else { + orcVectorWrappers[i] = new OrcColumnVector(dt, batch.cols[colId]); + } } + + if (partitionValues.numFields() > 0) { + int partitionIdx = requiredFields.length; + for (int i = 0; i < partitionValues.numFields(); i++) { + DataType dt = partitionSchema.fields()[i].dataType(); + OnHeapColumnVector partitionCol = new OnHeapColumnVector(capacity, dt); + ColumnVectorUtils.populate(partitionCol, partitionValues, i); + partitionCol.setIsConstant(); + orcVectorWrappers[partitionIdx + i] = partitionCol; + } + } + + columnarBatch = new ColumnarBatch(resultSchema, orcVectorWrappers, capacity); } } @@ -196,17 +235,26 @@ public void initBatch( * by copying from ORC VectorizedRowBatch columns to Spark ColumnarBatch columns. 
*/ private boolean nextBatch() throws IOException { - for (WritableColumnVector vector : columnVectors) { - vector.reset(); - } - columnarBatch.setNumRows(0); - recordReader.nextBatch(batch); int batchSize = batch.size; if (batchSize == 0) { return false; } columnarBatch.setNumRows(batchSize); + + if (!copyToSpark) { + for (int i = 0; i < requiredFields.length; i++) { + if (requestedColIds[i] != -1) { + ((OrcColumnVector) orcVectorWrappers[i]).setBatchSize(batchSize); + } + } + return true; + } + + for (WritableColumnVector vector : columnVectors) { + vector.reset(); + } + for (int i = 0; i < requiredFields.length; i++) { StructField field = requiredFields[i]; WritableColumnVector toColumn = columnVectors[i]; diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index b8bacfa1838ae..2dd314d165348 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -38,6 +38,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.util.SerializableConfiguration @@ -150,6 +151,7 @@ class OrcFileFormat val sqlConf = sparkSession.sessionState.conf val enableOffHeapColumnVector = sqlConf.offHeapColumnVectorEnabled val enableVectorizedReader = supportBatch(sparkSession, resultSchema) + val copyToSpark = sparkSession.sessionState.conf.getConf(SQLConf.ORC_COPY_BATCH_TO_SPARK) val broadcastedConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) @@ -183,8 +185,8 @@ class OrcFileFormat val taskContext = Option(TaskContext.get()) if (enableVectorizedReader) { - val batchReader = - new OrcColumnarBatchReader(enableOffHeapColumnVector && taskContext.isDefined) + val batchReader = new OrcColumnarBatchReader( + enableOffHeapColumnVector && taskContext.isDefined, copyToSpark) batchReader.initialize(fileSplit, taskAttemptContext) batchReader.initBatch( reader.getSchema, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala index 37ed846acd1eb..bf6efa7c4c08c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala @@ -86,7 +86,7 @@ object OrcReadBenchmark { } def numericScanBenchmark(values: Int, dataType: DataType): Unit = { - val sqlBenchmark = new Benchmark(s"SQL Single ${dataType.sql} Column Scan", values) + val benchmark = new Benchmark(s"SQL Single ${dataType.sql} Column Scan", values) withTempPath { dir => withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { @@ -95,61 +95,73 @@ object OrcReadBenchmark { prepareTable(dir, spark.sql(s"SELECT CAST(value as ${dataType.sql}) id FROM t1")) - sqlBenchmark.addCase("Native ORC MR") { _ => + benchmark.addCase("Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() } } - sqlBenchmark.addCase("Native ORC Vectorized") 
{ _ => + benchmark.addCase("Native ORC Vectorized") { _ => spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() } - sqlBenchmark.addCase("Hive built-in ORC") { _ => + benchmark.addCase("Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Hive built-in ORC") { _ => spark.sql("SELECT sum(id) FROM hiveOrcTable").collect() } /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 - Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz SQL Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1192 / 1221 13.2 75.8 1.0X - Native ORC Vectorized 161 / 170 97.5 10.3 7.4X - Hive built-in ORC 1399 / 1413 11.2 89.0 0.9X + Native ORC MR 1135 / 1171 13.9 72.2 1.0X + Native ORC Vectorized 152 / 163 103.4 9.7 7.5X + Native ORC Vectorized with copy 149 / 162 105.4 9.5 7.6X + Hive built-in ORC 1380 / 1384 11.4 87.7 0.8X SQL Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1287 / 1333 12.2 81.8 1.0X - Native ORC Vectorized 164 / 172 95.6 10.5 7.8X - Hive built-in ORC 1629 / 1650 9.7 103.6 0.8X + Native ORC MR 1182 / 1244 13.3 75.2 1.0X + Native ORC Vectorized 145 / 156 108.7 9.2 8.2X + Native ORC Vectorized with copy 148 / 158 106.4 9.4 8.0X + Hive built-in ORC 1591 / 1636 9.9 101.2 0.7X SQL Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1304 / 1388 12.1 82.9 1.0X - Native ORC Vectorized 227 / 240 69.3 14.4 5.7X - Hive built-in ORC 1866 / 1867 8.4 118.6 0.7X + Native ORC MR 1271 / 1271 12.4 80.8 1.0X + Native ORC Vectorized 206 / 212 76.3 13.1 6.2X + Native ORC Vectorized with copy 200 / 213 78.8 12.7 6.4X + Hive built-in ORC 1776 / 1787 8.9 112.9 0.7X SQL Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1331 / 1357 11.8 84.6 1.0X - Native ORC Vectorized 289 / 297 54.4 18.4 4.6X - Hive built-in ORC 1922 / 1929 8.2 122.2 0.7X + Native ORC MR 1344 / 1355 11.7 85.4 1.0X + Native ORC Vectorized 258 / 268 61.0 16.4 5.2X + Native ORC Vectorized with copy 252 / 257 62.4 16.0 5.3X + Hive built-in ORC 1818 / 1823 8.7 115.6 0.7X SQL Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1410 / 1428 11.2 89.7 1.0X - Native ORC Vectorized 328 / 335 48.0 20.8 4.3X - Hive built-in ORC 1929 / 2012 8.2 122.6 0.7X + Native ORC MR 1333 / 1352 11.8 84.8 1.0X + Native ORC Vectorized 310 / 324 50.7 19.7 4.3X + Native ORC Vectorized with copy 312 / 320 50.4 19.9 4.3X + Hive built-in ORC 1904 / 1918 8.3 121.0 0.7X SQL Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1467 / 1485 10.7 93.3 1.0X - Native ORC Vectorized 402 / 411 39.1 25.6 3.6X - Hive built-in 
ORC 2023 / 2042 7.8 128.6 0.7X + Native ORC MR 1408 / 1585 11.2 89.5 1.0X + Native ORC Vectorized 359 / 368 43.8 22.8 3.9X + Native ORC Vectorized with copy 364 / 371 43.2 23.2 3.9X + Hive built-in ORC 1881 / 1954 8.4 119.6 0.7X */ - sqlBenchmark.run() + benchmark.run() } } } @@ -176,19 +188,26 @@ object OrcReadBenchmark { spark.sql("SELECT sum(c1), sum(length(c2)) FROM nativeOrcTable").collect() } + benchmark.addCase("Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(c1), sum(length(c2)) FROM nativeOrcTable").collect() + } + } + benchmark.addCase("Hive built-in ORC") { _ => spark.sql("SELECT sum(c1), sum(length(c2)) FROM hiveOrcTable").collect() } /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 - Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz Int and String Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 2729 / 2744 3.8 260.2 1.0X - Native ORC Vectorized 1318 / 1344 8.0 125.7 2.1X - Hive built-in ORC 3731 / 3782 2.8 355.8 0.7X + Native ORC MR 2566 / 2592 4.1 244.7 1.0X + Native ORC Vectorized 1098 / 1113 9.6 104.7 2.3X + Native ORC Vectorized with copy 1527 / 1593 6.9 145.6 1.7X + Hive built-in ORC 3561 / 3705 2.9 339.6 0.7X */ benchmark.run() } @@ -205,63 +224,84 @@ object OrcReadBenchmark { prepareTable(dir, spark.sql("SELECT value % 2 AS p, value AS id FROM t1"), Some("p")) - benchmark.addCase("Read data column - Native ORC MR") { _ => + benchmark.addCase("Data column - Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() } } - benchmark.addCase("Read data column - Native ORC Vectorized") { _ => + benchmark.addCase("Data column - Native ORC Vectorized") { _ => spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() } - benchmark.addCase("Read data column - Hive built-in ORC") { _ => + benchmark.addCase("Data column - Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Data column - Hive built-in ORC") { _ => spark.sql("SELECT sum(id) FROM hiveOrcTable").collect() } - benchmark.addCase("Read partition column - Native ORC MR") { _ => + benchmark.addCase("Partition column - Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { spark.sql("SELECT sum(p) FROM nativeOrcTable").collect() } } - benchmark.addCase("Read partition column - Native ORC Vectorized") { _ => + benchmark.addCase("Partition column - Native ORC Vectorized") { _ => spark.sql("SELECT sum(p) FROM nativeOrcTable").collect() } - benchmark.addCase("Read partition column - Hive built-in ORC") { _ => + benchmark.addCase("Partition column - Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(p) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Partition column - Hive built-in ORC") { _ => spark.sql("SELECT sum(p) FROM hiveOrcTable").collect() } - benchmark.addCase("Read both columns - Native ORC MR") { _ => + benchmark.addCase("Both columns - Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { 
spark.sql("SELECT sum(p), sum(id) FROM nativeOrcTable").collect() } } - benchmark.addCase("Read both columns - Native ORC Vectorized") { _ => + benchmark.addCase("Both columns - Native ORC Vectorized") { _ => spark.sql("SELECT sum(p), sum(id) FROM nativeOrcTable").collect() } - benchmark.addCase("Read both columns - Hive built-in ORC") { _ => + benchmark.addCase("Both column - Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(p), sum(id) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Both columns - Hive built-in ORC") { _ => spark.sql("SELECT sum(p), sum(id) FROM hiveOrcTable").collect() } /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 - Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz Partitioned Table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Read data column - Native ORC MR 1531 / 1536 10.3 97.4 1.0X - Read data column - Native ORC Vectorized 295 / 298 53.3 18.8 5.2X - Read data column - Hive built-in ORC 2125 / 2126 7.4 135.1 0.7X - Read partition column - Native ORC MR 1049 / 1062 15.0 66.7 1.5X - Read partition column - Native ORC Vectorized 54 / 57 290.1 3.4 28.2X - Read partition column - Hive built-in ORC 1282 / 1291 12.3 81.5 1.2X - Read both columns - Native ORC MR 1594 / 1598 9.9 101.3 1.0X - Read both columns - Native ORC Vectorized 332 / 336 47.4 21.1 4.6X - Read both columns - Hive built-in ORC 2145 / 2187 7.3 136.4 0.7X + Data only - Native ORC MR 1447 / 1457 10.9 92.0 1.0X + Data only - Native ORC Vectorized 256 / 266 61.4 16.3 5.6X + Data only - Native ORC Vectorized with copy 263 / 273 59.8 16.7 5.5X + Data only - Hive built-in ORC 1960 / 1988 8.0 124.6 0.7X + Partition only - Native ORC MR 1039 / 1043 15.1 66.0 1.4X + Partition only - Native ORC Vectorized 48 / 53 326.6 3.1 30.1X + Partition only - Native ORC Vectorized with copy 48 / 53 328.4 3.0 30.2X + Partition only - Hive built-in ORC 1234 / 1242 12.7 78.4 1.2X + Both columns - Native ORC MR 1465 / 1475 10.7 93.1 1.0X + Both columns - Native ORC Vectorized 292 / 301 53.9 18.6 5.0X + Both column - Native ORC Vectorized with copy 348 / 354 45.1 22.2 4.2X + Both columns - Hive built-in ORC 2051 / 2060 7.7 130.4 0.7X */ benchmark.run() } @@ -287,19 +327,26 @@ object OrcReadBenchmark { spark.sql("SELECT sum(length(c1)) FROM nativeOrcTable").collect() } + benchmark.addCase("Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(length(c1)) FROM nativeOrcTable").collect() + } + } + benchmark.addCase("Hive built-in ORC") { _ => spark.sql("SELECT sum(length(c1)) FROM hiveOrcTable").collect() } /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 - Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz Repeated String: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1325 / 1328 7.9 126.4 1.0X - Native ORC Vectorized 320 / 330 32.8 30.5 4.1X - Hive built-in ORC 1971 / 1972 5.3 188.0 0.7X + Native ORC MR 1271 / 1278 8.3 121.2 1.0X + Native ORC Vectorized 200 / 212 52.4 19.1 6.4X + Native ORC 
Vectorized with copy 342 / 347 30.7 32.6 3.7X + Hive built-in ORC 1874 / 2105 5.6 178.7 0.7X */ benchmark.run() } @@ -331,32 +378,42 @@ object OrcReadBenchmark { "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() } + benchmark.addCase("Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT SUM(LENGTH(c2)) FROM nativeOrcTable " + + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() + } + } + benchmark.addCase("Hive built-in ORC") { _ => spark.sql("SELECT SUM(LENGTH(c2)) FROM hiveOrcTable " + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() } /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 - Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz String with Nulls Scan (0.0%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 2553 / 2554 4.1 243.4 1.0X - Native ORC Vectorized 953 / 954 11.0 90.9 2.7X - Hive built-in ORC 3875 / 3898 2.7 369.6 0.7X + Native ORC MR 2394 / 2886 4.4 228.3 1.0X + Native ORC Vectorized 699 / 729 15.0 66.7 3.4X + Native ORC Vectorized with copy 959 / 1025 10.9 91.5 2.5X + Hive built-in ORC 3899 / 3901 2.7 371.9 0.6X String with Nulls Scan (0.5%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 2389 / 2408 4.4 227.8 1.0X - Native ORC Vectorized 1208 / 1209 8.7 115.2 2.0X - Hive built-in ORC 2940 / 2952 3.6 280.4 0.8X + Native ORC MR 2234 / 2255 4.7 213.1 1.0X + Native ORC Vectorized 854 / 869 12.3 81.4 2.6X + Native ORC Vectorized with copy 1099 / 1128 9.5 104.8 2.0X + Hive built-in ORC 2767 / 2793 3.8 263.9 0.8X String with Nulls Scan (0.95%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1295 / 1311 8.1 123.5 1.0X - Native ORC Vectorized 449 / 457 23.4 42.8 2.9X - Hive built-in ORC 1649 / 1660 6.4 157.3 0.8X + Native ORC MR 1166 / 1202 9.0 111.2 1.0X + Native ORC Vectorized 338 / 345 31.1 32.2 3.5X + Native ORC Vectorized with copy 418 / 428 25.1 39.9 2.8X + Hive built-in ORC 1730 / 1761 6.1 164.9 0.7X */ benchmark.run() } @@ -364,7 +421,7 @@ object OrcReadBenchmark { } def columnsBenchmark(values: Int, width: Int): Unit = { - val sqlBenchmark = new Benchmark(s"SQL Single Column Scan from $width columns", values) + val benchmark = new Benchmark(s"Single Column Scan from $width columns", values) withTempPath { dir => withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { @@ -376,43 +433,52 @@ object OrcReadBenchmark { prepareTable(dir, spark.sql("SELECT * FROM t1")) - sqlBenchmark.addCase("Native ORC MR") { _ => + benchmark.addCase("Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { spark.sql(s"SELECT sum(c$middle) FROM nativeOrcTable").collect() } } - sqlBenchmark.addCase("Native ORC Vectorized") { _ => + benchmark.addCase("Native ORC Vectorized") { _ => spark.sql(s"SELECT sum(c$middle) FROM nativeOrcTable").collect() } - sqlBenchmark.addCase("Hive built-in ORC") { _ => + benchmark.addCase("Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql(s"SELECT sum(c$middle) FROM nativeOrcTable").collect() + } + } + + 
benchmark.addCase("Hive built-in ORC") { _ => spark.sql(s"SELECT sum(c$middle) FROM hiveOrcTable").collect() } /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 - Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - SQL Single Column Scan from 100 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + Single Column Scan from 100 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1103 / 1124 1.0 1052.0 1.0X - Native ORC Vectorized 92 / 100 11.4 87.9 12.0X - Hive built-in ORC 383 / 390 2.7 365.4 2.9X + Native ORC MR 1050 / 1053 1.0 1001.1 1.0X + Native ORC Vectorized 95 / 101 11.0 90.9 11.0X + Native ORC Vectorized with copy 95 / 102 11.0 90.9 11.0X + Hive built-in ORC 348 / 358 3.0 331.8 3.0X - SQL Single Column Scan from 200 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + Single Column Scan from 200 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 2245 / 2250 0.5 2141.0 1.0X - Native ORC Vectorized 157 / 165 6.7 150.2 14.3X - Hive built-in ORC 587 / 593 1.8 559.4 3.8X + Native ORC MR 2099 / 2108 0.5 2002.1 1.0X + Native ORC Vectorized 179 / 187 5.8 171.1 11.7X + Native ORC Vectorized with copy 176 / 188 6.0 167.6 11.9X + Hive built-in ORC 562 / 581 1.9 535.9 3.7X - SQL Single Column Scan from 300 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + Single Column Scan from 300 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 3343 / 3350 0.3 3188.3 1.0X - Native ORC Vectorized 265 / 280 3.9 253.2 12.6X - Hive built-in ORC 828 / 842 1.3 789.8 4.0X + Native ORC MR 3221 / 3246 0.3 3071.4 1.0X + Native ORC Vectorized 312 / 322 3.4 298.0 10.3X + Native ORC Vectorized with copy 306 / 320 3.4 291.6 10.5X + Hive built-in ORC 815 / 824 1.3 777.3 4.0X */ - sqlBenchmark.run() + benchmark.run() } } } From 2db523959658d8cd04f83c176e46f6bcdb745335 Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 9 Jan 2018 23:32:47 -0800 Subject: [PATCH 057/566] [SPARK-22993][ML] Clarify HasCheckpointInterval param doc ## What changes were proposed in this pull request? Add a note to the `HasCheckpointInterval` parameter doc that clarifies that this setting is ignored when no checkpoint directory has been set on the spark context. ## How was this patch tested? No tests necessary, just a doc update. Author: sethah Closes #20188 from sethah/als_checkpoint_doc. 
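The behavior this doc note clarifies is easy to demonstrate. The sketch below is illustrative only: it assumes a Spark shell where `spark` is a SparkSession, the checkpoint path is made up, and ALS is just one of the estimators that mixes in `HasCheckpointInterval`:

```scala
import org.apache.spark.ml.recommendation.ALS

// Without this call, the checkpointInterval below is silently ignored,
// which is exactly what the new doc note warns about.
spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints")  // hypothetical path

val als = new ALS()
  .setMaxIter(20)
  .setCheckpointInterval(10)  // checkpoint intermediate state every 10 iterations
```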
(cherry picked from commit 70bcc9d5ae33d6669bb5c97db29087ccead770fb) Signed-off-by: Felix Cheung --- R/pkg/R/mllib_recommendation.R | 2 ++ R/pkg/R/mllib_tree.R | 6 ++++++ .../apache/spark/ml/param/shared/SharedParamsCodeGen.scala | 4 +++- .../org/apache/spark/ml/param/shared/sharedParams.scala | 4 ++-- python/pyspark/ml/param/_shared_params_code_gen.py | 5 +++-- python/pyspark/ml/param/shared.py | 4 ++-- 6 files changed, 18 insertions(+), 7 deletions(-) diff --git a/R/pkg/R/mllib_recommendation.R b/R/pkg/R/mllib_recommendation.R index fa794249085d7..5441c4a4022a9 100644 --- a/R/pkg/R/mllib_recommendation.R +++ b/R/pkg/R/mllib_recommendation.R @@ -48,6 +48,8 @@ setClass("ALSModel", representation(jobj = "jobj")) #' @param numUserBlocks number of user blocks used to parallelize computation (> 0). #' @param numItemBlocks number of item blocks used to parallelize computation (> 0). #' @param checkpointInterval number of checkpoint intervals (>= 1) or disable checkpoint (-1). +#' Note: this setting will be ignored if the checkpoint directory is not +#' set. #' @param ... additional argument(s) passed to the method. #' @return \code{spark.als} returns a fitted ALS model. #' @rdname spark.als diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R index 89a58bf0aadae..4e5ddf22ee16d 100644 --- a/R/pkg/R/mllib_tree.R +++ b/R/pkg/R/mllib_tree.R @@ -161,6 +161,8 @@ print.summary.decisionTree <- function(x) { #' >= 1. #' @param minInfoGain Minimum information gain for a split to be considered at a tree node. #' @param checkpointInterval Param for set checkpoint interval (>= 1) or disable checkpoint (-1). +#' Note: this setting will be ignored if the checkpoint directory is not +#' set. #' @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. #' @param cacheNodeIds If FALSE, the algorithm will pass trees to executors to match instances with #' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching @@ -382,6 +384,8 @@ setMethod("write.ml", signature(object = "GBTClassificationModel", path = "chara #' @param minInstancesPerNode Minimum number of instances each child must have after split. #' @param minInfoGain Minimum information gain for a split to be considered at a tree node. #' @param checkpointInterval Param for set checkpoint interval (>= 1) or disable checkpoint (-1). +#' Note: this setting will be ignored if the checkpoint directory is not +#' set. #' @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. #' @param cacheNodeIds If FALSE, the algorithm will pass trees to executors to match instances with #' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching @@ -595,6 +599,8 @@ setMethod("write.ml", signature(object = "RandomForestClassificationModel", path #' @param minInstancesPerNode Minimum number of instances each child must have after split. #' @param minInfoGain Minimum information gain for a split to be considered at a tree node. #' @param checkpointInterval Param for set checkpoint interval (>= 1) or disable checkpoint (-1). +#' Note: this setting will be ignored if the checkpoint directory is not +#' set. #' @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. #' @param cacheNodeIds If FALSE, the algorithm will pass trees to executors to match instances with #' nodes. If TRUE, the algorithm will cache node IDs for each instance. 
Caching diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index a5d57a15317e6..6ad44af9ef7eb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -63,7 +63,9 @@ private[shared] object SharedParamsCodeGen { ParamDesc[Array[String]]("outputCols", "output column names"), ParamDesc[Int]("checkpointInterval", "set checkpoint interval (>= 1) or " + "disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed " + - "every 10 iterations", isValid = "(interval: Int) => interval == -1 || interval >= 1"), + "every 10 iterations. Note: this setting will be ignored if the checkpoint directory " + + "is not set in the SparkContext", + isValid = "(interval: Int) => interval == -1 || interval >= 1"), ParamDesc[Boolean]("fitIntercept", "whether to fit an intercept term", Some("true")), ParamDesc[String]("handleInvalid", "how to handle invalid entries. Options are skip (which " + "will filter out rows with bad values), or error (which will throw an error). More " + diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index 13425dacc9f18..be8b2f273164b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -282,10 +282,10 @@ trait HasOutputCols extends Params { trait HasCheckpointInterval extends Params { /** - * Param for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. + * Param for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. * @group param */ - final val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations", (interval: Int) => interval == -1 || interval >= 1) + final val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext", (interval: Int) => interval == -1 || interval >= 1) /** @group getParam */ final def getCheckpointInterval: Int = $(checkpointInterval) diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index d55d209d09398..1d0f60acc6983 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -121,8 +121,9 @@ def get$Name(self): ("outputCol", "output column name.", "self.uid + '__output'", "TypeConverters.toString"), ("numFeatures", "number of features.", None, "TypeConverters.toInt"), ("checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). " + - "E.g. 10 means that the cache will get checkpointed every 10 iterations.", None, - "TypeConverters.toInt"), + "E.g. 
10 means that the cache will get checkpointed every 10 iterations. Note: " + + "this setting will be ignored if the checkpoint directory is not set in the SparkContext.", + None, "TypeConverters.toInt"), ("seed", "random seed.", "hash(type(self).__name__)", "TypeConverters.toInt"), ("tol", "the convergence tolerance for iterative algorithms (>= 0).", None, "TypeConverters.toFloat"), diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index e5c5ddfba6c1f..813f7a59f3fd1 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -281,10 +281,10 @@ def getNumFeatures(self): class HasCheckpointInterval(Params): """ - Mixin for param checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. + Mixin for param checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. """ - checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.", typeConverter=TypeConverters.toInt) + checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.", typeConverter=TypeConverters.toInt) def __init__(self): super(HasCheckpointInterval, self).__init__() From 60d4d79bb40f13c68773a0224f2003cdca28c138 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 10 Jan 2018 00:45:47 -0800 Subject: [PATCH 058/566] [SPARK-22997] Add additional defenses against use of freed MemoryBlocks ## What changes were proposed in this pull request? This patch modifies Spark's `MemoryAllocator` implementations so that `free(MemoryBlock)` mutates the passed block to clear pointers (in the off-heap case) or null out references to backing `long[]` arrays (in the on-heap case). The goal of this change is to add an extra layer of defense against use-after-free bugs because currently it's hard to detect corruption caused by blind writes to freed memory blocks. ## How was this patch tested? New unit tests in `PlatformSuite`, including new tests for existing functionality because we did not have sufficient mutation coverage of the on-heap memory allocator's pooling logic. Author: Josh Rosen Closes #20191 from JoshRosen/SPARK-22997-add-defenses-against-use-after-free-bugs-in-memory-allocator. 
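As a rough illustration of the intended failure mode, based only on the assertions added in this patch and assuming JVM assertions are enabled (as they are in the test suites):

```scala
import org.apache.spark.unsafe.memory.MemoryAllocator

val block = MemoryAllocator.HEAP.allocate(1024)
MemoryAllocator.HEAP.free(block)  // ok: the backing array reference is nulled out and the
                                  // page is marked FREED_IN_ALLOCATOR_PAGE_NUMBER
MemoryAllocator.HEAP.free(block)  // now fails fast with "page has already been freed"
                                  // instead of silently handing freed memory back to the pool
```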
(cherry picked from commit f340b6b3066033d40b7e163fd5fb68e9820adfb1) Signed-off-by: Josh Rosen --- .../unsafe/memory/HeapMemoryAllocator.java | 35 +++++++++---- .../spark/unsafe/memory/MemoryBlock.java | 21 +++++++- .../unsafe/memory/UnsafeMemoryAllocator.java | 11 ++++ .../spark/unsafe/PlatformUtilSuite.java | 50 ++++++++++++++++++- .../spark/memory/TaskMemoryManager.java | 13 ++++- .../spark/memory/TaskMemoryManagerSuite.java | 29 +++++++++++ 6 files changed, 146 insertions(+), 13 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java index cc9cc429643ad..3acfe3696cb1e 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java @@ -31,8 +31,7 @@ public class HeapMemoryAllocator implements MemoryAllocator { @GuardedBy("this") - private final Map>> bufferPoolsBySize = - new HashMap<>(); + private final Map>> bufferPoolsBySize = new HashMap<>(); private static final int POOLING_THRESHOLD_BYTES = 1024 * 1024; @@ -49,13 +48,14 @@ private boolean shouldPool(long size) { public MemoryBlock allocate(long size) throws OutOfMemoryError { if (shouldPool(size)) { synchronized (this) { - final LinkedList> pool = bufferPoolsBySize.get(size); + final LinkedList> pool = bufferPoolsBySize.get(size); if (pool != null) { while (!pool.isEmpty()) { - final WeakReference blockReference = pool.pop(); - final MemoryBlock memory = blockReference.get(); - if (memory != null) { - assert (memory.size() == size); + final WeakReference arrayReference = pool.pop(); + final long[] array = arrayReference.get(); + if (array != null) { + assert (array.length * 8L >= size); + MemoryBlock memory = new MemoryBlock(array, Platform.LONG_ARRAY_OFFSET, size); if (MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED) { memory.fill(MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE); } @@ -76,18 +76,35 @@ public MemoryBlock allocate(long size) throws OutOfMemoryError { @Override public void free(MemoryBlock memory) { + assert (memory.obj != null) : + "baseObject was null; are you trying to use the on-heap allocator to free off-heap memory?"; + assert (memory.pageNumber != MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER) : + "page has already been freed"; + assert ((memory.pageNumber == MemoryBlock.NO_PAGE_NUMBER) + || (memory.pageNumber == MemoryBlock.FREED_IN_TMM_PAGE_NUMBER)) : + "TMM-allocated pages must first be freed via TMM.freePage(), not directly in allocator free()"; + final long size = memory.size(); if (MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED) { memory.fill(MemoryAllocator.MEMORY_DEBUG_FILL_FREED_VALUE); } + + // Mark the page as freed (so we can detect double-frees). + memory.pageNumber = MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER; + + // As an additional layer of defense against use-after-free bugs, we mutate the + // MemoryBlock to null out its reference to the long[] array. 
+ long[] array = (long[]) memory.obj; + memory.setObjAndOffset(null, 0); + if (shouldPool(size)) { synchronized (this) { - LinkedList> pool = bufferPoolsBySize.get(size); + LinkedList> pool = bufferPoolsBySize.get(size); if (pool == null) { pool = new LinkedList<>(); bufferPoolsBySize.put(size, pool); } - pool.add(new WeakReference<>(memory)); + pool.add(new WeakReference<>(array)); } } else { // Do nothing diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java index cd1d378bc1470..c333857358d30 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java @@ -26,6 +26,25 @@ */ public class MemoryBlock extends MemoryLocation { + /** Special `pageNumber` value for pages which were not allocated by TaskMemoryManagers */ + public static final int NO_PAGE_NUMBER = -1; + + /** + * Special `pageNumber` value for marking pages that have been freed in the TaskMemoryManager. + * We set `pageNumber` to this value in TaskMemoryManager.freePage() so that MemoryAllocator + * can detect if pages which were allocated by TaskMemoryManager have been freed in the TMM + * before being passed to MemoryAllocator.free() (it is an error to allocate a page in + * TaskMemoryManager and then directly free it in a MemoryAllocator without going through + * the TMM freePage() call). + */ + public static final int FREED_IN_TMM_PAGE_NUMBER = -2; + + /** + * Special `pageNumber` value for pages that have been freed by the MemoryAllocator. This allows + * us to detect double-frees. + */ + public static final int FREED_IN_ALLOCATOR_PAGE_NUMBER = -3; + private final long length; /** @@ -33,7 +52,7 @@ public class MemoryBlock extends MemoryLocation { * TaskMemoryManager. This field is public so that it can be modified by the TaskMemoryManager, * which lives in a different package. */ - public int pageNumber = -1; + public int pageNumber = NO_PAGE_NUMBER; public MemoryBlock(@Nullable Object obj, long offset, long length) { super(obj, offset); diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java index 55bcdf1ed7b06..4368fb615ba1e 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java @@ -38,9 +38,20 @@ public MemoryBlock allocate(long size) throws OutOfMemoryError { public void free(MemoryBlock memory) { assert (memory.obj == null) : "baseObject not null; are you trying to use the off-heap allocator to free on-heap memory?"; + assert (memory.pageNumber != MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER) : + "page has already been freed"; + assert ((memory.pageNumber == MemoryBlock.NO_PAGE_NUMBER) + || (memory.pageNumber == MemoryBlock.FREED_IN_TMM_PAGE_NUMBER)) : + "TMM-allocated pages must be freed via TMM.freePage(), not directly in allocator free()"; + if (MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED) { memory.fill(MemoryAllocator.MEMORY_DEBUG_FILL_FREED_VALUE); } Platform.freeMemory(memory.offset); + // As an additional layer of defense against use-after-free bugs, we mutate the + // MemoryBlock to reset its pointer. + memory.offset = 0; + // Mark the page as freed (so we can detect double-frees). 
+ memory.pageNumber = MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER; } } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java index 4b141339ec816..62854837b05ed 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java @@ -62,6 +62,52 @@ public void overlappingCopyMemory() { } } + @Test + public void onHeapMemoryAllocatorPoolingReUsesLongArrays() { + MemoryBlock block1 = MemoryAllocator.HEAP.allocate(1024 * 1024); + Object baseObject1 = block1.getBaseObject(); + MemoryAllocator.HEAP.free(block1); + MemoryBlock block2 = MemoryAllocator.HEAP.allocate(1024 * 1024); + Object baseObject2 = block2.getBaseObject(); + Assert.assertSame(baseObject1, baseObject2); + MemoryAllocator.HEAP.free(block2); + } + + @Test + public void freeingOnHeapMemoryBlockResetsBaseObjectAndOffset() { + MemoryBlock block = MemoryAllocator.HEAP.allocate(1024); + Assert.assertNotNull(block.getBaseObject()); + MemoryAllocator.HEAP.free(block); + Assert.assertNull(block.getBaseObject()); + Assert.assertEquals(0, block.getBaseOffset()); + Assert.assertEquals(MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER, block.pageNumber); + } + + @Test + public void freeingOffHeapMemoryBlockResetsOffset() { + MemoryBlock block = MemoryAllocator.UNSAFE.allocate(1024); + Assert.assertNull(block.getBaseObject()); + Assert.assertNotEquals(0, block.getBaseOffset()); + MemoryAllocator.UNSAFE.free(block); + Assert.assertNull(block.getBaseObject()); + Assert.assertEquals(0, block.getBaseOffset()); + Assert.assertEquals(MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER, block.pageNumber); + } + + @Test(expected = AssertionError.class) + public void onHeapMemoryAllocatorThrowsAssertionErrorOnDoubleFree() { + MemoryBlock block = MemoryAllocator.HEAP.allocate(1024); + MemoryAllocator.HEAP.free(block); + MemoryAllocator.HEAP.free(block); + } + + @Test(expected = AssertionError.class) + public void offHeapMemoryAllocatorThrowsAssertionErrorOnDoubleFree() { + MemoryBlock block = MemoryAllocator.UNSAFE.allocate(1024); + MemoryAllocator.UNSAFE.free(block); + MemoryAllocator.UNSAFE.free(block); + } + @Test public void memoryDebugFillEnabledInTest() { Assert.assertTrue(MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED); @@ -71,9 +117,11 @@ public void memoryDebugFillEnabledInTest() { MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE); MemoryBlock onheap1 = MemoryAllocator.HEAP.allocate(1024 * 1024); + Object onheap1BaseObject = onheap1.getBaseObject(); + long onheap1BaseOffset = onheap1.getBaseOffset(); MemoryAllocator.HEAP.free(onheap1); Assert.assertEquals( - Platform.getByte(onheap1.getBaseObject(), onheap1.getBaseOffset()), + Platform.getByte(onheap1BaseObject, onheap1BaseOffset), MemoryAllocator.MEMORY_DEBUG_FILL_FREED_VALUE); MemoryBlock onheap2 = MemoryAllocator.HEAP.allocate(1024 * 1024); Assert.assertEquals( diff --git a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java index e8d3730daa7a4..632d718062212 100644 --- a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java +++ b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java @@ -321,8 +321,12 @@ public MemoryBlock allocatePage(long size, MemoryConsumer consumer) { * Free a block of memory allocated via {@link TaskMemoryManager#allocatePage}. 
*/ public void freePage(MemoryBlock page, MemoryConsumer consumer) { - assert (page.pageNumber != -1) : + assert (page.pageNumber != MemoryBlock.NO_PAGE_NUMBER) : "Called freePage() on memory that wasn't allocated with allocatePage()"; + assert (page.pageNumber != MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER) : + "Called freePage() on a memory block that has already been freed"; + assert (page.pageNumber != MemoryBlock.FREED_IN_TMM_PAGE_NUMBER) : + "Called freePage() on a memory block that has already been freed"; assert(allocatedPages.get(page.pageNumber)); pageTable[page.pageNumber] = null; synchronized (this) { @@ -332,6 +336,10 @@ public void freePage(MemoryBlock page, MemoryConsumer consumer) { logger.trace("Freed page number {} ({} bytes)", page.pageNumber, page.size()); } long pageSize = page.size(); + // Clear the page number before passing the block to the MemoryAllocator's free(). + // Doing this allows the MemoryAllocator to detect when a TaskMemoryManager-managed + // page has been inappropriately directly freed without calling TMM.freePage(). + page.pageNumber = MemoryBlock.FREED_IN_TMM_PAGE_NUMBER; memoryManager.tungstenMemoryAllocator().free(page); releaseExecutionMemory(pageSize, consumer); } @@ -358,7 +366,7 @@ public long encodePageNumberAndOffset(MemoryBlock page, long offsetInPage) { @VisibleForTesting public static long encodePageNumberAndOffset(int pageNumber, long offsetInPage) { - assert (pageNumber != -1) : "encodePageNumberAndOffset called with invalid page"; + assert (pageNumber >= 0) : "encodePageNumberAndOffset called with invalid page"; return (((long) pageNumber) << OFFSET_BITS) | (offsetInPage & MASK_LONG_LOWER_51_BITS); } @@ -424,6 +432,7 @@ public long cleanUpAllAllocatedMemory() { for (MemoryBlock page : pageTable) { if (page != null) { logger.debug("unreleased page: " + page + " in task " + taskAttemptId); + page.pageNumber = MemoryBlock.FREED_IN_TMM_PAGE_NUMBER; memoryManager.tungstenMemoryAllocator().free(page); } } diff --git a/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java b/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java index 46b0516e36141..a0664b30d6cc2 100644 --- a/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java +++ b/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java @@ -21,6 +21,7 @@ import org.junit.Test; import org.apache.spark.SparkConf; +import org.apache.spark.unsafe.memory.MemoryAllocator; import org.apache.spark.unsafe.memory.MemoryBlock; public class TaskMemoryManagerSuite { @@ -68,6 +69,34 @@ public void encodePageNumberAndOffsetOnHeap() { Assert.assertEquals(64, manager.getOffsetInPage(encodedAddress)); } + @Test + public void freeingPageSetsPageNumberToSpecialConstant() { + final TaskMemoryManager manager = new TaskMemoryManager( + new TestMemoryManager(new SparkConf().set("spark.memory.offHeap.enabled", "false")), 0); + final MemoryConsumer c = new TestMemoryConsumer(manager, MemoryMode.ON_HEAP); + final MemoryBlock dataPage = manager.allocatePage(256, c); + c.freePage(dataPage); + Assert.assertEquals(MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER, dataPage.pageNumber); + } + + @Test(expected = AssertionError.class) + public void freeingPageDirectlyInAllocatorTriggersAssertionError() { + final TaskMemoryManager manager = new TaskMemoryManager( + new TestMemoryManager(new SparkConf().set("spark.memory.offHeap.enabled", "false")), 0); + final MemoryConsumer c = new TestMemoryConsumer(manager, MemoryMode.ON_HEAP); + final MemoryBlock dataPage = 
manager.allocatePage(256, c); + MemoryAllocator.HEAP.free(dataPage); + } + + @Test(expected = AssertionError.class) + public void callingFreePageOnDirectlyAllocatedPageTriggersAssertionError() { + final TaskMemoryManager manager = new TaskMemoryManager( + new TestMemoryManager(new SparkConf().set("spark.memory.offHeap.enabled", "false")), 0); + final MemoryConsumer c = new TestMemoryConsumer(manager, MemoryMode.ON_HEAP); + final MemoryBlock dataPage = MemoryAllocator.HEAP.allocate(256); + manager.freePage(dataPage, c); + } + @Test public void cooperativeSpilling() { final TestMemoryManager memoryManager = new TestMemoryManager(new SparkConf()); From 5b5851cb685f395574c94174d45a47c4fbf946c8 Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Wed, 10 Jan 2018 09:44:30 -0800 Subject: [PATCH 059/566] [SPARK-23019][CORE] Wait until SparkContext.stop() finished in SparkLauncherSuite ## What changes were proposed in this pull request? In the current code, the `waitFor` call at https://github.com/apache/spark/blob/cfcd746689c2b84824745fa6d327ffb584c7a17d/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java#L155 only waits until the DAGScheduler is stopped, while SparkContext.clearActiveContext may not have been called yet. https://github.com/apache/spark/blob/1c9f95cb771ac78775a77edd1abfeb2d8ae2a124/core/src/main/scala/org/apache/spark/SparkContext.scala#L1924 Thus, in the Jenkins test https://amplab.cs.berkeley.edu/jenkins/job/spark-branch-2.3-test-maven-hadoop-2.6/, `JdbcRDDSuite` failed because the previous test `SparkLauncherSuite` exited before SparkContext.stop() had finished. To reproduce: ``` $ build/sbt > project core > testOnly *SparkLauncherSuite *JavaJdbcRDDSuite ``` To fix: wait for a reasonable amount of time in SparkLauncherSuite to avoid creating two active SparkContexts in the same JVM. Can't come up with any better solution for now. ## How was this patch tested? Unit test Author: Wang Gengliang Closes #20221 from gengliangwang/SPARK-23019. (cherry picked from commit 344e3aab87178e45957333479a07e07f202ca1fd) Signed-off-by: Marcelo Vanzin --- .../java/org/apache/spark/launcher/SparkLauncherSuite.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java index c2261c204cd45..9d2f563b2e367 100644 --- a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java +++ b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java @@ -23,6 +23,7 @@ import java.util.List; import java.util.Map; import java.util.Properties; +import java.util.concurrent.TimeUnit; import org.junit.Test; import static org.junit.Assert.*; @@ -133,6 +134,10 @@ public void testInProcessLauncher() throws Exception { p.put(e.getKey(), e.getValue()); } System.setProperties(p); + // Here DAGScheduler is stopped, while SparkContext.clearActiveContext may not be called yet. + // Wait for a reasonable amount of time to avoid creating two active SparkContext in JVM. + // See SPARK-23019 and SparkContext.stop() for details. + TimeUnit.MILLISECONDS.sleep(500); } } From eb4fa551e60800269a939b2c1c0ad69e3a801264 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Wed, 10 Jan 2018 14:25:04 -0800 Subject: [PATCH 060/566] [SPARK-22951][SQL] fix aggregation after dropDuplicates on empty data frames ## What changes were proposed in this pull request? (courtesy of liancheng) Spark SQL supports both global aggregation and grouping aggregation. 
Global aggregation always return a single row with the initial aggregation state as the output, even there are zero input rows. Spark implements this by simply checking the number of grouping keys and treats an aggregation as a global aggregation if it has zero grouping keys. However, this simple principle drops the ball in the following case: ```scala spark.emptyDataFrame.dropDuplicates().agg(count($"*") as "c").show() // +---+ // | c | // +---+ // | 1 | // +---+ ``` The reason is that: 1. `df.dropDuplicates()` is roughly translated into something equivalent to: ```scala val allColumns = df.columns.map { col } df.groupBy(allColumns: _*).agg(allColumns.head, allColumns.tail: _*) ``` This translation is implemented in the rule `ReplaceDeduplicateWithAggregate`. 2. `spark.emptyDataFrame` contains zero columns and zero rows. Therefore, rule `ReplaceDeduplicateWithAggregate` makes a confusing transformation roughly equivalent to the following one: ```scala spark.emptyDataFrame.dropDuplicates() => spark.emptyDataFrame.groupBy().agg(Map.empty[String, String]) ``` The above transformation is confusing because the resulting aggregate operator contains no grouping keys (because `emptyDataFrame` contains no columns), and gets recognized as a global aggregation. As a result, Spark SQL allocates a single row filled by the initial aggregation state and uses it as the output, and returns a wrong result. To fix this issue, this PR tweaks `ReplaceDeduplicateWithAggregate` by appending a literal `1` to the grouping key list of the resulting `Aggregate` operator when the input plan contains zero output columns. In this way, `spark.emptyDataFrame.dropDuplicates()` is now translated into a grouping aggregation, roughly depicted as: ```scala spark.emptyDataFrame.dropDuplicates() => spark.emptyDataFrame.groupBy(lit(1)).agg(Map.empty[String, String]) ``` Which is now properly treated as a grouping aggregation and returns the correct answer. ## How was this patch tested? New unit tests added Author: Feng Liu Closes #20174 from liufengdb/fix-duplicate. (cherry picked from commit 9b33dfc408de986f4203bb0ac0c3f5c56effd69d) Signed-off-by: Cheng Lian --- .../sql/catalyst/optimizer/Optimizer.scala | 8 ++++++- .../optimizer/ReplaceOperatorSuite.scala | 10 +++++++- .../spark/sql/DataFrameAggregateSuite.scala | 24 +++++++++++++++++-- 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index df0af8264a329..c794ba8619322 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -1222,7 +1222,13 @@ object ReplaceDeduplicateWithAggregate extends Rule[LogicalPlan] { Alias(new First(attr).toAggregateExpression(), attr.name)(attr.exprId) } } - Aggregate(keys, aggCols, child) + // SPARK-22951: Physical aggregate operators distinguishes global aggregation and grouping + // aggregations by checking the number of grouping keys. The key difference here is that a + // global aggregation always returns at least one row even if there are no input rows. Here + // we append a literal when the grouping key list is empty so that the result aggregate + // operator is properly treated as a grouping aggregation. 
+ val nonemptyKeys = if (keys.isEmpty) Literal(1) :: Nil else keys + Aggregate(nonemptyKeys, aggCols, child) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala index 0fa1aaeb9e164..e9701ffd2c54b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.expressions.{Alias, Not} +import org.apache.spark.sql.catalyst.expressions.{Alias, Literal, Not} import org.apache.spark.sql.catalyst.expressions.aggregate.First import org.apache.spark.sql.catalyst.plans.{LeftAnti, LeftSemi, PlanTest} import org.apache.spark.sql.catalyst.plans.logical._ @@ -198,6 +198,14 @@ class ReplaceOperatorSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("add one grouping key if necessary when replace Deduplicate with Aggregate") { + val input = LocalRelation() + val query = Deduplicate(Seq.empty, input) // dropDuplicates() + val optimized = Optimize.execute(query.analyze) + val correctAnswer = Aggregate(Seq(Literal(1)), input.output, input) + comparePlans(optimized, correctAnswer) + } + test("don't replace streaming Deduplicate") { val input = LocalRelation(Seq('a.int, 'b.int), isStreaming = true) val attrA = input.output(0) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index 06848e4d2b297..e7776e36702ad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql import scala.util.Random +import org.apache.spark.sql.catalyst.expressions.{Alias, Literal} +import org.apache.spark.sql.catalyst.expressions.aggregate.Count import org.apache.spark.sql.execution.WholeStageCodegenExec import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec @@ -27,7 +29,7 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.SQLTestData.DecimalData -import org.apache.spark.sql.types.{Decimal, DecimalType} +import org.apache.spark.sql.types.DecimalType case class Fact(date: Int, hour: Int, minute: Int, room_name: String, temp: Double) @@ -456,7 +458,6 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext { test("null moments") { val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b") - checkAnswer( emptyTableData.agg(variance('a), var_samp('a), var_pop('a), skewness('a), kurtosis('a)), Row(null, null, null, null, null)) @@ -666,4 +667,23 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext { assert(exchangePlans.length == 1) } } + + Seq(true, false).foreach { codegen => + test("SPARK-22951: dropDuplicates on empty dataFrames should produce correct aggregate " + + s"results when codegen is enabled: $codegen") { + 
withSQLConf((SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, codegen.toString)) { + // explicit global aggregations + val emptyAgg = Map.empty[String, String] + checkAnswer(spark.emptyDataFrame.agg(emptyAgg), Seq(Row())) + checkAnswer(spark.emptyDataFrame.groupBy().agg(emptyAgg), Seq(Row())) + checkAnswer(spark.emptyDataFrame.groupBy().agg(count("*")), Seq(Row(0))) + checkAnswer(spark.emptyDataFrame.dropDuplicates().agg(emptyAgg), Seq(Row())) + checkAnswer(spark.emptyDataFrame.dropDuplicates().groupBy().agg(emptyAgg), Seq(Row())) + checkAnswer(spark.emptyDataFrame.dropDuplicates().groupBy().agg(count("*")), Seq(Row(0))) + + // global aggregation is converted to grouping aggregation: + assert(spark.emptyDataFrame.dropDuplicates().count() == 0) + } + } + } } From 551ccfba529996e987c4d2e8d4dd61c4ab9a2e95 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 10 Jan 2018 14:55:24 +0900 Subject: [PATCH 061/566] [SPARK-23009][PYTHON] Fix for non-str col names to createDataFrame from Pandas ## What changes were proposed in this pull request? This the case when calling `SparkSession.createDataFrame` using a Pandas DataFrame that has non-str column labels. The column name conversion logic to handle non-string or unicode in python2 is: ``` if column is not any type of string: name = str(column) else if column is unicode in Python 2: name = column.encode('utf-8') ``` ## How was this patch tested? Added a new test with a Pandas DataFrame that has int column labels Author: Bryan Cutler Closes #20210 from BryanCutler/python-createDataFrame-int-col-error-SPARK-23009. --- python/pyspark/sql/session.py | 4 +++- python/pyspark/sql/tests.py | 9 +++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 3e4574729a631..604021c1f45cc 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -648,7 +648,9 @@ def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=Tr # If no schema supplied by user then get the names of columns only if schema is None: - schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in data.columns] + schema = [str(x) if not isinstance(x, basestring) else + (x.encode('utf-8') if not isinstance(x, str) else x) + for x in data.columns] if self.conf.get("spark.sql.execution.arrow.enabled", "false").lower() == "true" \ and len(data) > 0: diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 13576ff57001b..80a94a91a87b3 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3532,6 +3532,15 @@ def test_toPandas_with_array_type(self): self.assertTrue(expected[r][e] == result_arrow[r][e] and result[r][e] == result_arrow[r][e]) + def test_createDataFrame_with_int_col_names(self): + import numpy as np + import pandas as pd + pdf = pd.DataFrame(np.random.rand(4, 2)) + df, df_arrow = self._createDataFrame_toggle(pdf) + pdf_col_names = [str(c) for c in pdf.columns] + self.assertEqual(pdf_col_names, df.columns) + self.assertEqual(pdf_col_names, df_arrow.columns) + @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") class PandasUDFTests(ReusedSQLTestCase): From 317b0aaed83e4bbf66f63ddc0d618da9f1f85085 Mon Sep 17 00:00:00 2001 From: Mingjie Tang Date: Thu, 11 Jan 2018 11:51:03 +0800 Subject: [PATCH 062/566] [SPARK-22587] Spark job fails if fs.defaultFS and application jar are different url ## What changes were proposed in this pull request? 
Comparing two filesystems does not currently consider the authority of the URI. This is specific to the WASB file storage system, where userInfo is honored to differentiate filesystems. For example, wasbs://user1xyz.net and wasbs://user2xyz.net are considered two different filesystems. Therefore, we have to include the authority when comparing two filesystems, and two filesystems with different authorities cannot be the same FS. Author: Mingjie Tang Closes #19885 from merlintang/EAR-7377. (cherry picked from commit a6647ffbf7a312a3e119a9beef90880cc915aa60) Signed-off-by: jerryshao --- .../org/apache/spark/deploy/yarn/Client.scala | 24 +++++++++++--- .../spark/deploy/yarn/ClientSuite.scala | 33 +++++++++++++++++++ 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 15328d08b3b5c..8cd3cd9746a3a 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -1421,15 +1421,20 @@ private object Client extends Logging { } /** - * Return whether the two file systems are the same. + * Return whether two URI represent file system are the same */ - private def compareFs(srcFs: FileSystem, destFs: FileSystem): Boolean = { - val srcUri = srcFs.getUri() - val dstUri = destFs.getUri() + private[spark] def compareUri(srcUri: URI, dstUri: URI): Boolean = { + if (srcUri.getScheme() == null || srcUri.getScheme() != dstUri.getScheme()) { return false } + val srcAuthority = srcUri.getAuthority() + val dstAuthority = dstUri.getAuthority() + if (srcAuthority != null && !srcAuthority.equalsIgnoreCase(dstAuthority)) { + return false + } + var srcHost = srcUri.getHost() var dstHost = dstUri.getHost() @@ -1447,6 +1452,17 @@ private object Client extends Logging { } Objects.equal(srcHost, dstHost) && srcUri.getPort() == dstUri.getPort() + + } + + /** + * Return whether the two file systems are the same. 
+ */ + protected def compareFs(srcFs: FileSystem, destFs: FileSystem): Boolean = { + val srcUri = srcFs.getUri() + val dstUri = destFs.getUri() + + compareUri(srcUri, dstUri) } /** diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala index 9d5f5eb621118..7fa597167f3f0 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala @@ -357,6 +357,39 @@ class ClientSuite extends SparkFunSuite with Matchers { sparkConf.get(SECONDARY_JARS) should be (Some(Seq(new File(jar2.toURI).getName))) } + private val matching = Seq( + ("files URI match test1", "file:///file1", "file:///file2"), + ("files URI match test2", "file:///c:file1", "file://c:file2"), + ("files URI match test3", "file://host/file1", "file://host/file2"), + ("wasb URI match test", "wasb://bucket1@user", "wasb://bucket1@user/"), + ("hdfs URI match test", "hdfs:/path1", "hdfs:/path1") + ) + + matching.foreach { t => + test(t._1) { + assert(Client.compareUri(new URI(t._2), new URI(t._3)), + s"No match between ${t._2} and ${t._3}") + } + } + + private val unmatching = Seq( + ("files URI unmatch test1", "file:///file1", "file://host/file2"), + ("files URI unmatch test2", "file://host/file1", "file:///file2"), + ("files URI unmatch test3", "file://host/file1", "file://host2/file2"), + ("wasb URI unmatch test1", "wasb://bucket1@user", "wasb://bucket2@user/"), + ("wasb URI unmatch test2", "wasb://bucket1@user", "wasb://bucket1@user2/"), + ("s3 URI unmatch test", "s3a://user@pass:bucket1/", "s3a://user2@pass2:bucket1/"), + ("hdfs URI unmatch test1", "hdfs://namenode1/path1", "hdfs://namenode1:8080/path2"), + ("hdfs URI unmatch test2", "hdfs://namenode1:8020/path1", "hdfs://namenode1:8080/path2") + ) + + unmatching.foreach { t => + test(t._1) { + assert(!Client.compareUri(new URI(t._2), new URI(t._3)), + s"match between ${t._2} and ${t._3}") + } + } + object Fixtures { val knownDefYarnAppCP: Seq[String] = From d9a973d65c52169e3c3b2223d4a55b07ee82b88e Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Thu, 11 Jan 2018 18:17:34 +0800 Subject: [PATCH 063/566] [SPARK-23001][SQL] Fix NullPointerException when DESC a database with NULL description ## What changes were proposed in this pull request? When users' DB description is NULL, users might hit `NullPointerException`. This PR is to fix the issue. ## How was this patch tested? Added test cases Author: gatorsmile Closes #20215 from gatorsmile/SPARK-23001. 
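The fix itself is a one-line defensive guard. A minimal sketch of the pattern, where `hiveDb` is a stand-in for a Hive `Database` object (for example, one created outside Spark without any description):

```scala
// Database.getDescription() can return null for such databases; wrapping it in
// Option avoids the NullPointerException when building the CatalogDatabase.
val safeDescription: String = Option(hiveDb.getDescription).getOrElse("")
```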
(cherry picked from commit 87c98de8b23f0e978958fc83677fdc4c339b7e6a) Signed-off-by: Wenchen Fan --- .../apache/spark/sql/hive/client/HiveClientImpl.scala | 2 +- .../apache/spark/sql/hive/HiveExternalCatalogSuite.scala | 6 ++++++ .../org/apache/spark/sql/hive/client/VersionsSuite.scala | 9 +++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 102f40bacc985..4b923f5235a90 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -330,7 +330,7 @@ private[hive] class HiveClientImpl( Option(client.getDatabase(dbName)).map { d => CatalogDatabase( name = d.getName, - description = d.getDescription, + description = Option(d.getDescription).getOrElse(""), locationUri = CatalogUtils.stringToURI(d.getLocationUri), properties = Option(d.getParameters).map(_.asScala.toMap).orNull) }.getOrElse(throw new NoSuchDatabaseException(dbName)) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala index 2e35fdeba464d..0a522b6a11c80 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala @@ -107,4 +107,10 @@ class HiveExternalCatalogSuite extends ExternalCatalogSuite { .filter(_.contains("Num Buckets")).head assert(bucketString.contains("10")) } + + test("SPARK-23001: NullPointerException when running desc database") { + val catalog = newBasicCatalog() + catalog.createDatabase(newDb("dbWithNullDesc").copy(description = null), ignoreIfExists = false) + assert(catalog.getDatabase("dbWithNullDesc").description == "") + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index 94473a08dd317..ff90e9dda5f7c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -163,6 +163,15 @@ class VersionsSuite extends SparkFunSuite with Logging { client.createDatabase(tempDB, ignoreIfExists = true) } + test(s"$version: createDatabase with null description") { + withTempDir { tmpDir => + val dbWithNullDesc = + CatalogDatabase("dbWithNullDesc", description = null, tmpDir.toURI, Map()) + client.createDatabase(dbWithNullDesc, ignoreIfExists = true) + assert(client.getDatabase("dbWithNullDesc").description == "") + } + } + test(s"$version: setCurrentDatabase") { client.setCurrentDatabase("default") } From b78130123baba87554503e81b8aee3121666ba91 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 11 Jan 2018 19:41:48 +0800 Subject: [PATCH 064/566] [SPARK-20657][CORE] Speed up rendering of the stages page. There are two main changes to speed up rendering of the tasks list when rendering the stage page. The first one makes the code only load the tasks being shown in the current page of the tasks table, and information related to only those tasks. 
One side-effect of this change is that the graph that shows task-related events now only shows events for the tasks in the current page, instead of the previously hardcoded limit of "events for the first 1000 tasks". That ends up helping with readability, though. To make sorting efficient when using a disk store, the task wrapper was extended to include many new indices, one for each of the sortable columns in the UI, and metrics for which quantiles are calculated. The second changes the way metric quantiles are calculated for stages. Instead of using the "Distribution" class to process data for all task metrics, which requires scanning all tasks of a stage, the code now uses the KVStore "skip()" functionality to only read tasks that contain interesting information for the quantiles that are desired. This is still not cheap; because there are many metrics that the UI and API track, the code needs to scan the index for each metric to gather the information. Savings come mainly from skipping deserialization when using the disk store, but the in-memory code also seems to be faster than before (most probably because of other changes in this patch). To make subsequent calls faster, some quantiles are cached in the status store. This makes UIs much faster after the first time a stage has been loaded. With the above changes, a lot of code in the UI layer could be simplified. Author: Marcelo Vanzin Closes #20013 from vanzin/SPARK-20657. (cherry picked from commit 1c70da3bfbb4016e394de2c73eb0db7cdd9a6968) Signed-off-by: Wenchen Fan --- .../apache/spark/util/kvstore/LevelDB.java | 1 + .../spark/status/AppStatusListener.scala | 57 +- .../apache/spark/status/AppStatusStore.scala | 389 +++++--- .../apache/spark/status/AppStatusUtils.scala | 68 ++ .../org/apache/spark/status/LiveEntity.scala | 344 ++++--- .../spark/status/api/v1/StagesResource.scala | 3 +- .../org/apache/spark/status/api/v1/api.scala | 3 + .../org/apache/spark/status/storeTypes.scala | 327 ++++++- .../apache/spark/ui/jobs/ExecutorTable.scala | 4 +- .../org/apache/spark/ui/jobs/JobPage.scala | 2 +- .../org/apache/spark/ui/jobs/StagePage.scala | 919 ++++++------------ ...mmary_w__custom_quantiles_expectation.json | 3 + ...sk_summary_w_shuffle_read_expectation.json | 3 + ...k_summary_w_shuffle_write_expectation.json | 3 + .../spark/status/AppStatusListenerSuite.scala | 105 +- .../spark/status/AppStatusStoreSuite.scala | 104 ++ .../org/apache/spark/ui/StagePageSuite.scala | 10 +- scalastyle-config.xml | 2 +- 18 files changed, 1361 insertions(+), 986 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/status/AppStatusUtils.scala create mode 100644 core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java index 4f9e10ca20066..0e491efac9181 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java @@ -83,6 +83,7 @@ public LevelDB(File path, KVStoreSerializer serializer) throws Exception { if (versionData != null) { long version = serializer.deserializeLong(versionData); if (version != STORE_VERSION) { + close(); throw new UnsupportedStoreVersionException(); } } else { diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index 88b75ddd5993a..b4edcf23abc09 
100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -377,6 +377,10 @@ private[spark] class AppStatusListener( Option(liveStages.get((event.stageId, event.stageAttemptId))).foreach { stage => stage.activeTasks += 1 stage.firstLaunchTime = math.min(stage.firstLaunchTime, event.taskInfo.launchTime) + + val locality = event.taskInfo.taskLocality.toString() + val count = stage.localitySummary.getOrElse(locality, 0L) + 1L + stage.localitySummary = stage.localitySummary ++ Map(locality -> count) maybeUpdate(stage, now) stage.jobs.foreach { job => @@ -433,7 +437,7 @@ private[spark] class AppStatusListener( } task.errorMessage = errorMessage val delta = task.updateMetrics(event.taskMetrics) - update(task, now) + update(task, now, last = true) delta }.orNull @@ -450,7 +454,7 @@ private[spark] class AppStatusListener( Option(liveStages.get((event.stageId, event.stageAttemptId))).foreach { stage => if (metricsDelta != null) { - stage.metrics.update(metricsDelta) + stage.metrics = LiveEntityHelpers.addMetrics(stage.metrics, metricsDelta) } stage.activeTasks -= 1 stage.completedTasks += completedDelta @@ -486,7 +490,7 @@ private[spark] class AppStatusListener( esummary.failedTasks += failedDelta esummary.killedTasks += killedDelta if (metricsDelta != null) { - esummary.metrics.update(metricsDelta) + esummary.metrics = LiveEntityHelpers.addMetrics(esummary.metrics, metricsDelta) } maybeUpdate(esummary, now) @@ -604,11 +608,11 @@ private[spark] class AppStatusListener( maybeUpdate(task, now) Option(liveStages.get((sid, sAttempt))).foreach { stage => - stage.metrics.update(delta) + stage.metrics = LiveEntityHelpers.addMetrics(stage.metrics, delta) maybeUpdate(stage, now) val esummary = stage.executorSummary(event.execId) - esummary.metrics.update(delta) + esummary.metrics = LiveEntityHelpers.addMetrics(esummary.metrics, delta) maybeUpdate(esummary, now) } } @@ -690,7 +694,7 @@ private[spark] class AppStatusListener( // can update the executor information too. liveRDDs.get(block.rddId).foreach { rdd => if (updatedStorageLevel.isDefined) { - rdd.storageLevel = updatedStorageLevel.get + rdd.setStorageLevel(updatedStorageLevel.get) } val partition = rdd.partition(block.name) @@ -814,7 +818,7 @@ private[spark] class AppStatusListener( /** Update a live entity only if it hasn't been updated in the last configured period. */ private def maybeUpdate(entity: LiveEntity, now: Long): Unit = { - if (liveUpdatePeriodNs >= 0 && now - entity.lastWriteTime > liveUpdatePeriodNs) { + if (live && liveUpdatePeriodNs >= 0 && now - entity.lastWriteTime > liveUpdatePeriodNs) { update(entity, now) } } @@ -865,7 +869,7 @@ private[spark] class AppStatusListener( } stages.foreach { s => - val key = s.id + val key = Array(s.info.stageId, s.info.attemptId) kvstore.delete(s.getClass(), key) val execSummaries = kvstore.view(classOf[ExecutorStageSummaryWrapper]) @@ -885,15 +889,15 @@ private[spark] class AppStatusListener( .asScala tasks.foreach { t => - kvstore.delete(t.getClass(), t.info.taskId) + kvstore.delete(t.getClass(), t.taskId) } // Check whether there are remaining attempts for the same stage. If there aren't, then // also delete the RDD graph data. 
val remainingAttempts = kvstore.view(classOf[StageDataWrapper]) .index("stageId") - .first(s.stageId) - .last(s.stageId) + .first(s.info.stageId) + .last(s.info.stageId) .closeableIterator() val hasMoreAttempts = try { @@ -905,8 +909,10 @@ private[spark] class AppStatusListener( } if (!hasMoreAttempts) { - kvstore.delete(classOf[RDDOperationGraphWrapper], s.stageId) + kvstore.delete(classOf[RDDOperationGraphWrapper], s.info.stageId) } + + cleanupCachedQuantiles(key) } } @@ -919,9 +925,9 @@ private[spark] class AppStatusListener( // Try to delete finished tasks only. val toDelete = KVUtils.viewToSeq(view, countToDelete) { t => - !live || t.info.status != TaskState.RUNNING.toString() + !live || t.status != TaskState.RUNNING.toString() } - toDelete.foreach { t => kvstore.delete(t.getClass(), t.info.taskId) } + toDelete.foreach { t => kvstore.delete(t.getClass(), t.taskId) } stage.savedTasks.addAndGet(-toDelete.size) // If there are more running tasks than the configured limit, delete running tasks. This @@ -930,13 +936,34 @@ private[spark] class AppStatusListener( val remaining = countToDelete - toDelete.size if (remaining > 0) { val runningTasksToDelete = view.max(remaining).iterator().asScala.toList - runningTasksToDelete.foreach { t => kvstore.delete(t.getClass(), t.info.taskId) } + runningTasksToDelete.foreach { t => kvstore.delete(t.getClass(), t.taskId) } stage.savedTasks.addAndGet(-remaining) } + + // On live applications, cleanup any cached quantiles for the stage. This makes sure that + // quantiles will be recalculated after tasks are replaced with newer ones. + // + // This is not needed in the SHS since caching only happens after the event logs are + // completely processed. + if (live) { + cleanupCachedQuantiles(stageKey) + } } stage.cleaning = false } + private def cleanupCachedQuantiles(stageKey: Array[Int]): Unit = { + val cachedQuantiles = kvstore.view(classOf[CachedQuantile]) + .index("stage") + .first(stageKey) + .last(stageKey) + .asScala + .toList + cachedQuantiles.foreach { q => + kvstore.delete(q.getClass(), q.id) + } + } + /** * Remove at least (retainedSize / 10) items to reduce friction. Because tracking may be done * asynchronously, this method may return 0 in case enough items have been deleted already. 
diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala index 5a942f5284018..efc28538a33db 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala @@ -24,7 +24,7 @@ import scala.collection.JavaConverters._ import org.apache.spark.{JobExecutionStatus, SparkConf} import org.apache.spark.status.api.v1 import org.apache.spark.ui.scope._ -import org.apache.spark.util.Distribution +import org.apache.spark.util.{Distribution, Utils} import org.apache.spark.util.kvstore.{InMemoryStore, KVStore} /** @@ -98,7 +98,11 @@ private[spark] class AppStatusStore( val it = store.view(classOf[StageDataWrapper]).index("stageId").reverse().first(stageId) .closeableIterator() try { - it.next().info + if (it.hasNext()) { + it.next().info + } else { + throw new NoSuchElementException(s"No stage with id $stageId") + } } finally { it.close() } @@ -110,107 +114,238 @@ private[spark] class AppStatusStore( if (details) stageWithDetails(stage) else stage } + def taskCount(stageId: Int, stageAttemptId: Int): Long = { + store.count(classOf[TaskDataWrapper], "stage", Array(stageId, stageAttemptId)) + } + + def localitySummary(stageId: Int, stageAttemptId: Int): Map[String, Long] = { + store.read(classOf[StageDataWrapper], Array(stageId, stageAttemptId)).locality + } + + /** + * Calculates a summary of the task metrics for the given stage attempt, returning the + * requested quantiles for the recorded metrics. + * + * This method can be expensive if the requested quantiles are not cached; the method + * will only cache certain quantiles (every 0.05 step), so it's recommended to stick to + * those to avoid expensive scans of all task data. + */ def taskSummary( stageId: Int, stageAttemptId: Int, - quantiles: Array[Double]): v1.TaskMetricDistributions = { - - val stage = Array(stageId, stageAttemptId) - - val rawMetrics = store.view(classOf[TaskDataWrapper]) - .index("stage") - .first(stage) - .last(stage) - .asScala - .flatMap(_.info.taskMetrics) - .toList - .view - - def metricQuantiles(f: v1.TaskMetrics => Double): IndexedSeq[Double] = - Distribution(rawMetrics.map { d => f(d) }).get.getQuantiles(quantiles) - - // We need to do a lot of similar munging to nested metrics here. For each one, - // we want (a) extract the values for nested metrics (b) make a distribution for each metric - // (c) shove the distribution into the right field in our return type and (d) only return - // a result if the option is defined for any of the tasks. MetricHelper is a little util - // to make it a little easier to deal w/ all of the nested options. Mostly it lets us just - // implement one "build" method, which just builds the quantiles for each field. 
- - val inputMetrics = - new MetricHelper[v1.InputMetrics, v1.InputMetricDistributions](rawMetrics, quantiles) { - def getSubmetrics(raw: v1.TaskMetrics): v1.InputMetrics = raw.inputMetrics - - def build: v1.InputMetricDistributions = new v1.InputMetricDistributions( - bytesRead = submetricQuantiles(_.bytesRead), - recordsRead = submetricQuantiles(_.recordsRead) - ) - }.build - - val outputMetrics = - new MetricHelper[v1.OutputMetrics, v1.OutputMetricDistributions](rawMetrics, quantiles) { - def getSubmetrics(raw: v1.TaskMetrics): v1.OutputMetrics = raw.outputMetrics - - def build: v1.OutputMetricDistributions = new v1.OutputMetricDistributions( - bytesWritten = submetricQuantiles(_.bytesWritten), - recordsWritten = submetricQuantiles(_.recordsWritten) - ) - }.build - - val shuffleReadMetrics = - new MetricHelper[v1.ShuffleReadMetrics, v1.ShuffleReadMetricDistributions](rawMetrics, - quantiles) { - def getSubmetrics(raw: v1.TaskMetrics): v1.ShuffleReadMetrics = - raw.shuffleReadMetrics - - def build: v1.ShuffleReadMetricDistributions = new v1.ShuffleReadMetricDistributions( - readBytes = submetricQuantiles { s => s.localBytesRead + s.remoteBytesRead }, - readRecords = submetricQuantiles(_.recordsRead), - remoteBytesRead = submetricQuantiles(_.remoteBytesRead), - remoteBytesReadToDisk = submetricQuantiles(_.remoteBytesReadToDisk), - remoteBlocksFetched = submetricQuantiles(_.remoteBlocksFetched), - localBlocksFetched = submetricQuantiles(_.localBlocksFetched), - totalBlocksFetched = submetricQuantiles { s => - s.localBlocksFetched + s.remoteBlocksFetched - }, - fetchWaitTime = submetricQuantiles(_.fetchWaitTime) - ) - }.build - - val shuffleWriteMetrics = - new MetricHelper[v1.ShuffleWriteMetrics, v1.ShuffleWriteMetricDistributions](rawMetrics, - quantiles) { - def getSubmetrics(raw: v1.TaskMetrics): v1.ShuffleWriteMetrics = - raw.shuffleWriteMetrics - - def build: v1.ShuffleWriteMetricDistributions = new v1.ShuffleWriteMetricDistributions( - writeBytes = submetricQuantiles(_.bytesWritten), - writeRecords = submetricQuantiles(_.recordsWritten), - writeTime = submetricQuantiles(_.writeTime) - ) - }.build - - new v1.TaskMetricDistributions( + unsortedQuantiles: Array[Double]): Option[v1.TaskMetricDistributions] = { + val stageKey = Array(stageId, stageAttemptId) + val quantiles = unsortedQuantiles.sorted + + // We don't know how many tasks remain in the store that actually have metrics. So scan one + // metric and count how many valid tasks there are. Use skip() instead of next() since it's + // cheaper for disk stores (avoids deserialization). + val count = { + Utils.tryWithResource( + store.view(classOf[TaskDataWrapper]) + .parent(stageKey) + .index(TaskIndexNames.EXEC_RUN_TIME) + .first(0L) + .closeableIterator() + ) { it => + var _count = 0L + while (it.hasNext()) { + _count += 1 + it.skip(1) + } + _count + } + } + + if (count <= 0) { + return None + } + + // Find out which quantiles are already cached. The data in the store must match the expected + // task count to be considered, otherwise it will be re-scanned and overwritten. + val cachedQuantiles = quantiles.filter(shouldCacheQuantile).flatMap { q => + val qkey = Array(stageId, stageAttemptId, quantileToString(q)) + asOption(store.read(classOf[CachedQuantile], qkey)).filter(_.taskCount == count) + } + + // If there are no missing quantiles, return the data. Otherwise, just compute everything + // to make the code simpler. 
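The cache lookup just above keys each entry by (stageId, stageAttemptId, quantile string) and only trusts it when its recorded task count still matches the current count, so stale entries are recomputed rather than served. A small sketch of that validity check, using an in-memory Map as a stand-in for the KVStore; the Map and the names below are illustrative assumptions, not the patch's API:

object CachedQuantileSketch {
  final case class Cached(quantile: String, taskCount: Long, executorRunTime: Double)

  // Stand-in for store.read(classOf[CachedQuantile], key): None when absent.
  type Store = Map[(Int, Int, String), Cached]

  def lookup(
      store: Store,
      stageId: Int,
      attemptId: Int,
      quantile: String,
      currentTaskCount: Long): Option[Cached] =
    store.get((stageId, attemptId, quantile)).filter(_.taskCount == currentTaskCount)

  def main(args: Array[String]): Unit = {
    val store: Store = Map((1, 0, "50") -> Cached("50", taskCount = 100L, executorRunTime = 2500.0))
    println(lookup(store, 1, 0, "50", currentTaskCount = 100L)) // Some(...): counts match
    println(lookup(store, 1, 0, "50", currentTaskCount = 120L)) // None: task set changed, recompute
  }
}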
+ if (cachedQuantiles.size == quantiles.size) { + def toValues(fn: CachedQuantile => Double): IndexedSeq[Double] = cachedQuantiles.map(fn) + + val distributions = new v1.TaskMetricDistributions( + quantiles = quantiles, + executorDeserializeTime = toValues(_.executorDeserializeTime), + executorDeserializeCpuTime = toValues(_.executorDeserializeCpuTime), + executorRunTime = toValues(_.executorRunTime), + executorCpuTime = toValues(_.executorCpuTime), + resultSize = toValues(_.resultSize), + jvmGcTime = toValues(_.jvmGcTime), + resultSerializationTime = toValues(_.resultSerializationTime), + gettingResultTime = toValues(_.gettingResultTime), + schedulerDelay = toValues(_.schedulerDelay), + peakExecutionMemory = toValues(_.peakExecutionMemory), + memoryBytesSpilled = toValues(_.memoryBytesSpilled), + diskBytesSpilled = toValues(_.diskBytesSpilled), + inputMetrics = new v1.InputMetricDistributions( + toValues(_.bytesRead), + toValues(_.recordsRead)), + outputMetrics = new v1.OutputMetricDistributions( + toValues(_.bytesWritten), + toValues(_.recordsWritten)), + shuffleReadMetrics = new v1.ShuffleReadMetricDistributions( + toValues(_.shuffleReadBytes), + toValues(_.shuffleRecordsRead), + toValues(_.shuffleRemoteBlocksFetched), + toValues(_.shuffleLocalBlocksFetched), + toValues(_.shuffleFetchWaitTime), + toValues(_.shuffleRemoteBytesRead), + toValues(_.shuffleRemoteBytesReadToDisk), + toValues(_.shuffleTotalBlocksFetched)), + shuffleWriteMetrics = new v1.ShuffleWriteMetricDistributions( + toValues(_.shuffleWriteBytes), + toValues(_.shuffleWriteRecords), + toValues(_.shuffleWriteTime))) + + return Some(distributions) + } + + // Compute quantiles by scanning the tasks in the store. This is not really stable for live + // stages (e.g. the number of recorded tasks may change while this code is running), but should + // stabilize once the stage finishes. It's also slow, especially with disk stores. 
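The scan that follows turns each requested quantile q into an index min((q * count).toLong, count - 1) into a metric's sorted KVStore index, then walks that index once, skipping the gaps between requested positions instead of materializing every value. A self-contained sketch of the same selection logic over a plain sorted sequence; the data and names are illustrative:

object QuantileScanSketch {
  // Pick the value at each quantile's index from an ascending sequence,
  // advancing a single iterator and skipping the gaps, as the store scan does.
  def quantileValues(sorted: IndexedSeq[Long], quantiles: Seq[Double]): Seq[Double] = {
    val count = sorted.size.toLong
    val indices = quantiles.map { q => math.min((q * count).toLong, count - 1) }
    val it = sorted.iterator

    def skip(n: Long): Unit = {
      var i = 0L
      while (i < n && it.hasNext) { it.next(); i += 1 }
    }

    var currentIdx = -1L
    var last = Double.NaN
    indices.map { idx =>
      if (idx == currentIdx) {
        last
      } else {
        skip(idx - currentIdx - 1)
        currentIdx = idx
        last = it.next().toDouble
        last
      }
    }
  }

  def main(args: Array[String]): Unit = {
    val runTimes = (1L to 100L).toIndexedSeq
    println(quantileValues(runTimes, Seq(0.0, 0.25, 0.5, 0.75, 1.0)))
    // List(1.0, 26.0, 51.0, 76.0, 100.0)
  }
}

Only quantiles that land on a 0.05 step (shouldCacheQuantile, defined later in this file) are written back as CachedQuantile rows, which is why sticking to those values lets later requests skip this full scan.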
+ val indices = quantiles.map { q => math.min((q * count).toLong, count - 1) } + + def scanTasks(index: String)(fn: TaskDataWrapper => Long): IndexedSeq[Double] = { + Utils.tryWithResource( + store.view(classOf[TaskDataWrapper]) + .parent(stageKey) + .index(index) + .first(0L) + .closeableIterator() + ) { it => + var last = Double.NaN + var currentIdx = -1L + indices.map { idx => + if (idx == currentIdx) { + last + } else { + val diff = idx - currentIdx + currentIdx = idx + if (it.skip(diff - 1)) { + last = fn(it.next()).toDouble + last + } else { + Double.NaN + } + } + }.toIndexedSeq + } + } + + val computedQuantiles = new v1.TaskMetricDistributions( quantiles = quantiles, - executorDeserializeTime = metricQuantiles(_.executorDeserializeTime), - executorDeserializeCpuTime = metricQuantiles(_.executorDeserializeCpuTime), - executorRunTime = metricQuantiles(_.executorRunTime), - executorCpuTime = metricQuantiles(_.executorCpuTime), - resultSize = metricQuantiles(_.resultSize), - jvmGcTime = metricQuantiles(_.jvmGcTime), - resultSerializationTime = metricQuantiles(_.resultSerializationTime), - memoryBytesSpilled = metricQuantiles(_.memoryBytesSpilled), - diskBytesSpilled = metricQuantiles(_.diskBytesSpilled), - inputMetrics = inputMetrics, - outputMetrics = outputMetrics, - shuffleReadMetrics = shuffleReadMetrics, - shuffleWriteMetrics = shuffleWriteMetrics - ) + executorDeserializeTime = scanTasks(TaskIndexNames.DESER_TIME) { t => + t.executorDeserializeTime + }, + executorDeserializeCpuTime = scanTasks(TaskIndexNames.DESER_CPU_TIME) { t => + t.executorDeserializeCpuTime + }, + executorRunTime = scanTasks(TaskIndexNames.EXEC_RUN_TIME) { t => t.executorRunTime }, + executorCpuTime = scanTasks(TaskIndexNames.EXEC_CPU_TIME) { t => t.executorCpuTime }, + resultSize = scanTasks(TaskIndexNames.RESULT_SIZE) { t => t.resultSize }, + jvmGcTime = scanTasks(TaskIndexNames.GC_TIME) { t => t.jvmGcTime }, + resultSerializationTime = scanTasks(TaskIndexNames.SER_TIME) { t => + t.resultSerializationTime + }, + gettingResultTime = scanTasks(TaskIndexNames.GETTING_RESULT_TIME) { t => + t.gettingResultTime + }, + schedulerDelay = scanTasks(TaskIndexNames.SCHEDULER_DELAY) { t => t.schedulerDelay }, + peakExecutionMemory = scanTasks(TaskIndexNames.PEAK_MEM) { t => t.peakExecutionMemory }, + memoryBytesSpilled = scanTasks(TaskIndexNames.MEM_SPILL) { t => t.memoryBytesSpilled }, + diskBytesSpilled = scanTasks(TaskIndexNames.DISK_SPILL) { t => t.diskBytesSpilled }, + inputMetrics = new v1.InputMetricDistributions( + scanTasks(TaskIndexNames.INPUT_SIZE) { t => t.inputBytesRead }, + scanTasks(TaskIndexNames.INPUT_RECORDS) { t => t.inputRecordsRead }), + outputMetrics = new v1.OutputMetricDistributions( + scanTasks(TaskIndexNames.OUTPUT_SIZE) { t => t.outputBytesWritten }, + scanTasks(TaskIndexNames.OUTPUT_RECORDS) { t => t.outputRecordsWritten }), + shuffleReadMetrics = new v1.ShuffleReadMetricDistributions( + scanTasks(TaskIndexNames.SHUFFLE_TOTAL_READS) { m => + m.shuffleLocalBytesRead + m.shuffleRemoteBytesRead + }, + scanTasks(TaskIndexNames.SHUFFLE_READ_RECORDS) { t => t.shuffleRecordsRead }, + scanTasks(TaskIndexNames.SHUFFLE_REMOTE_BLOCKS) { t => t.shuffleRemoteBlocksFetched }, + scanTasks(TaskIndexNames.SHUFFLE_LOCAL_BLOCKS) { t => t.shuffleLocalBlocksFetched }, + scanTasks(TaskIndexNames.SHUFFLE_READ_TIME) { t => t.shuffleFetchWaitTime }, + scanTasks(TaskIndexNames.SHUFFLE_REMOTE_READS) { t => t.shuffleRemoteBytesRead }, + scanTasks(TaskIndexNames.SHUFFLE_REMOTE_READS_TO_DISK) { t => + 
t.shuffleRemoteBytesReadToDisk + }, + scanTasks(TaskIndexNames.SHUFFLE_TOTAL_BLOCKS) { m => + m.shuffleLocalBlocksFetched + m.shuffleRemoteBlocksFetched + }), + shuffleWriteMetrics = new v1.ShuffleWriteMetricDistributions( + scanTasks(TaskIndexNames.SHUFFLE_WRITE_SIZE) { t => t.shuffleBytesWritten }, + scanTasks(TaskIndexNames.SHUFFLE_WRITE_RECORDS) { t => t.shuffleRecordsWritten }, + scanTasks(TaskIndexNames.SHUFFLE_WRITE_TIME) { t => t.shuffleWriteTime })) + + // Go through the computed quantiles and cache the values that match the caching criteria. + computedQuantiles.quantiles.zipWithIndex + .filter { case (q, _) => quantiles.contains(q) && shouldCacheQuantile(q) } + .foreach { case (q, idx) => + val cached = new CachedQuantile(stageId, stageAttemptId, quantileToString(q), count, + executorDeserializeTime = computedQuantiles.executorDeserializeTime(idx), + executorDeserializeCpuTime = computedQuantiles.executorDeserializeCpuTime(idx), + executorRunTime = computedQuantiles.executorRunTime(idx), + executorCpuTime = computedQuantiles.executorCpuTime(idx), + resultSize = computedQuantiles.resultSize(idx), + jvmGcTime = computedQuantiles.jvmGcTime(idx), + resultSerializationTime = computedQuantiles.resultSerializationTime(idx), + gettingResultTime = computedQuantiles.gettingResultTime(idx), + schedulerDelay = computedQuantiles.schedulerDelay(idx), + peakExecutionMemory = computedQuantiles.peakExecutionMemory(idx), + memoryBytesSpilled = computedQuantiles.memoryBytesSpilled(idx), + diskBytesSpilled = computedQuantiles.diskBytesSpilled(idx), + + bytesRead = computedQuantiles.inputMetrics.bytesRead(idx), + recordsRead = computedQuantiles.inputMetrics.recordsRead(idx), + + bytesWritten = computedQuantiles.outputMetrics.bytesWritten(idx), + recordsWritten = computedQuantiles.outputMetrics.recordsWritten(idx), + + shuffleReadBytes = computedQuantiles.shuffleReadMetrics.readBytes(idx), + shuffleRecordsRead = computedQuantiles.shuffleReadMetrics.readRecords(idx), + shuffleRemoteBlocksFetched = + computedQuantiles.shuffleReadMetrics.remoteBlocksFetched(idx), + shuffleLocalBlocksFetched = computedQuantiles.shuffleReadMetrics.localBlocksFetched(idx), + shuffleFetchWaitTime = computedQuantiles.shuffleReadMetrics.fetchWaitTime(idx), + shuffleRemoteBytesRead = computedQuantiles.shuffleReadMetrics.remoteBytesRead(idx), + shuffleRemoteBytesReadToDisk = + computedQuantiles.shuffleReadMetrics.remoteBytesReadToDisk(idx), + shuffleTotalBlocksFetched = computedQuantiles.shuffleReadMetrics.totalBlocksFetched(idx), + + shuffleWriteBytes = computedQuantiles.shuffleWriteMetrics.writeBytes(idx), + shuffleWriteRecords = computedQuantiles.shuffleWriteMetrics.writeRecords(idx), + shuffleWriteTime = computedQuantiles.shuffleWriteMetrics.writeTime(idx)) + store.write(cached) + } + + Some(computedQuantiles) } + /** + * Whether to cache information about a specific metric quantile. We cache quantiles at every 0.05 + * step, which covers the default values used both in the API and in the stages page. 
+ */ + private def shouldCacheQuantile(q: Double): Boolean = (math.round(q * 100) % 5) == 0 + + private def quantileToString(q: Double): String = math.round(q * 100).toString + def taskList(stageId: Int, stageAttemptId: Int, maxTasks: Int): Seq[v1.TaskData] = { val stageKey = Array(stageId, stageAttemptId) store.view(classOf[TaskDataWrapper]).index("stage").first(stageKey).last(stageKey).reverse() - .max(maxTasks).asScala.map(_.info).toSeq.reverse + .max(maxTasks).asScala.map(_.toApi).toSeq.reverse } def taskList( @@ -219,18 +354,43 @@ private[spark] class AppStatusStore( offset: Int, length: Int, sortBy: v1.TaskSorting): Seq[v1.TaskData] = { + val (indexName, ascending) = sortBy match { + case v1.TaskSorting.ID => + (None, true) + case v1.TaskSorting.INCREASING_RUNTIME => + (Some(TaskIndexNames.EXEC_RUN_TIME), true) + case v1.TaskSorting.DECREASING_RUNTIME => + (Some(TaskIndexNames.EXEC_RUN_TIME), false) + } + taskList(stageId, stageAttemptId, offset, length, indexName, ascending) + } + + def taskList( + stageId: Int, + stageAttemptId: Int, + offset: Int, + length: Int, + sortBy: Option[String], + ascending: Boolean): Seq[v1.TaskData] = { val stageKey = Array(stageId, stageAttemptId) val base = store.view(classOf[TaskDataWrapper]) val indexed = sortBy match { - case v1.TaskSorting.ID => + case Some(index) => + base.index(index).parent(stageKey) + + case _ => + // Sort by ID, which is the "stage" index. base.index("stage").first(stageKey).last(stageKey) - case v1.TaskSorting.INCREASING_RUNTIME => - base.index("runtime").first(stageKey ++ Array(-1L)).last(stageKey ++ Array(Long.MaxValue)) - case v1.TaskSorting.DECREASING_RUNTIME => - base.index("runtime").first(stageKey ++ Array(Long.MaxValue)).last(stageKey ++ Array(-1L)) - .reverse() } - indexed.skip(offset).max(length).asScala.map(_.info).toSeq + + val ordered = if (ascending) indexed else indexed.reverse() + ordered.skip(offset).max(length).asScala.map(_.toApi).toSeq + } + + def executorSummary(stageId: Int, attemptId: Int): Map[String, v1.ExecutorStageSummary] = { + val stageKey = Array(stageId, attemptId) + store.view(classOf[ExecutorStageSummaryWrapper]).index("stage").first(stageKey).last(stageKey) + .asScala.map { exec => (exec.executorId -> exec.info) }.toMap } def rddList(cachedOnly: Boolean = true): Seq[v1.RDDStorageInfo] = { @@ -256,12 +416,6 @@ private[spark] class AppStatusStore( .map { t => (t.taskId, t) } .toMap - val stageKey = Array(stage.stageId, stage.attemptId) - val execs = store.view(classOf[ExecutorStageSummaryWrapper]).index("stage").first(stageKey) - .last(stageKey).closeableIterator().asScala - .map { exec => (exec.executorId -> exec.info) } - .toMap - new v1.StageData( stage.status, stage.stageId, @@ -295,7 +449,7 @@ private[spark] class AppStatusStore( stage.rddIds, stage.accumulatorUpdates, Some(tasks), - Some(execs), + Some(executorSummary(stage.stageId, stage.attemptId)), stage.killedTasksSummary) } @@ -352,22 +506,3 @@ private[spark] object AppStatusStore { } } - -/** - * Helper for getting distributions from nested metric types. 
- */ -private abstract class MetricHelper[I, O]( - rawMetrics: Seq[v1.TaskMetrics], - quantiles: Array[Double]) { - - def getSubmetrics(raw: v1.TaskMetrics): I - - def build: O - - val data: Seq[I] = rawMetrics.map(getSubmetrics) - - /** applies the given function to all input metrics, and returns the quantiles */ - def submetricQuantiles(f: I => Double): IndexedSeq[Double] = { - Distribution(data.map { d => f(d) }).get.getQuantiles(quantiles) - } -} diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusUtils.scala b/core/src/main/scala/org/apache/spark/status/AppStatusUtils.scala new file mode 100644 index 0000000000000..341bd4e0cd016 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/AppStatusUtils.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.status + +import org.apache.spark.status.api.v1.{TaskData, TaskMetrics} + +private[spark] object AppStatusUtils { + + def schedulerDelay(task: TaskData): Long = { + if (task.taskMetrics.isDefined && task.duration.isDefined) { + val m = task.taskMetrics.get + schedulerDelay(task.launchTime.getTime(), fetchStart(task), task.duration.get, + m.executorDeserializeTime, m.resultSerializationTime, m.executorRunTime) + } else { + 0L + } + } + + def gettingResultTime(task: TaskData): Long = { + gettingResultTime(task.launchTime.getTime(), fetchStart(task), task.duration.getOrElse(-1L)) + } + + def schedulerDelay( + launchTime: Long, + fetchStart: Long, + duration: Long, + deserializeTime: Long, + serializeTime: Long, + runTime: Long): Long = { + math.max(0, duration - runTime - deserializeTime - serializeTime - + gettingResultTime(launchTime, fetchStart, duration)) + } + + def gettingResultTime(launchTime: Long, fetchStart: Long, duration: Long): Long = { + if (fetchStart > 0) { + if (duration > 0) { + launchTime + duration - fetchStart + } else { + System.currentTimeMillis() - fetchStart + } + } else { + 0L + } + } + + private def fetchStart(task: TaskData): Long = { + if (task.resultFetchStart.isDefined) { + task.resultFetchStart.get.getTime() + } else { + -1 + } + } +} diff --git a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala index 305c2fafa6aac..4295e664e131c 100644 --- a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala +++ b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala @@ -22,6 +22,8 @@ import java.util.concurrent.atomic.AtomicInteger import scala.collection.mutable.HashMap +import com.google.common.collect.Interners + import org.apache.spark.JobExecutionStatus import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler.{AccumulableInfo, StageInfo, TaskInfo} @@ -119,7 +121,9 @@ private class LiveTask( 
import LiveEntityHelpers._ - private var recordedMetrics: v1.TaskMetrics = null + // The task metrics use a special value when no metrics have been reported. The special value is + // checked when calculating indexed values when writing to the store (see [[TaskDataWrapper]]). + private var metrics: v1.TaskMetrics = createMetrics(default = -1L) var errorMessage: Option[String] = None @@ -129,8 +133,8 @@ private class LiveTask( */ def updateMetrics(metrics: TaskMetrics): v1.TaskMetrics = { if (metrics != null) { - val old = recordedMetrics - recordedMetrics = new v1.TaskMetrics( + val old = this.metrics + val newMetrics = createMetrics( metrics.executorDeserializeTime, metrics.executorDeserializeCpuTime, metrics.executorRunTime, @@ -141,73 +145,35 @@ private class LiveTask( metrics.memoryBytesSpilled, metrics.diskBytesSpilled, metrics.peakExecutionMemory, - new v1.InputMetrics( - metrics.inputMetrics.bytesRead, - metrics.inputMetrics.recordsRead), - new v1.OutputMetrics( - metrics.outputMetrics.bytesWritten, - metrics.outputMetrics.recordsWritten), - new v1.ShuffleReadMetrics( - metrics.shuffleReadMetrics.remoteBlocksFetched, - metrics.shuffleReadMetrics.localBlocksFetched, - metrics.shuffleReadMetrics.fetchWaitTime, - metrics.shuffleReadMetrics.remoteBytesRead, - metrics.shuffleReadMetrics.remoteBytesReadToDisk, - metrics.shuffleReadMetrics.localBytesRead, - metrics.shuffleReadMetrics.recordsRead), - new v1.ShuffleWriteMetrics( - metrics.shuffleWriteMetrics.bytesWritten, - metrics.shuffleWriteMetrics.writeTime, - metrics.shuffleWriteMetrics.recordsWritten)) - if (old != null) calculateMetricsDelta(recordedMetrics, old) else recordedMetrics + metrics.inputMetrics.bytesRead, + metrics.inputMetrics.recordsRead, + metrics.outputMetrics.bytesWritten, + metrics.outputMetrics.recordsWritten, + metrics.shuffleReadMetrics.remoteBlocksFetched, + metrics.shuffleReadMetrics.localBlocksFetched, + metrics.shuffleReadMetrics.fetchWaitTime, + metrics.shuffleReadMetrics.remoteBytesRead, + metrics.shuffleReadMetrics.remoteBytesReadToDisk, + metrics.shuffleReadMetrics.localBytesRead, + metrics.shuffleReadMetrics.recordsRead, + metrics.shuffleWriteMetrics.bytesWritten, + metrics.shuffleWriteMetrics.writeTime, + metrics.shuffleWriteMetrics.recordsWritten) + + this.metrics = newMetrics + + // Only calculate the delta if the old metrics contain valid information, otherwise + // the new metrics are the delta. + if (old.executorDeserializeTime >= 0L) { + subtractMetrics(newMetrics, old) + } else { + newMetrics + } } else { null } } - /** - * Return a new TaskMetrics object containing the delta of the various fields of the given - * metrics objects. This is currently targeted at updating stage data, so it does not - * necessarily calculate deltas for all the fields. 
- */ - private def calculateMetricsDelta( - metrics: v1.TaskMetrics, - old: v1.TaskMetrics): v1.TaskMetrics = { - val shuffleWriteDelta = new v1.ShuffleWriteMetrics( - metrics.shuffleWriteMetrics.bytesWritten - old.shuffleWriteMetrics.bytesWritten, - 0L, - metrics.shuffleWriteMetrics.recordsWritten - old.shuffleWriteMetrics.recordsWritten) - - val shuffleReadDelta = new v1.ShuffleReadMetrics( - 0L, 0L, 0L, - metrics.shuffleReadMetrics.remoteBytesRead - old.shuffleReadMetrics.remoteBytesRead, - metrics.shuffleReadMetrics.remoteBytesReadToDisk - - old.shuffleReadMetrics.remoteBytesReadToDisk, - metrics.shuffleReadMetrics.localBytesRead - old.shuffleReadMetrics.localBytesRead, - metrics.shuffleReadMetrics.recordsRead - old.shuffleReadMetrics.recordsRead) - - val inputDelta = new v1.InputMetrics( - metrics.inputMetrics.bytesRead - old.inputMetrics.bytesRead, - metrics.inputMetrics.recordsRead - old.inputMetrics.recordsRead) - - val outputDelta = new v1.OutputMetrics( - metrics.outputMetrics.bytesWritten - old.outputMetrics.bytesWritten, - metrics.outputMetrics.recordsWritten - old.outputMetrics.recordsWritten) - - new v1.TaskMetrics( - 0L, 0L, - metrics.executorRunTime - old.executorRunTime, - metrics.executorCpuTime - old.executorCpuTime, - 0L, 0L, 0L, - metrics.memoryBytesSpilled - old.memoryBytesSpilled, - metrics.diskBytesSpilled - old.diskBytesSpilled, - 0L, - inputDelta, - outputDelta, - shuffleReadDelta, - shuffleWriteDelta) - } - override protected def doUpdate(): Any = { val duration = if (info.finished) { info.duration @@ -215,22 +181,48 @@ private class LiveTask( info.timeRunning(lastUpdateTime.getOrElse(System.currentTimeMillis())) } - val task = new v1.TaskData( + new TaskDataWrapper( info.taskId, info.index, info.attemptNumber, - new Date(info.launchTime), - if (info.gettingResult) Some(new Date(info.gettingResultTime)) else None, - Some(duration), - info.executorId, - info.host, - info.status, - info.taskLocality.toString(), + info.launchTime, + if (info.gettingResult) info.gettingResultTime else -1L, + duration, + weakIntern(info.executorId), + weakIntern(info.host), + weakIntern(info.status), + weakIntern(info.taskLocality.toString()), info.speculative, newAccumulatorInfos(info.accumulables), errorMessage, - Option(recordedMetrics)) - new TaskDataWrapper(task, stageId, stageAttemptId) + + metrics.executorDeserializeTime, + metrics.executorDeserializeCpuTime, + metrics.executorRunTime, + metrics.executorCpuTime, + metrics.resultSize, + metrics.jvmGcTime, + metrics.resultSerializationTime, + metrics.memoryBytesSpilled, + metrics.diskBytesSpilled, + metrics.peakExecutionMemory, + metrics.inputMetrics.bytesRead, + metrics.inputMetrics.recordsRead, + metrics.outputMetrics.bytesWritten, + metrics.outputMetrics.recordsWritten, + metrics.shuffleReadMetrics.remoteBlocksFetched, + metrics.shuffleReadMetrics.localBlocksFetched, + metrics.shuffleReadMetrics.fetchWaitTime, + metrics.shuffleReadMetrics.remoteBytesRead, + metrics.shuffleReadMetrics.remoteBytesReadToDisk, + metrics.shuffleReadMetrics.localBytesRead, + metrics.shuffleReadMetrics.recordsRead, + metrics.shuffleWriteMetrics.bytesWritten, + metrics.shuffleWriteMetrics.writeTime, + metrics.shuffleWriteMetrics.recordsWritten, + + stageId, + stageAttemptId) } } @@ -313,50 +305,19 @@ private class LiveExecutor(val executorId: String, _addTime: Long) extends LiveE } -/** Metrics tracked per stage (both total and per executor). 
*/ -private class MetricsTracker { - var executorRunTime = 0L - var executorCpuTime = 0L - var inputBytes = 0L - var inputRecords = 0L - var outputBytes = 0L - var outputRecords = 0L - var shuffleReadBytes = 0L - var shuffleReadRecords = 0L - var shuffleWriteBytes = 0L - var shuffleWriteRecords = 0L - var memoryBytesSpilled = 0L - var diskBytesSpilled = 0L - - def update(delta: v1.TaskMetrics): Unit = { - executorRunTime += delta.executorRunTime - executorCpuTime += delta.executorCpuTime - inputBytes += delta.inputMetrics.bytesRead - inputRecords += delta.inputMetrics.recordsRead - outputBytes += delta.outputMetrics.bytesWritten - outputRecords += delta.outputMetrics.recordsWritten - shuffleReadBytes += delta.shuffleReadMetrics.localBytesRead + - delta.shuffleReadMetrics.remoteBytesRead - shuffleReadRecords += delta.shuffleReadMetrics.recordsRead - shuffleWriteBytes += delta.shuffleWriteMetrics.bytesWritten - shuffleWriteRecords += delta.shuffleWriteMetrics.recordsWritten - memoryBytesSpilled += delta.memoryBytesSpilled - diskBytesSpilled += delta.diskBytesSpilled - } - -} - private class LiveExecutorStageSummary( stageId: Int, attemptId: Int, executorId: String) extends LiveEntity { + import LiveEntityHelpers._ + var taskTime = 0L var succeededTasks = 0 var failedTasks = 0 var killedTasks = 0 - val metrics = new MetricsTracker() + var metrics = createMetrics(default = 0L) override protected def doUpdate(): Any = { val info = new v1.ExecutorStageSummary( @@ -364,14 +325,14 @@ private class LiveExecutorStageSummary( failedTasks, succeededTasks, killedTasks, - metrics.inputBytes, - metrics.inputRecords, - metrics.outputBytes, - metrics.outputRecords, - metrics.shuffleReadBytes, - metrics.shuffleReadRecords, - metrics.shuffleWriteBytes, - metrics.shuffleWriteRecords, + metrics.inputMetrics.bytesRead, + metrics.inputMetrics.recordsRead, + metrics.outputMetrics.bytesWritten, + metrics.outputMetrics.recordsWritten, + metrics.shuffleReadMetrics.remoteBytesRead + metrics.shuffleReadMetrics.localBytesRead, + metrics.shuffleReadMetrics.recordsRead, + metrics.shuffleWriteMetrics.bytesWritten, + metrics.shuffleWriteMetrics.recordsWritten, metrics.memoryBytesSpilled, metrics.diskBytesSpilled) new ExecutorStageSummaryWrapper(stageId, attemptId, executorId, info) @@ -402,7 +363,9 @@ private class LiveStage extends LiveEntity { var firstLaunchTime = Long.MaxValue - val metrics = new MetricsTracker() + var localitySummary: Map[String, Long] = Map() + + var metrics = createMetrics(default = 0L) val executorSummaries = new HashMap[String, LiveExecutorStageSummary]() @@ -435,14 +398,14 @@ private class LiveStage extends LiveEntity { info.completionTime.map(new Date(_)), info.failureReason, - metrics.inputBytes, - metrics.inputRecords, - metrics.outputBytes, - metrics.outputRecords, - metrics.shuffleReadBytes, - metrics.shuffleReadRecords, - metrics.shuffleWriteBytes, - metrics.shuffleWriteRecords, + metrics.inputMetrics.bytesRead, + metrics.inputMetrics.recordsRead, + metrics.outputMetrics.bytesWritten, + metrics.outputMetrics.recordsWritten, + metrics.shuffleReadMetrics.localBytesRead + metrics.shuffleReadMetrics.remoteBytesRead, + metrics.shuffleReadMetrics.recordsRead, + metrics.shuffleWriteMetrics.bytesWritten, + metrics.shuffleWriteMetrics.recordsWritten, metrics.memoryBytesSpilled, metrics.diskBytesSpilled, @@ -459,13 +422,15 @@ private class LiveStage extends LiveEntity { } override protected def doUpdate(): Any = { - new StageDataWrapper(toApi(), jobIds) + new StageDataWrapper(toApi(), jobIds, 
localitySummary) } } private class LiveRDDPartition(val blockName: String) { + import LiveEntityHelpers._ + // Pointers used by RDDPartitionSeq. @volatile var prev: LiveRDDPartition = null @volatile var next: LiveRDDPartition = null @@ -485,7 +450,7 @@ private class LiveRDDPartition(val blockName: String) { diskUsed: Long): Unit = { value = new v1.RDDPartitionInfo( blockName, - storageLevel, + weakIntern(storageLevel), memoryUsed, diskUsed, executors) @@ -495,6 +460,8 @@ private class LiveRDDPartition(val blockName: String) { private class LiveRDDDistribution(exec: LiveExecutor) { + import LiveEntityHelpers._ + val executorId = exec.executorId var memoryUsed = 0L var diskUsed = 0L @@ -508,7 +475,7 @@ private class LiveRDDDistribution(exec: LiveExecutor) { def toApi(): v1.RDDDataDistribution = { if (lastUpdate == null) { lastUpdate = new v1.RDDDataDistribution( - exec.hostPort, + weakIntern(exec.hostPort), memoryUsed, exec.maxMemory - exec.memoryUsed, diskUsed, @@ -524,7 +491,9 @@ private class LiveRDDDistribution(exec: LiveExecutor) { private class LiveRDD(val info: RDDInfo) extends LiveEntity { - var storageLevel: String = info.storageLevel.description + import LiveEntityHelpers._ + + var storageLevel: String = weakIntern(info.storageLevel.description) var memoryUsed = 0L var diskUsed = 0L @@ -533,6 +502,10 @@ private class LiveRDD(val info: RDDInfo) extends LiveEntity { private val distributions = new HashMap[String, LiveRDDDistribution]() + def setStorageLevel(level: String): Unit = { + this.storageLevel = weakIntern(level) + } + def partition(blockName: String): LiveRDDPartition = { partitions.getOrElseUpdate(blockName, { val part = new LiveRDDPartition(blockName) @@ -593,6 +566,9 @@ private class SchedulerPool(name: String) extends LiveEntity { private object LiveEntityHelpers { + private val stringInterner = Interners.newWeakInterner[String]() + + def newAccumulatorInfos(accums: Iterable[AccumulableInfo]): Seq[v1.AccumulableInfo] = { accums .filter { acc => @@ -604,13 +580,119 @@ private object LiveEntityHelpers { .map { acc => new v1.AccumulableInfo( acc.id, - acc.name.orNull, + acc.name.map(weakIntern).orNull, acc.update.map(_.toString()), acc.value.map(_.toString()).orNull) } .toSeq } + /** String interning to reduce the memory usage. 
*/ + def weakIntern(s: String): String = { + stringInterner.intern(s) + } + + // scalastyle:off argcount + def createMetrics( + executorDeserializeTime: Long, + executorDeserializeCpuTime: Long, + executorRunTime: Long, + executorCpuTime: Long, + resultSize: Long, + jvmGcTime: Long, + resultSerializationTime: Long, + memoryBytesSpilled: Long, + diskBytesSpilled: Long, + peakExecutionMemory: Long, + inputBytesRead: Long, + inputRecordsRead: Long, + outputBytesWritten: Long, + outputRecordsWritten: Long, + shuffleRemoteBlocksFetched: Long, + shuffleLocalBlocksFetched: Long, + shuffleFetchWaitTime: Long, + shuffleRemoteBytesRead: Long, + shuffleRemoteBytesReadToDisk: Long, + shuffleLocalBytesRead: Long, + shuffleRecordsRead: Long, + shuffleBytesWritten: Long, + shuffleWriteTime: Long, + shuffleRecordsWritten: Long): v1.TaskMetrics = { + new v1.TaskMetrics( + executorDeserializeTime, + executorDeserializeCpuTime, + executorRunTime, + executorCpuTime, + resultSize, + jvmGcTime, + resultSerializationTime, + memoryBytesSpilled, + diskBytesSpilled, + peakExecutionMemory, + new v1.InputMetrics( + inputBytesRead, + inputRecordsRead), + new v1.OutputMetrics( + outputBytesWritten, + outputRecordsWritten), + new v1.ShuffleReadMetrics( + shuffleRemoteBlocksFetched, + shuffleLocalBlocksFetched, + shuffleFetchWaitTime, + shuffleRemoteBytesRead, + shuffleRemoteBytesReadToDisk, + shuffleLocalBytesRead, + shuffleRecordsRead), + new v1.ShuffleWriteMetrics( + shuffleBytesWritten, + shuffleWriteTime, + shuffleRecordsWritten)) + } + // scalastyle:on argcount + + def createMetrics(default: Long): v1.TaskMetrics = { + createMetrics(default, default, default, default, default, default, default, default, + default, default, default, default, default, default, default, default, + default, default, default, default, default, default, default, default) + } + + /** Add m2 values to m1. */ + def addMetrics(m1: v1.TaskMetrics, m2: v1.TaskMetrics): v1.TaskMetrics = addMetrics(m1, m2, 1) + + /** Subtract m2 values from m1. 
*/ + def subtractMetrics(m1: v1.TaskMetrics, m2: v1.TaskMetrics): v1.TaskMetrics = { + addMetrics(m1, m2, -1) + } + + private def addMetrics(m1: v1.TaskMetrics, m2: v1.TaskMetrics, mult: Int): v1.TaskMetrics = { + createMetrics( + m1.executorDeserializeTime + m2.executorDeserializeTime * mult, + m1.executorDeserializeCpuTime + m2.executorDeserializeCpuTime * mult, + m1.executorRunTime + m2.executorRunTime * mult, + m1.executorCpuTime + m2.executorCpuTime * mult, + m1.resultSize + m2.resultSize * mult, + m1.jvmGcTime + m2.jvmGcTime * mult, + m1.resultSerializationTime + m2.resultSerializationTime * mult, + m1.memoryBytesSpilled + m2.memoryBytesSpilled * mult, + m1.diskBytesSpilled + m2.diskBytesSpilled * mult, + m1.peakExecutionMemory + m2.peakExecutionMemory * mult, + m1.inputMetrics.bytesRead + m2.inputMetrics.bytesRead * mult, + m1.inputMetrics.recordsRead + m2.inputMetrics.recordsRead * mult, + m1.outputMetrics.bytesWritten + m2.outputMetrics.bytesWritten * mult, + m1.outputMetrics.recordsWritten + m2.outputMetrics.recordsWritten * mult, + m1.shuffleReadMetrics.remoteBlocksFetched + m2.shuffleReadMetrics.remoteBlocksFetched * mult, + m1.shuffleReadMetrics.localBlocksFetched + m2.shuffleReadMetrics.localBlocksFetched * mult, + m1.shuffleReadMetrics.fetchWaitTime + m2.shuffleReadMetrics.fetchWaitTime * mult, + m1.shuffleReadMetrics.remoteBytesRead + m2.shuffleReadMetrics.remoteBytesRead * mult, + m1.shuffleReadMetrics.remoteBytesReadToDisk + + m2.shuffleReadMetrics.remoteBytesReadToDisk * mult, + m1.shuffleReadMetrics.localBytesRead + m2.shuffleReadMetrics.localBytesRead * mult, + m1.shuffleReadMetrics.recordsRead + m2.shuffleReadMetrics.recordsRead * mult, + m1.shuffleWriteMetrics.bytesWritten + m2.shuffleWriteMetrics.bytesWritten * mult, + m1.shuffleWriteMetrics.writeTime + m2.shuffleWriteMetrics.writeTime * mult, + m1.shuffleWriteMetrics.recordsWritten + m2.shuffleWriteMetrics.recordsWritten * mult) + } + } /** diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala index 3b879545b3d2e..96249e4bfd5fa 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala @@ -87,7 +87,8 @@ private[v1] class StagesResource extends BaseAppResource { } } - ui.store.taskSummary(stageId, stageAttemptId, quantiles) + ui.store.taskSummary(stageId, stageAttemptId, quantiles).getOrElse( + throw new NotFoundException(s"No tasks reported metrics for $stageId / $stageAttemptId yet.")) } @GET diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala index 45eaf935fb083..7d8e4de3c8efb 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala @@ -261,6 +261,9 @@ class TaskMetricDistributions private[spark]( val resultSize: IndexedSeq[Double], val jvmGcTime: IndexedSeq[Double], val resultSerializationTime: IndexedSeq[Double], + val gettingResultTime: IndexedSeq[Double], + val schedulerDelay: IndexedSeq[Double], + val peakExecutionMemory: IndexedSeq[Double], val memoryBytesSpilled: IndexedSeq[Double], val diskBytesSpilled: IndexedSeq[Double], diff --git a/core/src/main/scala/org/apache/spark/status/storeTypes.scala b/core/src/main/scala/org/apache/spark/status/storeTypes.scala index 1cfd30df49091..c9cb996a55fcc 100644 --- 
a/core/src/main/scala/org/apache/spark/status/storeTypes.scala +++ b/core/src/main/scala/org/apache/spark/status/storeTypes.scala @@ -17,9 +17,11 @@ package org.apache.spark.status -import java.lang.{Integer => JInteger, Long => JLong} +import java.lang.{Long => JLong} +import java.util.Date import com.fasterxml.jackson.annotation.JsonIgnore +import com.fasterxml.jackson.databind.annotation.JsonDeserialize import org.apache.spark.status.KVUtils._ import org.apache.spark.status.api.v1._ @@ -49,10 +51,10 @@ private[spark] class ApplicationEnvironmentInfoWrapper(val info: ApplicationEnvi private[spark] class ExecutorSummaryWrapper(val info: ExecutorSummary) { @JsonIgnore @KVIndex - private[this] val id: String = info.id + private def id: String = info.id @JsonIgnore @KVIndex("active") - private[this] val active: Boolean = info.isActive + private def active: Boolean = info.isActive @JsonIgnore @KVIndex("host") val host: String = info.hostPort.split(":")(0) @@ -69,51 +71,271 @@ private[spark] class JobDataWrapper( val skippedStages: Set[Int]) { @JsonIgnore @KVIndex - private[this] val id: Int = info.jobId + private def id: Int = info.jobId } private[spark] class StageDataWrapper( val info: StageData, - val jobIds: Set[Int]) { + val jobIds: Set[Int], + @JsonDeserialize(contentAs = classOf[JLong]) + val locality: Map[String, Long]) { @JsonIgnore @KVIndex - def id: Array[Int] = Array(info.stageId, info.attemptId) + private[this] val id: Array[Int] = Array(info.stageId, info.attemptId) @JsonIgnore @KVIndex("stageId") - def stageId: Int = info.stageId + private def stageId: Int = info.stageId + @JsonIgnore @KVIndex("active") + private def active: Boolean = info.status == StageStatus.ACTIVE + +} + +/** + * Tasks have a lot of indices that are used in a few different places. This object keeps logical + * names for these indices, mapped to short strings to save space when using a disk store. + */ +private[spark] object TaskIndexNames { + final val ACCUMULATORS = "acc" + final val ATTEMPT = "att" + final val DESER_CPU_TIME = "dct" + final val DESER_TIME = "des" + final val DISK_SPILL = "dbs" + final val DURATION = "dur" + final val ERROR = "err" + final val EXECUTOR = "exe" + final val EXEC_CPU_TIME = "ect" + final val EXEC_RUN_TIME = "ert" + final val GC_TIME = "gc" + final val GETTING_RESULT_TIME = "grt" + final val INPUT_RECORDS = "ir" + final val INPUT_SIZE = "is" + final val LAUNCH_TIME = "lt" + final val LOCALITY = "loc" + final val MEM_SPILL = "mbs" + final val OUTPUT_RECORDS = "or" + final val OUTPUT_SIZE = "os" + final val PEAK_MEM = "pem" + final val RESULT_SIZE = "rs" + final val SCHEDULER_DELAY = "dly" + final val SER_TIME = "rst" + final val SHUFFLE_LOCAL_BLOCKS = "slbl" + final val SHUFFLE_READ_RECORDS = "srr" + final val SHUFFLE_READ_TIME = "srt" + final val SHUFFLE_REMOTE_BLOCKS = "srbl" + final val SHUFFLE_REMOTE_READS = "srby" + final val SHUFFLE_REMOTE_READS_TO_DISK = "srbd" + final val SHUFFLE_TOTAL_READS = "stby" + final val SHUFFLE_TOTAL_BLOCKS = "stbl" + final val SHUFFLE_WRITE_RECORDS = "swr" + final val SHUFFLE_WRITE_SIZE = "sws" + final val SHUFFLE_WRITE_TIME = "swt" + final val STAGE = "stage" + final val STATUS = "sta" + final val TASK_INDEX = "idx" } /** - * The task information is always indexed with the stage ID, since that is how the UI and API - * consume it. That means every indexed value has the stage ID and attempt ID included, aside - * from the actual data being indexed. 
+ * Unlike other data types, the task data wrapper does not keep a reference to the API's TaskData. + * That is to save memory, since for large applications there can be a large number of these + * elements (by default up to 100,000 per stage), and every bit of wasted memory adds up. + * + * It also contains many secondary indices, which are used to sort data efficiently in the UI at the + * expense of storage space (and slower write times). */ private[spark] class TaskDataWrapper( - val info: TaskData, + // Storing this as an object actually saves memory; it's also used as the key in the in-memory + // store, so in that case you'd save the extra copy of the value here. + @KVIndexParam + val taskId: JLong, + @KVIndexParam(value = TaskIndexNames.TASK_INDEX, parent = TaskIndexNames.STAGE) + val index: Int, + @KVIndexParam(value = TaskIndexNames.ATTEMPT, parent = TaskIndexNames.STAGE) + val attempt: Int, + @KVIndexParam(value = TaskIndexNames.LAUNCH_TIME, parent = TaskIndexNames.STAGE) + val launchTime: Long, + val resultFetchStart: Long, + @KVIndexParam(value = TaskIndexNames.DURATION, parent = TaskIndexNames.STAGE) + val duration: Long, + @KVIndexParam(value = TaskIndexNames.EXECUTOR, parent = TaskIndexNames.STAGE) + val executorId: String, + val host: String, + @KVIndexParam(value = TaskIndexNames.STATUS, parent = TaskIndexNames.STAGE) + val status: String, + @KVIndexParam(value = TaskIndexNames.LOCALITY, parent = TaskIndexNames.STAGE) + val taskLocality: String, + val speculative: Boolean, + val accumulatorUpdates: Seq[AccumulableInfo], + val errorMessage: Option[String], + + // The following is an exploded view of a TaskMetrics API object. This saves 5 objects + // (= 80 bytes of Java object overhead) per instance of this wrapper. If the first value + // (executorDeserializeTime) is -1L, it means the metrics for this task have not been + // recorded. 
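The flattened layout that follows stores every metric as a primitive field and uses executorDeserializeTime == -1L as the "no metrics recorded" sentinel; derived index values such as schedulerDelay are computed on demand and return -1L in the same case. A reduced sketch of that convention, using two metric fields instead of the full set and assuming gettingResultTime is zero; the class and values are illustrative, not the patch's code:

object FlattenedTaskSketch {
  // Primitive fields instead of a nested metrics object; -1L in the first
  // metric field marks "metrics not recorded yet", as in TaskDataWrapper.
  final class Task(
      val duration: Long,
      val executorDeserializeTime: Long,
      val executorRunTime: Long,
      val resultSerializationTime: Long) {

    def hasMetrics: Boolean = executorDeserializeTime >= 0L

    // Same shape as AppStatusUtils.schedulerDelay, with gettingResultTime taken as 0 here.
    def schedulerDelay: Long =
      if (hasMetrics) {
        math.max(0L, duration - executorRunTime - executorDeserializeTime -
          resultSerializationTime)
      } else {
        -1L
      }
  }

  def main(args: Array[String]): Unit = {
    val finished = new Task(duration = 1000L, executorDeserializeTime = 50L,
      executorRunTime = 800L, resultSerializationTime = 20L)
    val pending = new Task(duration = 300L, executorDeserializeTime = -1L,
      executorRunTime = -1L, resultSerializationTime = -1L)
    println(finished.schedulerDelay) // 130
    println(pending.schedulerDelay)  // -1: metrics not recorded
  }
}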
+ @KVIndexParam(value = TaskIndexNames.DESER_TIME, parent = TaskIndexNames.STAGE) + val executorDeserializeTime: Long, + @KVIndexParam(value = TaskIndexNames.DESER_CPU_TIME, parent = TaskIndexNames.STAGE) + val executorDeserializeCpuTime: Long, + @KVIndexParam(value = TaskIndexNames.EXEC_RUN_TIME, parent = TaskIndexNames.STAGE) + val executorRunTime: Long, + @KVIndexParam(value = TaskIndexNames.EXEC_CPU_TIME, parent = TaskIndexNames.STAGE) + val executorCpuTime: Long, + @KVIndexParam(value = TaskIndexNames.RESULT_SIZE, parent = TaskIndexNames.STAGE) + val resultSize: Long, + @KVIndexParam(value = TaskIndexNames.GC_TIME, parent = TaskIndexNames.STAGE) + val jvmGcTime: Long, + @KVIndexParam(value = TaskIndexNames.SER_TIME, parent = TaskIndexNames.STAGE) + val resultSerializationTime: Long, + @KVIndexParam(value = TaskIndexNames.MEM_SPILL, parent = TaskIndexNames.STAGE) + val memoryBytesSpilled: Long, + @KVIndexParam(value = TaskIndexNames.DISK_SPILL, parent = TaskIndexNames.STAGE) + val diskBytesSpilled: Long, + @KVIndexParam(value = TaskIndexNames.PEAK_MEM, parent = TaskIndexNames.STAGE) + val peakExecutionMemory: Long, + @KVIndexParam(value = TaskIndexNames.INPUT_SIZE, parent = TaskIndexNames.STAGE) + val inputBytesRead: Long, + @KVIndexParam(value = TaskIndexNames.INPUT_RECORDS, parent = TaskIndexNames.STAGE) + val inputRecordsRead: Long, + @KVIndexParam(value = TaskIndexNames.OUTPUT_SIZE, parent = TaskIndexNames.STAGE) + val outputBytesWritten: Long, + @KVIndexParam(value = TaskIndexNames.OUTPUT_RECORDS, parent = TaskIndexNames.STAGE) + val outputRecordsWritten: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_REMOTE_BLOCKS, parent = TaskIndexNames.STAGE) + val shuffleRemoteBlocksFetched: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_LOCAL_BLOCKS, parent = TaskIndexNames.STAGE) + val shuffleLocalBlocksFetched: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_READ_TIME, parent = TaskIndexNames.STAGE) + val shuffleFetchWaitTime: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_REMOTE_READS, parent = TaskIndexNames.STAGE) + val shuffleRemoteBytesRead: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_REMOTE_READS_TO_DISK, + parent = TaskIndexNames.STAGE) + val shuffleRemoteBytesReadToDisk: Long, + val shuffleLocalBytesRead: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_READ_RECORDS, parent = TaskIndexNames.STAGE) + val shuffleRecordsRead: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_WRITE_SIZE, parent = TaskIndexNames.STAGE) + val shuffleBytesWritten: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_WRITE_TIME, parent = TaskIndexNames.STAGE) + val shuffleWriteTime: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_WRITE_RECORDS, parent = TaskIndexNames.STAGE) + val shuffleRecordsWritten: Long, + val stageId: Int, val stageAttemptId: Int) { - @JsonIgnore @KVIndex - def id: Long = info.taskId + def hasMetrics: Boolean = executorDeserializeTime >= 0 + + def toApi: TaskData = { + val metrics = if (hasMetrics) { + Some(new TaskMetrics( + executorDeserializeTime, + executorDeserializeCpuTime, + executorRunTime, + executorCpuTime, + resultSize, + jvmGcTime, + resultSerializationTime, + memoryBytesSpilled, + diskBytesSpilled, + peakExecutionMemory, + new InputMetrics( + inputBytesRead, + inputRecordsRead), + new OutputMetrics( + outputBytesWritten, + outputRecordsWritten), + new ShuffleReadMetrics( + shuffleRemoteBlocksFetched, + shuffleLocalBlocksFetched, + shuffleFetchWaitTime, + shuffleRemoteBytesRead, + shuffleRemoteBytesReadToDisk, + 
shuffleLocalBytesRead, + shuffleRecordsRead), + new ShuffleWriteMetrics( + shuffleBytesWritten, + shuffleWriteTime, + shuffleRecordsWritten))) + } else { + None + } - @JsonIgnore @KVIndex("stage") - def stage: Array[Int] = Array(stageId, stageAttemptId) + new TaskData( + taskId, + index, + attempt, + new Date(launchTime), + if (resultFetchStart > 0L) Some(new Date(resultFetchStart)) else None, + if (duration > 0L) Some(duration) else None, + executorId, + host, + status, + taskLocality, + speculative, + accumulatorUpdates, + errorMessage, + metrics) + } + + @JsonIgnore @KVIndex(TaskIndexNames.STAGE) + private def stage: Array[Int] = Array(stageId, stageAttemptId) - @JsonIgnore @KVIndex("runtime") - def runtime: Array[AnyRef] = { - val _runtime = info.taskMetrics.map(_.executorRunTime).getOrElse(-1L) - Array(stageId: JInteger, stageAttemptId: JInteger, _runtime: JLong) + @JsonIgnore @KVIndex(value = TaskIndexNames.SCHEDULER_DELAY, parent = TaskIndexNames.STAGE) + def schedulerDelay: Long = { + if (hasMetrics) { + AppStatusUtils.schedulerDelay(launchTime, resultFetchStart, duration, executorDeserializeTime, + resultSerializationTime, executorRunTime) + } else { + -1L + } } - @JsonIgnore @KVIndex("startTime") - def startTime: Array[AnyRef] = { - Array(stageId: JInteger, stageAttemptId: JInteger, info.launchTime.getTime(): JLong) + @JsonIgnore @KVIndex(value = TaskIndexNames.GETTING_RESULT_TIME, parent = TaskIndexNames.STAGE) + def gettingResultTime: Long = { + if (hasMetrics) { + AppStatusUtils.gettingResultTime(launchTime, resultFetchStart, duration) + } else { + -1L + } } - @JsonIgnore @KVIndex("active") - def active: Boolean = info.duration.isEmpty + /** + * Sorting by accumulators is a little weird, and the previous behavior would generate + * insanely long keys in the index. So this implementation just considers the first + * accumulator and its String representation. + */ + @JsonIgnore @KVIndex(value = TaskIndexNames.ACCUMULATORS, parent = TaskIndexNames.STAGE) + private def accumulators: String = { + if (accumulatorUpdates.nonEmpty) { + val acc = accumulatorUpdates.head + s"${acc.name}:${acc.value}" + } else { + "" + } + } + + @JsonIgnore @KVIndex(value = TaskIndexNames.SHUFFLE_TOTAL_READS, parent = TaskIndexNames.STAGE) + private def shuffleTotalReads: Long = { + if (hasMetrics) { + shuffleLocalBytesRead + shuffleRemoteBytesRead + } else { + -1L + } + } + + @JsonIgnore @KVIndex(value = TaskIndexNames.SHUFFLE_TOTAL_BLOCKS, parent = TaskIndexNames.STAGE) + private def shuffleTotalBlocks: Long = { + if (hasMetrics) { + shuffleLocalBlocksFetched + shuffleRemoteBlocksFetched + } else { + -1L + } + } + + @JsonIgnore @KVIndex(value = TaskIndexNames.ERROR, parent = TaskIndexNames.STAGE) + private def error: String = if (errorMessage.isDefined) errorMessage.get else "" } @@ -134,10 +356,13 @@ private[spark] class ExecutorStageSummaryWrapper( val info: ExecutorStageSummary) { @JsonIgnore @KVIndex - val id: Array[Any] = Array(stageId, stageAttemptId, executorId) + private val _id: Array[Any] = Array(stageId, stageAttemptId, executorId) @JsonIgnore @KVIndex("stage") - private[this] val stage: Array[Int] = Array(stageId, stageAttemptId) + private def stage: Array[Int] = Array(stageId, stageAttemptId) + + @JsonIgnore + def id: Array[Any] = _id } @@ -203,3 +428,53 @@ private[spark] class AppSummary( def id: String = classOf[AppSummary].getName() } + +/** + * A cached view of a specific quantile for one stage attempt's metrics. 
+ */ +private[spark] class CachedQuantile( + val stageId: Int, + val stageAttemptId: Int, + val quantile: String, + val taskCount: Long, + + // The following fields are an exploded view of a single entry for TaskMetricDistributions. + val executorDeserializeTime: Double, + val executorDeserializeCpuTime: Double, + val executorRunTime: Double, + val executorCpuTime: Double, + val resultSize: Double, + val jvmGcTime: Double, + val resultSerializationTime: Double, + val gettingResultTime: Double, + val schedulerDelay: Double, + val peakExecutionMemory: Double, + val memoryBytesSpilled: Double, + val diskBytesSpilled: Double, + + val bytesRead: Double, + val recordsRead: Double, + + val bytesWritten: Double, + val recordsWritten: Double, + + val shuffleReadBytes: Double, + val shuffleRecordsRead: Double, + val shuffleRemoteBlocksFetched: Double, + val shuffleLocalBlocksFetched: Double, + val shuffleFetchWaitTime: Double, + val shuffleRemoteBytesRead: Double, + val shuffleRemoteBytesReadToDisk: Double, + val shuffleTotalBlocksFetched: Double, + + val shuffleWriteBytes: Double, + val shuffleWriteRecords: Double, + val shuffleWriteTime: Double) { + + @KVIndex @JsonIgnore + def id: Array[Any] = Array(stageId, stageAttemptId, quantile) + + @KVIndex("stage") @JsonIgnore + def stage: Array[Int] = Array(stageId, stageAttemptId) + +} diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala b/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala index 41d42b52430a5..95c12b1e73653 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala @@ -87,7 +87,9 @@ private[ui] class ExecutorTable(stage: StageData, store: AppStatusStore) { } private def createExecutorTable(stage: StageData) : Seq[Node] = { - stage.executorSummary.getOrElse(Map.empty).toSeq.sortBy(_._1).map { case (k, v) => + val executorSummary = store.executorSummary(stage.stageId, stage.attemptId) + + executorSummary.toSeq.sortBy(_._1).map { case (k, v) => val executor = store.asOption(store.executorSummary(k)) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala index 740f12e7d13d4..bf59152c8c0cd 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala @@ -201,7 +201,7 @@ private[ui] class JobPage(parent: JobsTab, store: AppStatusStore) extends WebUIP val stages = jobData.stageIds.map { stageId => // This could be empty if the listener hasn't received information about the // stage or if the stage information has been garbage collected - store.stageData(stageId).lastOption.getOrElse { + store.asOption(store.lastStageAttempt(stageId)).getOrElse { new v1.StageData( v1.StageStatus.PENDING, stageId, diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index 11a6a34344976..7c6e06cf183ba 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -19,6 +19,7 @@ package org.apache.spark.ui.jobs import java.net.URLEncoder import java.util.Date +import java.util.concurrent.TimeUnit import javax.servlet.http.HttpServletRequest import scala.collection.mutable.{HashMap, HashSet} @@ -29,15 +30,14 @@ import org.apache.commons.lang3.StringEscapeUtils import org.apache.spark.SparkConf import 
org.apache.spark.internal.config._ import org.apache.spark.scheduler.TaskLocality -import org.apache.spark.status.AppStatusStore +import org.apache.spark.status._ import org.apache.spark.status.api.v1._ import org.apache.spark.ui._ -import org.apache.spark.util.{Distribution, Utils} +import org.apache.spark.util.Utils /** Page showing statistics and task list for a given stage */ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends WebUIPage("stage") { import ApiHelper._ - import StagePage._ private val TIMELINE_LEGEND = {
@@ -67,17 +67,17 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We // if we find that it's okay. private val MAX_TIMELINE_TASKS = parent.conf.getInt("spark.ui.timeline.tasks.maximum", 1000) - private def getLocalitySummaryString(stageData: StageData, taskList: Seq[TaskData]): String = { - val localities = taskList.map(_.taskLocality) - val localityCounts = localities.groupBy(identity).mapValues(_.size) + private def getLocalitySummaryString(localitySummary: Map[String, Long]): String = { val names = Map( TaskLocality.PROCESS_LOCAL.toString() -> "Process local", TaskLocality.NODE_LOCAL.toString() -> "Node local", TaskLocality.RACK_LOCAL.toString() -> "Rack local", TaskLocality.ANY.toString() -> "Any") - val localityNamesAndCounts = localityCounts.toSeq.map { case (locality, count) => - s"${names(locality)}: $count" - } + val localityNamesAndCounts = names.flatMap { case (key, name) => + localitySummary.get(key).map { count => + s"$name: $count" + } + }.toSeq localityNamesAndCounts.sorted.mkString("; ") } @@ -108,7 +108,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val stageHeader = s"Details for Stage $stageId (Attempt $stageAttemptId)" val stageData = parent.store - .asOption(parent.store.stageAttempt(stageId, stageAttemptId, details = true)) + .asOption(parent.store.stageAttempt(stageId, stageAttemptId, details = false)) .getOrElse { val content =
@@ -117,8 +117,11 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We return UIUtils.headerSparkPage(stageHeader, content, parent) } - val tasks = stageData.tasks.getOrElse(Map.empty).values.toSeq - if (tasks.isEmpty) { + val localitySummary = store.localitySummary(stageData.stageId, stageData.attemptId) + + val totalTasks = stageData.numActiveTasks + stageData.numCompleteTasks + + stageData.numFailedTasks + stageData.numKilledTasks + if (totalTasks == 0) { val content =

Summary Metrics

No tasks have started yet @@ -127,18 +130,14 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We return UIUtils.headerSparkPage(stageHeader, content, parent) } + val storedTasks = store.taskCount(stageData.stageId, stageData.attemptId) val numCompleted = stageData.numCompleteTasks - val totalTasks = stageData.numActiveTasks + stageData.numCompleteTasks + - stageData.numFailedTasks + stageData.numKilledTasks - val totalTasksNumStr = if (totalTasks == tasks.size) { + val totalTasksNumStr = if (totalTasks == storedTasks) { s"$totalTasks" } else { - s"$totalTasks, showing ${tasks.size}" + s"$totalTasks, showing ${storedTasks}" } - val externalAccumulables = stageData.accumulatorUpdates - val hasAccumulators = externalAccumulables.size > 0 - val summary =
    @@ -148,7 +147,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We
  • Locality Level Summary: - {getLocalitySummaryString(stageData, tasks)} + {getLocalitySummaryString(localitySummary)}
  • {if (hasInput(stageData)) {
  • @@ -266,7 +265,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val accumulableTable = UIUtils.listingTable( accumulableHeaders, accumulableRow, - externalAccumulables.toSeq) + stageData.accumulatorUpdates.toSeq) val page: Int = { // If the user has changed to a larger page size, then go to page 1 in order to avoid @@ -280,16 +279,9 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val currentTime = System.currentTimeMillis() val (taskTable, taskTableHTML) = try { val _taskTable = new TaskPagedTable( - parent.conf, + stageData, UIUtils.prependBaseUri(parent.basePath) + s"/stages/stage?id=${stageId}&attempt=${stageAttemptId}", - tasks, - hasAccumulators, - hasInput(stageData), - hasOutput(stageData), - hasShuffleRead(stageData), - hasShuffleWrite(stageData), - hasBytesSpilled(stageData), currentTime, pageSize = taskPageSize, sortColumn = taskSortColumn, @@ -320,217 +312,155 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We | } |}); """.stripMargin - } + } } - val taskIdsInPage = if (taskTable == null) Set.empty[Long] - else taskTable.dataSource.slicedTaskIds + val metricsSummary = store.taskSummary(stageData.stageId, stageData.attemptId, + Array(0, 0.25, 0.5, 0.75, 1.0)) - // Excludes tasks which failed and have incomplete metrics - val validTasks = tasks.filter(t => t.status == "SUCCESS" && t.taskMetrics.isDefined) - - val summaryTable: Option[Seq[Node]] = - if (validTasks.size == 0) { - None - } else { - def getDistributionQuantiles(data: Seq[Double]): IndexedSeq[Double] = { - Distribution(data).get.getQuantiles() - } - def getFormattedTimeQuantiles(times: Seq[Double]): Seq[Node] = { - getDistributionQuantiles(times).map { millis => - {UIUtils.formatDuration(millis.toLong)} - } - } - def getFormattedSizeQuantiles(data: Seq[Double]): Seq[Elem] = { - getDistributionQuantiles(data).map(d => {Utils.bytesToString(d.toLong)}) + val summaryTable = metricsSummary.map { metrics => + def timeQuantiles(data: IndexedSeq[Double]): Seq[Node] = { + data.map { millis => + {UIUtils.formatDuration(millis.toLong)} } + } - val deserializationTimes = validTasks.map { task => - task.taskMetrics.get.executorDeserializeTime.toDouble - } - val deserializationQuantiles = - - - Task Deserialization Time - - +: getFormattedTimeQuantiles(deserializationTimes) - - val serviceTimes = validTasks.map(_.taskMetrics.get.executorRunTime.toDouble) - val serviceQuantiles = Duration +: getFormattedTimeQuantiles(serviceTimes) - - val gcTimes = validTasks.map(_.taskMetrics.get.jvmGcTime.toDouble) - val gcQuantiles = - - GC Time - - +: getFormattedTimeQuantiles(gcTimes) - - val serializationTimes = validTasks.map(_.taskMetrics.get.resultSerializationTime.toDouble) - val serializationQuantiles = - - - Result Serialization Time - - +: getFormattedTimeQuantiles(serializationTimes) - - val gettingResultTimes = validTasks.map(getGettingResultTime(_, currentTime).toDouble) - val gettingResultQuantiles = - - - Getting Result Time - - +: - getFormattedTimeQuantiles(gettingResultTimes) - - val peakExecutionMemory = validTasks.map(_.taskMetrics.get.peakExecutionMemory.toDouble) - val peakExecutionMemoryQuantiles = { - - - Peak Execution Memory - - +: getFormattedSizeQuantiles(peakExecutionMemory) + def sizeQuantiles(data: IndexedSeq[Double]): Seq[Node] = { + data.map { size => + {Utils.bytesToString(size.toLong)} } + } - // The scheduler delay includes the network delay to send the task to the worker - // machine and to send back 
the result (but not the time to fetch the task result, - // if it needed to be fetched from the block manager on the worker). - val schedulerDelays = validTasks.map { task => - getSchedulerDelay(task, task.taskMetrics.get, currentTime).toDouble - } - val schedulerDelayTitle = Scheduler Delay - val schedulerDelayQuantiles = schedulerDelayTitle +: - getFormattedTimeQuantiles(schedulerDelays) - def getFormattedSizeQuantilesWithRecords(data: Seq[Double], records: Seq[Double]) - : Seq[Elem] = { - val recordDist = getDistributionQuantiles(records).iterator - getDistributionQuantiles(data).map(d => - {s"${Utils.bytesToString(d.toLong)} / ${recordDist.next().toLong}"} - ) + def sizeQuantilesWithRecords( + data: IndexedSeq[Double], + records: IndexedSeq[Double]) : Seq[Node] = { + data.zip(records).map { case (d, r) => + {s"${Utils.bytesToString(d.toLong)} / ${r.toLong}"} } + } - val inputSizes = validTasks.map(_.taskMetrics.get.inputMetrics.bytesRead.toDouble) - val inputRecords = validTasks.map(_.taskMetrics.get.inputMetrics.recordsRead.toDouble) - val inputQuantiles = Input Size / Records +: - getFormattedSizeQuantilesWithRecords(inputSizes, inputRecords) + def titleCell(title: String, tooltip: String): Seq[Node] = { + + + {title} + + + } - val outputSizes = validTasks.map(_.taskMetrics.get.outputMetrics.bytesWritten.toDouble) - val outputRecords = validTasks.map(_.taskMetrics.get.outputMetrics.recordsWritten.toDouble) - val outputQuantiles = Output Size / Records +: - getFormattedSizeQuantilesWithRecords(outputSizes, outputRecords) + def simpleTitleCell(title: String): Seq[Node] = {title} - val shuffleReadBlockedTimes = validTasks.map { task => - task.taskMetrics.get.shuffleReadMetrics.fetchWaitTime.toDouble - } - val shuffleReadBlockedQuantiles = - - - Shuffle Read Blocked Time - - +: - getFormattedTimeQuantiles(shuffleReadBlockedTimes) - - val shuffleReadTotalSizes = validTasks.map { task => - totalBytesRead(task.taskMetrics.get.shuffleReadMetrics).toDouble - } - val shuffleReadTotalRecords = validTasks.map { task => - task.taskMetrics.get.shuffleReadMetrics.recordsRead.toDouble - } - val shuffleReadTotalQuantiles = - - - Shuffle Read Size / Records - - +: - getFormattedSizeQuantilesWithRecords(shuffleReadTotalSizes, shuffleReadTotalRecords) - - val shuffleReadRemoteSizes = validTasks.map { task => - task.taskMetrics.get.shuffleReadMetrics.remoteBytesRead.toDouble - } - val shuffleReadRemoteQuantiles = - - - Shuffle Remote Reads - - +: - getFormattedSizeQuantiles(shuffleReadRemoteSizes) - - val shuffleWriteSizes = validTasks.map { task => - task.taskMetrics.get.shuffleWriteMetrics.bytesWritten.toDouble - } + val deserializationQuantiles = titleCell("Task Deserialization Time", + ToolTips.TASK_DESERIALIZATION_TIME) ++ timeQuantiles(metrics.executorDeserializeTime) - val shuffleWriteRecords = validTasks.map { task => - task.taskMetrics.get.shuffleWriteMetrics.recordsWritten.toDouble - } + val serviceQuantiles = simpleTitleCell("Duration") ++ timeQuantiles(metrics.executorRunTime) - val shuffleWriteQuantiles = Shuffle Write Size / Records +: - getFormattedSizeQuantilesWithRecords(shuffleWriteSizes, shuffleWriteRecords) + val gcQuantiles = titleCell("GC Time", ToolTips.GC_TIME) ++ timeQuantiles(metrics.jvmGcTime) - val memoryBytesSpilledSizes = validTasks.map(_.taskMetrics.get.memoryBytesSpilled.toDouble) - val memoryBytesSpilledQuantiles = Shuffle spill (memory) +: - getFormattedSizeQuantiles(memoryBytesSpilledSizes) + val serializationQuantiles = titleCell("Result Serialization Time", + 
ToolTips.RESULT_SERIALIZATION_TIME) ++ timeQuantiles(metrics.resultSerializationTime) - val diskBytesSpilledSizes = validTasks.map(_.taskMetrics.get.diskBytesSpilled.toDouble) - val diskBytesSpilledQuantiles = Shuffle spill (disk) +: - getFormattedSizeQuantiles(diskBytesSpilledSizes) + val gettingResultQuantiles = titleCell("Getting Result Time", ToolTips.GETTING_RESULT_TIME) ++ + timeQuantiles(metrics.gettingResultTime) - val listings: Seq[Seq[Node]] = Seq( - {serviceQuantiles}, - {schedulerDelayQuantiles}, - - {deserializationQuantiles} - - {gcQuantiles}, - - {serializationQuantiles} - , - {gettingResultQuantiles}, - - {peakExecutionMemoryQuantiles} - , - if (hasInput(stageData)) {inputQuantiles} else Nil, - if (hasOutput(stageData)) {outputQuantiles} else Nil, - if (hasShuffleRead(stageData)) { - - {shuffleReadBlockedQuantiles} - - {shuffleReadTotalQuantiles} - - {shuffleReadRemoteQuantiles} - - } else { - Nil - }, - if (hasShuffleWrite(stageData)) {shuffleWriteQuantiles} else Nil, - if (hasBytesSpilled(stageData)) {memoryBytesSpilledQuantiles} else Nil, - if (hasBytesSpilled(stageData)) {diskBytesSpilledQuantiles} else Nil) - - val quantileHeaders = Seq("Metric", "Min", "25th percentile", - "Median", "75th percentile", "Max") - // The summary table does not use CSS to stripe rows, which doesn't work with hidden - // rows (instead, JavaScript in table.js is used to stripe the non-hidden rows). - Some(UIUtils.listingTable( - quantileHeaders, - identity[Seq[Node]], - listings, - fixedWidth = true, - id = Some("task-summary-table"), - stripeRowsWithCss = false)) + val peakExecutionMemoryQuantiles = titleCell("Peak Execution Memory", + ToolTips.PEAK_EXECUTION_MEMORY) ++ sizeQuantiles(metrics.peakExecutionMemory) + + // The scheduler delay includes the network delay to send the task to the worker + // machine and to send back the result (but not the time to fetch the task result, + // if it needed to be fetched from the block manager on the worker). 
+ val schedulerDelayQuantiles = titleCell("Scheduler Delay", ToolTips.SCHEDULER_DELAY) ++ + timeQuantiles(metrics.schedulerDelay) + + def inputQuantiles: Seq[Node] = { + simpleTitleCell("Input Size / Records") ++ + sizeQuantilesWithRecords(metrics.inputMetrics.bytesRead, metrics.inputMetrics.recordsRead) + } + + def outputQuantiles: Seq[Node] = { + simpleTitleCell("Output Size / Records") ++ + sizeQuantilesWithRecords(metrics.outputMetrics.bytesWritten, + metrics.outputMetrics.recordsWritten) } + def shuffleReadBlockedQuantiles: Seq[Node] = { + titleCell("Shuffle Read Blocked Time", ToolTips.SHUFFLE_READ_BLOCKED_TIME) ++ + timeQuantiles(metrics.shuffleReadMetrics.fetchWaitTime) + } + + def shuffleReadTotalQuantiles: Seq[Node] = { + titleCell("Shuffle Read Size / Records", ToolTips.SHUFFLE_READ) ++ + sizeQuantilesWithRecords(metrics.shuffleReadMetrics.readBytes, + metrics.shuffleReadMetrics.readRecords) + } + + def shuffleReadRemoteQuantiles: Seq[Node] = { + titleCell("Shuffle Remote Reads", ToolTips.SHUFFLE_READ_REMOTE_SIZE) ++ + sizeQuantiles(metrics.shuffleReadMetrics.remoteBytesRead) + } + + def shuffleWriteQuantiles: Seq[Node] = { + simpleTitleCell("Shuffle Write Size / Records") ++ + sizeQuantilesWithRecords(metrics.shuffleWriteMetrics.writeBytes, + metrics.shuffleWriteMetrics.writeRecords) + } + + def memoryBytesSpilledQuantiles: Seq[Node] = { + simpleTitleCell("Shuffle spill (memory)") ++ sizeQuantiles(metrics.memoryBytesSpilled) + } + + def diskBytesSpilledQuantiles: Seq[Node] = { + simpleTitleCell("Shuffle spill (disk)") ++ sizeQuantiles(metrics.diskBytesSpilled) + } + + val listings: Seq[Seq[Node]] = Seq( + {serviceQuantiles}, + {schedulerDelayQuantiles}, + + {deserializationQuantiles} + + {gcQuantiles}, + + {serializationQuantiles} + , + {gettingResultQuantiles}, + + {peakExecutionMemoryQuantiles} + , + if (hasInput(stageData)) {inputQuantiles} else Nil, + if (hasOutput(stageData)) {outputQuantiles} else Nil, + if (hasShuffleRead(stageData)) { + + {shuffleReadBlockedQuantiles} + + {shuffleReadTotalQuantiles} + + {shuffleReadRemoteQuantiles} + + } else { + Nil + }, + if (hasShuffleWrite(stageData)) {shuffleWriteQuantiles} else Nil, + if (hasBytesSpilled(stageData)) {memoryBytesSpilledQuantiles} else Nil, + if (hasBytesSpilled(stageData)) {diskBytesSpilledQuantiles} else Nil) + + val quantileHeaders = Seq("Metric", "Min", "25th percentile", "Median", "75th percentile", + "Max") + // The summary table does not use CSS to stripe rows, which doesn't work with hidden + // rows (instead, JavaScript in table.js is used to stripe the non-hidden rows). + UIUtils.listingTable( + quantileHeaders, + identity[Seq[Node]], + listings, + fixedWidth = true, + id = Some("task-summary-table"), + stripeRowsWithCss = false) + } + val executorTable = new ExecutorTable(stageData, parent.store) val maybeAccumulableTable: Seq[Node] = - if (hasAccumulators) {

    Accumulators

    ++ accumulableTable } else Seq() + if (hasAccumulators(stageData)) {

    Accumulators

    ++ accumulableTable } else Seq() val aggMetrics = taskIdsInPage.contains(t.taskId) }, + Option(taskTable).map(_.dataSource.tasks).getOrElse(Nil), currentTime) ++

    Summary Metrics for {numCompleted} Completed Tasks

    ++
    {summaryTable.getOrElse("No tasks have reported metrics yet.")}
    ++ @@ -593,10 +523,9 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val serializationTimeProportion = toProportion(serializationTime) val deserializationTime = metricsOpt.map(_.executorDeserializeTime).getOrElse(0L) val deserializationTimeProportion = toProportion(deserializationTime) - val gettingResultTime = getGettingResultTime(taskInfo, currentTime) + val gettingResultTime = AppStatusUtils.gettingResultTime(taskInfo) val gettingResultTimeProportion = toProportion(gettingResultTime) - val schedulerDelay = - metricsOpt.map(getSchedulerDelay(taskInfo, _, currentTime)).getOrElse(0L) + val schedulerDelay = AppStatusUtils.schedulerDelay(taskInfo) val schedulerDelayProportion = toProportion(schedulerDelay) val executorOverhead = serializationTime + deserializationTime @@ -708,7 +637,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We { if (MAX_TIMELINE_TASKS < tasks.size) { - This stage has more than the maximum number of tasks that can be shown in the + This page has more than the maximum number of tasks that can be shown in the visualization! Only the most recent {MAX_TIMELINE_TASKS} tasks (of {tasks.size} total) are shown. @@ -733,402 +662,49 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We } -private[ui] object StagePage { - private[ui] def getGettingResultTime(info: TaskData, currentTime: Long): Long = { - info.resultFetchStart match { - case Some(start) => - info.duration match { - case Some(duration) => - info.launchTime.getTime() + duration - start.getTime() - - case _ => - currentTime - start.getTime() - } - - case _ => - 0L - } - } - - private[ui] def getSchedulerDelay( - info: TaskData, - metrics: TaskMetrics, - currentTime: Long): Long = { - info.duration match { - case Some(duration) => - val executorOverhead = metrics.executorDeserializeTime + metrics.resultSerializationTime - math.max( - 0, - duration - metrics.executorRunTime - executorOverhead - - getGettingResultTime(info, currentTime)) - - case _ => - // The task is still running and the metrics like executorRunTime are not available. - 0L - } - } - -} - -private[ui] case class TaskTableRowInputData(inputSortable: Long, inputReadable: String) - -private[ui] case class TaskTableRowOutputData(outputSortable: Long, outputReadable: String) - -private[ui] case class TaskTableRowShuffleReadData( - shuffleReadBlockedTimeSortable: Long, - shuffleReadBlockedTimeReadable: String, - shuffleReadSortable: Long, - shuffleReadReadable: String, - shuffleReadRemoteSortable: Long, - shuffleReadRemoteReadable: String) - -private[ui] case class TaskTableRowShuffleWriteData( - writeTimeSortable: Long, - writeTimeReadable: String, - shuffleWriteSortable: Long, - shuffleWriteReadable: String) - -private[ui] case class TaskTableRowBytesSpilledData( - memoryBytesSpilledSortable: Long, - memoryBytesSpilledReadable: String, - diskBytesSpilledSortable: Long, - diskBytesSpilledReadable: String) - -/** - * Contains all data that needs for sorting and generating HTML. Using this one rather than - * TaskData to avoid creating duplicate contents during sorting the data. 
- */ -private[ui] class TaskTableRowData( - val index: Int, - val taskId: Long, - val attempt: Int, - val speculative: Boolean, - val status: String, - val taskLocality: String, - val executorId: String, - val host: String, - val launchTime: Long, - val duration: Long, - val formatDuration: String, - val schedulerDelay: Long, - val taskDeserializationTime: Long, - val gcTime: Long, - val serializationTime: Long, - val gettingResultTime: Long, - val peakExecutionMemoryUsed: Long, - val accumulators: Option[String], // HTML - val input: Option[TaskTableRowInputData], - val output: Option[TaskTableRowOutputData], - val shuffleRead: Option[TaskTableRowShuffleReadData], - val shuffleWrite: Option[TaskTableRowShuffleWriteData], - val bytesSpilled: Option[TaskTableRowBytesSpilledData], - val error: String, - val logs: Map[String, String]) - private[ui] class TaskDataSource( - tasks: Seq[TaskData], - hasAccumulators: Boolean, - hasInput: Boolean, - hasOutput: Boolean, - hasShuffleRead: Boolean, - hasShuffleWrite: Boolean, - hasBytesSpilled: Boolean, + stage: StageData, currentTime: Long, pageSize: Int, sortColumn: String, desc: Boolean, - store: AppStatusStore) extends PagedDataSource[TaskTableRowData](pageSize) { - import StagePage._ + store: AppStatusStore) extends PagedDataSource[TaskData](pageSize) { + import ApiHelper._ // Keep an internal cache of executor log maps so that long task lists render faster. private val executorIdToLogs = new HashMap[String, Map[String, String]]() - // Convert TaskData to TaskTableRowData which contains the final contents to show in the table - // so that we can avoid creating duplicate contents during sorting the data - private val data = tasks.map(taskRow).sorted(ordering(sortColumn, desc)) - - private var _slicedTaskIds: Set[Long] = _ + private var _tasksToShow: Seq[TaskData] = null - override def dataSize: Int = data.size + override def dataSize: Int = stage.numCompleteTasks + stage.numFailedTasks + stage.numKilledTasks - override def sliceData(from: Int, to: Int): Seq[TaskTableRowData] = { - val r = data.slice(from, to) - _slicedTaskIds = r.map(_.taskId).toSet - r - } - - def slicedTaskIds: Set[Long] = _slicedTaskIds - - private def taskRow(info: TaskData): TaskTableRowData = { - val metrics = info.taskMetrics - val duration = info.duration.getOrElse(1L) - val formatDuration = info.duration.map(d => UIUtils.formatDuration(d)).getOrElse("") - val schedulerDelay = metrics.map(getSchedulerDelay(info, _, currentTime)).getOrElse(0L) - val gcTime = metrics.map(_.jvmGcTime).getOrElse(0L) - val taskDeserializationTime = metrics.map(_.executorDeserializeTime).getOrElse(0L) - val serializationTime = metrics.map(_.resultSerializationTime).getOrElse(0L) - val gettingResultTime = getGettingResultTime(info, currentTime) - - val externalAccumulableReadable = info.accumulatorUpdates.map { acc => - StringEscapeUtils.escapeHtml4(s"${acc.name}: ${acc.update}") + override def sliceData(from: Int, to: Int): Seq[TaskData] = { + if (_tasksToShow == null) { + _tasksToShow = store.taskList(stage.stageId, stage.attemptId, from, to - from, + indexName(sortColumn), !desc) } - val peakExecutionMemoryUsed = metrics.map(_.peakExecutionMemory).getOrElse(0L) - - val maybeInput = metrics.map(_.inputMetrics) - val inputSortable = maybeInput.map(_.bytesRead).getOrElse(0L) - val inputReadable = maybeInput - .map(m => s"${Utils.bytesToString(m.bytesRead)}") - .getOrElse("") - val inputRecords = maybeInput.map(_.recordsRead.toString).getOrElse("") - - val maybeOutput = 
metrics.map(_.outputMetrics) - val outputSortable = maybeOutput.map(_.bytesWritten).getOrElse(0L) - val outputReadable = maybeOutput - .map(m => s"${Utils.bytesToString(m.bytesWritten)}") - .getOrElse("") - val outputRecords = maybeOutput.map(_.recordsWritten.toString).getOrElse("") - - val maybeShuffleRead = metrics.map(_.shuffleReadMetrics) - val shuffleReadBlockedTimeSortable = maybeShuffleRead.map(_.fetchWaitTime).getOrElse(0L) - val shuffleReadBlockedTimeReadable = - maybeShuffleRead.map(ms => UIUtils.formatDuration(ms.fetchWaitTime)).getOrElse("") - - val totalShuffleBytes = maybeShuffleRead.map(ApiHelper.totalBytesRead) - val shuffleReadSortable = totalShuffleBytes.getOrElse(0L) - val shuffleReadReadable = totalShuffleBytes.map(Utils.bytesToString).getOrElse("") - val shuffleReadRecords = maybeShuffleRead.map(_.recordsRead.toString).getOrElse("") - - val remoteShuffleBytes = maybeShuffleRead.map(_.remoteBytesRead) - val shuffleReadRemoteSortable = remoteShuffleBytes.getOrElse(0L) - val shuffleReadRemoteReadable = remoteShuffleBytes.map(Utils.bytesToString).getOrElse("") - - val maybeShuffleWrite = metrics.map(_.shuffleWriteMetrics) - val shuffleWriteSortable = maybeShuffleWrite.map(_.bytesWritten).getOrElse(0L) - val shuffleWriteReadable = maybeShuffleWrite - .map(m => s"${Utils.bytesToString(m.bytesWritten)}").getOrElse("") - val shuffleWriteRecords = maybeShuffleWrite - .map(_.recordsWritten.toString).getOrElse("") - - val maybeWriteTime = metrics.map(_.shuffleWriteMetrics.writeTime) - val writeTimeSortable = maybeWriteTime.getOrElse(0L) - val writeTimeReadable = maybeWriteTime.map(t => t / (1000 * 1000)).map { ms => - if (ms == 0) "" else UIUtils.formatDuration(ms) - }.getOrElse("") - - val maybeMemoryBytesSpilled = metrics.map(_.memoryBytesSpilled) - val memoryBytesSpilledSortable = maybeMemoryBytesSpilled.getOrElse(0L) - val memoryBytesSpilledReadable = - maybeMemoryBytesSpilled.map(Utils.bytesToString).getOrElse("") - - val maybeDiskBytesSpilled = metrics.map(_.diskBytesSpilled) - val diskBytesSpilledSortable = maybeDiskBytesSpilled.getOrElse(0L) - val diskBytesSpilledReadable = maybeDiskBytesSpilled.map(Utils.bytesToString).getOrElse("") - - val input = - if (hasInput) { - Some(TaskTableRowInputData(inputSortable, s"$inputReadable / $inputRecords")) - } else { - None - } - - val output = - if (hasOutput) { - Some(TaskTableRowOutputData(outputSortable, s"$outputReadable / $outputRecords")) - } else { - None - } - - val shuffleRead = - if (hasShuffleRead) { - Some(TaskTableRowShuffleReadData( - shuffleReadBlockedTimeSortable, - shuffleReadBlockedTimeReadable, - shuffleReadSortable, - s"$shuffleReadReadable / $shuffleReadRecords", - shuffleReadRemoteSortable, - shuffleReadRemoteReadable - )) - } else { - None - } - - val shuffleWrite = - if (hasShuffleWrite) { - Some(TaskTableRowShuffleWriteData( - writeTimeSortable, - writeTimeReadable, - shuffleWriteSortable, - s"$shuffleWriteReadable / $shuffleWriteRecords" - )) - } else { - None - } - - val bytesSpilled = - if (hasBytesSpilled) { - Some(TaskTableRowBytesSpilledData( - memoryBytesSpilledSortable, - memoryBytesSpilledReadable, - diskBytesSpilledSortable, - diskBytesSpilledReadable - )) - } else { - None - } - - new TaskTableRowData( - info.index, - info.taskId, - info.attempt, - info.speculative, - info.status, - info.taskLocality.toString, - info.executorId, - info.host, - info.launchTime.getTime(), - duration, - formatDuration, - schedulerDelay, - taskDeserializationTime, - gcTime, - serializationTime, - gettingResultTime, - 
peakExecutionMemoryUsed, - if (hasAccumulators) Some(externalAccumulableReadable.mkString("
    ")) else None, - input, - output, - shuffleRead, - shuffleWrite, - bytesSpilled, - info.errorMessage.getOrElse(""), - executorLogs(info.executorId)) + _tasksToShow } - private def executorLogs(id: String): Map[String, String] = { + def tasks: Seq[TaskData] = _tasksToShow + + def executorLogs(id: String): Map[String, String] = { executorIdToLogs.getOrElseUpdate(id, store.asOption(store.executorSummary(id)).map(_.executorLogs).getOrElse(Map.empty)) } - /** - * Return Ordering according to sortColumn and desc - */ - private def ordering(sortColumn: String, desc: Boolean): Ordering[TaskTableRowData] = { - val ordering: Ordering[TaskTableRowData] = sortColumn match { - case "Index" => Ordering.by(_.index) - case "ID" => Ordering.by(_.taskId) - case "Attempt" => Ordering.by(_.attempt) - case "Status" => Ordering.by(_.status) - case "Locality Level" => Ordering.by(_.taskLocality) - case "Executor ID" => Ordering.by(_.executorId) - case "Host" => Ordering.by(_.host) - case "Launch Time" => Ordering.by(_.launchTime) - case "Duration" => Ordering.by(_.duration) - case "Scheduler Delay" => Ordering.by(_.schedulerDelay) - case "Task Deserialization Time" => Ordering.by(_.taskDeserializationTime) - case "GC Time" => Ordering.by(_.gcTime) - case "Result Serialization Time" => Ordering.by(_.serializationTime) - case "Getting Result Time" => Ordering.by(_.gettingResultTime) - case "Peak Execution Memory" => Ordering.by(_.peakExecutionMemoryUsed) - case "Accumulators" => - if (hasAccumulators) { - Ordering.by(_.accumulators.get) - } else { - throw new IllegalArgumentException( - "Cannot sort by Accumulators because of no accumulators") - } - case "Input Size / Records" => - if (hasInput) { - Ordering.by(_.input.get.inputSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Input Size / Records because of no inputs") - } - case "Output Size / Records" => - if (hasOutput) { - Ordering.by(_.output.get.outputSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Output Size / Records because of no outputs") - } - // ShuffleRead - case "Shuffle Read Blocked Time" => - if (hasShuffleRead) { - Ordering.by(_.shuffleRead.get.shuffleReadBlockedTimeSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Read Blocked Time because of no shuffle reads") - } - case "Shuffle Read Size / Records" => - if (hasShuffleRead) { - Ordering.by(_.shuffleRead.get.shuffleReadSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Read Size / Records because of no shuffle reads") - } - case "Shuffle Remote Reads" => - if (hasShuffleRead) { - Ordering.by(_.shuffleRead.get.shuffleReadRemoteSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Remote Reads because of no shuffle reads") - } - // ShuffleWrite - case "Write Time" => - if (hasShuffleWrite) { - Ordering.by(_.shuffleWrite.get.writeTimeSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Write Time because of no shuffle writes") - } - case "Shuffle Write Size / Records" => - if (hasShuffleWrite) { - Ordering.by(_.shuffleWrite.get.shuffleWriteSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Write Size / Records because of no shuffle writes") - } - // BytesSpilled - case "Shuffle Spill (Memory)" => - if (hasBytesSpilled) { - Ordering.by(_.bytesSpilled.get.memoryBytesSpilledSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Spill (Memory) because of no 
spills") - } - case "Shuffle Spill (Disk)" => - if (hasBytesSpilled) { - Ordering.by(_.bytesSpilled.get.diskBytesSpilledSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Spill (Disk) because of no spills") - } - case "Errors" => Ordering.by(_.error) - case unknownColumn => throw new IllegalArgumentException(s"Unknown column: $unknownColumn") - } - if (desc) { - ordering.reverse - } else { - ordering - } - } - } private[ui] class TaskPagedTable( - conf: SparkConf, + stage: StageData, basePath: String, - data: Seq[TaskData], - hasAccumulators: Boolean, - hasInput: Boolean, - hasOutput: Boolean, - hasShuffleRead: Boolean, - hasShuffleWrite: Boolean, - hasBytesSpilled: Boolean, currentTime: Long, pageSize: Int, sortColumn: String, desc: Boolean, - store: AppStatusStore) extends PagedTable[TaskTableRowData] { + store: AppStatusStore) extends PagedTable[TaskData] { + + import ApiHelper._ override def tableId: String = "task-table" @@ -1142,13 +718,7 @@ private[ui] class TaskPagedTable( override def pageNumberFormField: String = "task.page" override val dataSource: TaskDataSource = new TaskDataSource( - data, - hasAccumulators, - hasInput, - hasOutput, - hasShuffleRead, - hasShuffleWrite, - hasBytesSpilled, + stage, currentTime, pageSize, sortColumn, @@ -1180,22 +750,22 @@ private[ui] class TaskPagedTable( ("Result Serialization Time", TaskDetailsClassNames.RESULT_SERIALIZATION_TIME), ("Getting Result Time", TaskDetailsClassNames.GETTING_RESULT_TIME), ("Peak Execution Memory", TaskDetailsClassNames.PEAK_EXECUTION_MEMORY)) ++ - {if (hasAccumulators) Seq(("Accumulators", "")) else Nil} ++ - {if (hasInput) Seq(("Input Size / Records", "")) else Nil} ++ - {if (hasOutput) Seq(("Output Size / Records", "")) else Nil} ++ - {if (hasShuffleRead) { + {if (hasAccumulators(stage)) Seq(("Accumulators", "")) else Nil} ++ + {if (hasInput(stage)) Seq(("Input Size / Records", "")) else Nil} ++ + {if (hasOutput(stage)) Seq(("Output Size / Records", "")) else Nil} ++ + {if (hasShuffleRead(stage)) { Seq(("Shuffle Read Blocked Time", TaskDetailsClassNames.SHUFFLE_READ_BLOCKED_TIME), ("Shuffle Read Size / Records", ""), ("Shuffle Remote Reads", TaskDetailsClassNames.SHUFFLE_READ_REMOTE_SIZE)) } else { Nil }} ++ - {if (hasShuffleWrite) { + {if (hasShuffleWrite(stage)) { Seq(("Write Time", ""), ("Shuffle Write Size / Records", "")) } else { Nil }} ++ - {if (hasBytesSpilled) { + {if (hasBytesSpilled(stage)) { Seq(("Shuffle Spill (Memory)", ""), ("Shuffle Spill (Disk)", "")) } else { Nil @@ -1237,7 +807,17 @@ private[ui] class TaskPagedTable( {headerRow} } - def row(task: TaskTableRowData): Seq[Node] = { + def row(task: TaskData): Seq[Node] = { + def formatDuration(value: Option[Long], hideZero: Boolean = false): String = { + value.map { v => + if (v > 0 || !hideZero) UIUtils.formatDuration(v) else "" + }.getOrElse("") + } + + def formatBytes(value: Option[Long]): String = { + Utils.bytesToString(value.getOrElse(0L)) + } + {task.index} {task.taskId} @@ -1249,62 +829,98 @@ private[ui] class TaskPagedTable(
    {task.host}
    { - task.logs.map { + dataSource.executorLogs(task.executorId).map { case (logName, logUrl) => } }
    - {UIUtils.formatDate(new Date(task.launchTime))} - {task.formatDuration} + {UIUtils.formatDate(task.launchTime)} + {formatDuration(task.duration)} - {UIUtils.formatDuration(task.schedulerDelay)} + {UIUtils.formatDuration(AppStatusUtils.schedulerDelay(task))} - {UIUtils.formatDuration(task.taskDeserializationTime)} + {formatDuration(task.taskMetrics.map(_.executorDeserializeTime))} - {if (task.gcTime > 0) UIUtils.formatDuration(task.gcTime) else ""} + {formatDuration(task.taskMetrics.map(_.jvmGcTime), hideZero = true)} - {UIUtils.formatDuration(task.serializationTime)} + {formatDuration(task.taskMetrics.map(_.resultSerializationTime))} - {UIUtils.formatDuration(task.gettingResultTime)} + {UIUtils.formatDuration(AppStatusUtils.gettingResultTime(task))} - {Utils.bytesToString(task.peakExecutionMemoryUsed)} + {formatBytes(task.taskMetrics.map(_.peakExecutionMemory))} - {if (task.accumulators.nonEmpty) { - {Unparsed(task.accumulators.get)} + {if (hasAccumulators(stage)) { + accumulatorsInfo(task) }} - {if (task.input.nonEmpty) { - {task.input.get.inputReadable} + {if (hasInput(stage)) { + metricInfo(task) { m => + val bytesRead = Utils.bytesToString(m.inputMetrics.bytesRead) + val records = m.inputMetrics.recordsRead + {bytesRead} / {records} + } }} - {if (task.output.nonEmpty) { - {task.output.get.outputReadable} + {if (hasOutput(stage)) { + metricInfo(task) { m => + val bytesWritten = Utils.bytesToString(m.outputMetrics.bytesWritten) + val records = m.outputMetrics.recordsWritten + {bytesWritten} / {records} + } }} - {if (task.shuffleRead.nonEmpty) { + {if (hasShuffleRead(stage)) { - {task.shuffleRead.get.shuffleReadBlockedTimeReadable} + {formatDuration(task.taskMetrics.map(_.shuffleReadMetrics.fetchWaitTime))} - {task.shuffleRead.get.shuffleReadReadable} + { + metricInfo(task) { m => + val bytesRead = Utils.bytesToString(totalBytesRead(m.shuffleReadMetrics)) + val records = m.shuffleReadMetrics.recordsRead + Unparsed(s"$bytesRead / $records") + } + } - {task.shuffleRead.get.shuffleReadRemoteReadable} + {formatBytes(task.taskMetrics.map(_.shuffleReadMetrics.remoteBytesRead))} }} - {if (task.shuffleWrite.nonEmpty) { - {task.shuffleWrite.get.writeTimeReadable} - {task.shuffleWrite.get.shuffleWriteReadable} + {if (hasShuffleWrite(stage)) { + { + formatDuration( + task.taskMetrics.map { m => + TimeUnit.NANOSECONDS.toMillis(m.shuffleWriteMetrics.writeTime) + }, + hideZero = true) + } + { + metricInfo(task) { m => + val bytesWritten = Utils.bytesToString(m.shuffleWriteMetrics.bytesWritten) + val records = m.shuffleWriteMetrics.recordsWritten + Unparsed(s"$bytesWritten / $records") + } + } }} - {if (task.bytesSpilled.nonEmpty) { - {task.bytesSpilled.get.memoryBytesSpilledReadable} - {task.bytesSpilled.get.diskBytesSpilledReadable} + {if (hasBytesSpilled(stage)) { + {formatBytes(task.taskMetrics.map(_.memoryBytesSpilled))} + {formatBytes(task.taskMetrics.map(_.diskBytesSpilled))} }} - {errorMessageCell(task.error)} + {errorMessageCell(task.errorMessage.getOrElse(""))} } + private def accumulatorsInfo(task: TaskData): Seq[Node] = { + task.accumulatorUpdates.map { acc => + Unparsed(StringEscapeUtils.escapeHtml4(s"${acc.name}: ${acc.update}")) + } + } + + private def metricInfo(task: TaskData)(fn: TaskMetrics => Seq[Node]): Seq[Node] = { + task.taskMetrics.map(fn).getOrElse(Nil) + } + private def errorMessageCell(error: String): Seq[Node] = { val isMultiline = error.indexOf('\n') >= 0 // Display the first line by default @@ -1333,6 +949,36 @@ private[ui] class TaskPagedTable( private object 
ApiHelper { + + private val COLUMN_TO_INDEX = Map( + "ID" -> null.asInstanceOf[String], + "Index" -> TaskIndexNames.TASK_INDEX, + "Attempt" -> TaskIndexNames.ATTEMPT, + "Status" -> TaskIndexNames.STATUS, + "Locality Level" -> TaskIndexNames.LOCALITY, + "Executor ID / Host" -> TaskIndexNames.EXECUTOR, + "Launch Time" -> TaskIndexNames.LAUNCH_TIME, + "Duration" -> TaskIndexNames.DURATION, + "Scheduler Delay" -> TaskIndexNames.SCHEDULER_DELAY, + "Task Deserialization Time" -> TaskIndexNames.DESER_TIME, + "GC Time" -> TaskIndexNames.GC_TIME, + "Result Serialization Time" -> TaskIndexNames.SER_TIME, + "Getting Result Time" -> TaskIndexNames.GETTING_RESULT_TIME, + "Peak Execution Memory" -> TaskIndexNames.PEAK_MEM, + "Accumulators" -> TaskIndexNames.ACCUMULATORS, + "Input Size / Records" -> TaskIndexNames.INPUT_SIZE, + "Output Size / Records" -> TaskIndexNames.OUTPUT_SIZE, + "Shuffle Read Blocked Time" -> TaskIndexNames.SHUFFLE_READ_TIME, + "Shuffle Read Size / Records" -> TaskIndexNames.SHUFFLE_TOTAL_READS, + "Shuffle Remote Reads" -> TaskIndexNames.SHUFFLE_REMOTE_READS, + "Write Time" -> TaskIndexNames.SHUFFLE_WRITE_TIME, + "Shuffle Write Size / Records" -> TaskIndexNames.SHUFFLE_WRITE_SIZE, + "Shuffle Spill (Memory)" -> TaskIndexNames.MEM_SPILL, + "Shuffle Spill (Disk)" -> TaskIndexNames.DISK_SPILL, + "Errors" -> TaskIndexNames.ERROR) + + def hasAccumulators(stageData: StageData): Boolean = stageData.accumulatorUpdates.size > 0 + def hasInput(stageData: StageData): Boolean = stageData.inputBytes > 0 def hasOutput(stageData: StageData): Boolean = stageData.outputBytes > 0 @@ -1349,4 +995,11 @@ private object ApiHelper { metrics.localBytesRead + metrics.remoteBytesRead } + def indexName(sortColumn: String): Option[String] = { + COLUMN_TO_INDEX.get(sortColumn) match { + case Some(v) => Option(v) + case _ => throw new IllegalArgumentException(s"Invalid sort column: $sortColumn") + } + } + } diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w__custom_quantiles_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w__custom_quantiles_expectation.json index f8e27703c0def..5c42ac1d87f4c 100644 --- a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w__custom_quantiles_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w__custom_quantiles_expectation.json @@ -7,6 +7,9 @@ "resultSize" : [ 2010.0, 2065.0, 2065.0 ], "jvmGcTime" : [ 0.0, 0.0, 7.0 ], "resultSerializationTime" : [ 0.0, 0.0, 2.0 ], + "gettingResultTime" : [ 0.0, 0.0, 0.0 ], + "schedulerDelay" : [ 2.0, 6.0, 53.0 ], + "peakExecutionMemory" : [ 0.0, 0.0, 0.0 ], "memoryBytesSpilled" : [ 0.0, 0.0, 0.0 ], "diskBytesSpilled" : [ 0.0, 0.0, 0.0 ], "inputMetrics" : { diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_read_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_read_expectation.json index a28bda16a956e..e6b705989cc97 100644 --- a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_read_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_read_expectation.json @@ -7,6 +7,9 @@ "resultSize" : [ 1034.0, 1034.0, 1034.0, 1034.0, 1034.0 ], "jvmGcTime" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "resultSerializationTime" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "gettingResultTime" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "schedulerDelay" : [ 4.0, 4.0, 6.0, 7.0, 9.0 ], + "peakExecutionMemory" 
: [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "memoryBytesSpilled" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "diskBytesSpilled" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "inputMetrics" : { diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_write_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_write_expectation.json index ede3eaed1d1d2..788f28cf7b365 100644 --- a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_write_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_write_expectation.json @@ -7,6 +7,9 @@ "resultSize" : [ 2010.0, 2065.0, 2065.0, 2065.0, 2065.0 ], "jvmGcTime" : [ 0.0, 0.0, 0.0, 5.0, 7.0 ], "resultSerializationTime" : [ 0.0, 0.0, 0.0, 0.0, 1.0 ], + "gettingResultTime" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "schedulerDelay" : [ 2.0, 4.0, 6.0, 13.0, 40.0 ], + "peakExecutionMemory" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "memoryBytesSpilled" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "diskBytesSpilled" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "inputMetrics" : { diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index b8c84e24c2c3f..ca66b6b9db890 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -213,45 +213,42 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { s1Tasks.foreach { task => check[TaskDataWrapper](task.taskId) { wrapper => - assert(wrapper.info.taskId === task.taskId) + assert(wrapper.taskId === task.taskId) assert(wrapper.stageId === stages.head.stageId) - assert(wrapper.stageAttemptId === stages.head.attemptNumber) - assert(Arrays.equals(wrapper.stage, Array(stages.head.stageId, stages.head.attemptNumber))) - - val runtime = Array[AnyRef](stages.head.stageId: JInteger, - stages.head.attemptNumber: JInteger, - -1L: JLong) - assert(Arrays.equals(wrapper.runtime, runtime)) - - assert(wrapper.info.index === task.index) - assert(wrapper.info.attempt === task.attemptNumber) - assert(wrapper.info.launchTime === new Date(task.launchTime)) - assert(wrapper.info.executorId === task.executorId) - assert(wrapper.info.host === task.host) - assert(wrapper.info.status === task.status) - assert(wrapper.info.taskLocality === task.taskLocality.toString()) - assert(wrapper.info.speculative === task.speculative) + assert(wrapper.stageAttemptId === stages.head.attemptId) + assert(wrapper.index === task.index) + assert(wrapper.attempt === task.attemptNumber) + assert(wrapper.launchTime === task.launchTime) + assert(wrapper.executorId === task.executorId) + assert(wrapper.host === task.host) + assert(wrapper.status === task.status) + assert(wrapper.taskLocality === task.taskLocality.toString()) + assert(wrapper.speculative === task.speculative) } } - // Send executor metrics update. Only update one metric to avoid a lot of boilerplate code. - s1Tasks.foreach { task => - val accum = new AccumulableInfo(1L, Some(InternalAccumulator.MEMORY_BYTES_SPILLED), - Some(1L), None, true, false, None) - listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate( - task.executorId, - Seq((task.taskId, stages.head.stageId, stages.head.attemptNumber, Seq(accum))))) - } + // Send two executor metrics update. Only update one metric to avoid a lot of boilerplate code. 
+ // The tasks are distributed among the two executors, so the executor-level metrics should + // hold half of the cummulative value of the metric being updated. + Seq(1L, 2L).foreach { value => + s1Tasks.foreach { task => + val accum = new AccumulableInfo(1L, Some(InternalAccumulator.MEMORY_BYTES_SPILLED), + Some(value), None, true, false, None) + listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate( + task.executorId, + Seq((task.taskId, stages.head.stageId, stages.head.attemptNumber, Seq(accum))))) + } - check[StageDataWrapper](key(stages.head)) { stage => - assert(stage.info.memoryBytesSpilled === s1Tasks.size) - } + check[StageDataWrapper](key(stages.head)) { stage => + assert(stage.info.memoryBytesSpilled === s1Tasks.size * value) + } - val execs = store.view(classOf[ExecutorStageSummaryWrapper]).index("stage") - .first(key(stages.head)).last(key(stages.head)).asScala.toSeq - assert(execs.size > 0) - execs.foreach { exec => - assert(exec.info.memoryBytesSpilled === s1Tasks.size / 2) + val execs = store.view(classOf[ExecutorStageSummaryWrapper]).index("stage") + .first(key(stages.head)).last(key(stages.head)).asScala.toSeq + assert(execs.size > 0) + execs.foreach { exec => + assert(exec.info.memoryBytesSpilled === s1Tasks.size * value / 2) + } } // Fail one of the tasks, re-start it. @@ -278,13 +275,13 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } check[TaskDataWrapper](s1Tasks.head.taskId) { task => - assert(task.info.status === s1Tasks.head.status) - assert(task.info.errorMessage == Some(TaskResultLost.toErrorString)) + assert(task.status === s1Tasks.head.status) + assert(task.errorMessage == Some(TaskResultLost.toErrorString)) } check[TaskDataWrapper](reattempt.taskId) { task => - assert(task.info.index === s1Tasks.head.index) - assert(task.info.attempt === reattempt.attemptNumber) + assert(task.index === s1Tasks.head.index) + assert(task.attempt === reattempt.attemptNumber) } // Kill one task, restart it. @@ -306,8 +303,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } check[TaskDataWrapper](killed.taskId) { task => - assert(task.info.index === killed.index) - assert(task.info.errorMessage === Some("killed")) + assert(task.index === killed.index) + assert(task.errorMessage === Some("killed")) } // Start a new attempt and finish it with TaskCommitDenied, make sure it's handled like a kill. @@ -334,8 +331,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } check[TaskDataWrapper](denied.taskId) { task => - assert(task.info.index === killed.index) - assert(task.info.errorMessage === Some(denyReason.toErrorString)) + assert(task.index === killed.index) + assert(task.errorMessage === Some(denyReason.toErrorString)) } // Start a new attempt. 
@@ -373,10 +370,10 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { pending.foreach { task => check[TaskDataWrapper](task.taskId) { wrapper => - assert(wrapper.info.errorMessage === None) - assert(wrapper.info.taskMetrics.get.executorCpuTime === 2L) - assert(wrapper.info.taskMetrics.get.executorRunTime === 4L) - assert(wrapper.info.duration === Some(task.duration)) + assert(wrapper.errorMessage === None) + assert(wrapper.executorCpuTime === 2L) + assert(wrapper.executorRunTime === 4L) + assert(wrapper.duration === task.duration) } } @@ -894,6 +891,23 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { assert(store.count(classOf[StageDataWrapper]) === 3) assert(store.count(classOf[RDDOperationGraphWrapper]) === 3) + val dropped = stages.drop(1).head + + // Cache some quantiles by calling AppStatusStore.taskSummary(). For quantiles to be + // calculcated, we need at least one finished task. + time += 1 + val task = createTasks(1, Array("1")).head + listener.onTaskStart(SparkListenerTaskStart(dropped.stageId, dropped.attemptId, task)) + + time += 1 + task.markFinished(TaskState.FINISHED, time) + listener.onTaskEnd(SparkListenerTaskEnd(dropped.stageId, dropped.attemptId, + "taskType", Success, task, null)) + + new AppStatusStore(store) + .taskSummary(dropped.stageId, dropped.attemptId, Array(0.25d, 0.50d, 0.75d)) + assert(store.count(classOf[CachedQuantile], "stage", key(dropped)) === 3) + stages.drop(1).foreach { s => time += 1 s.completionTime = Some(time) @@ -905,6 +919,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { intercept[NoSuchElementException] { store.read(classOf[StageDataWrapper], Array(2, 0)) } + assert(store.count(classOf[CachedQuantile], "stage", key(dropped)) === 0) val attempt2 = new StageInfo(3, 1, "stage3", 4, Nil, Nil, "details3") time += 1 diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala new file mode 100644 index 0000000000000..92f90f3d96ddf --- /dev/null +++ b/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.status + +import org.apache.spark.SparkFunSuite +import org.apache.spark.status.api.v1.TaskMetricDistributions +import org.apache.spark.util.Distribution +import org.apache.spark.util.kvstore._ + +class AppStatusStoreSuite extends SparkFunSuite { + + private val uiQuantiles = Array(0.0, 0.25, 0.5, 0.75, 1.0) + private val stageId = 1 + private val attemptId = 1 + + test("quantile calculation: 1 task") { + compareQuantiles(1, uiQuantiles) + } + + test("quantile calculation: few tasks") { + compareQuantiles(4, uiQuantiles) + } + + test("quantile calculation: more tasks") { + compareQuantiles(100, uiQuantiles) + } + + test("quantile calculation: lots of tasks") { + compareQuantiles(4096, uiQuantiles) + } + + test("quantile calculation: custom quantiles") { + compareQuantiles(4096, Array(0.01, 0.33, 0.5, 0.42, 0.69, 0.99)) + } + + test("quantile cache") { + val store = new InMemoryStore() + (0 until 4096).foreach { i => store.write(newTaskData(i)) } + + val appStore = new AppStatusStore(store) + + appStore.taskSummary(stageId, attemptId, Array(0.13d)) + intercept[NoSuchElementException] { + store.read(classOf[CachedQuantile], Array(stageId, attemptId, "13")) + } + + appStore.taskSummary(stageId, attemptId, Array(0.25d)) + val d1 = store.read(classOf[CachedQuantile], Array(stageId, attemptId, "25")) + + // Add a new task to force the cached quantile to be evicted, and make sure it's updated. + store.write(newTaskData(4096)) + appStore.taskSummary(stageId, attemptId, Array(0.25d, 0.50d, 0.73d)) + + val d2 = store.read(classOf[CachedQuantile], Array(stageId, attemptId, "25")) + assert(d1.taskCount != d2.taskCount) + + store.read(classOf[CachedQuantile], Array(stageId, attemptId, "50")) + intercept[NoSuchElementException] { + store.read(classOf[CachedQuantile], Array(stageId, attemptId, "73")) + } + + assert(store.count(classOf[CachedQuantile]) === 2) + } + + private def compareQuantiles(count: Int, quantiles: Array[Double]): Unit = { + val store = new InMemoryStore() + val values = (0 until count).map { i => + val task = newTaskData(i) + store.write(task) + i.toDouble + }.toArray + + val summary = new AppStatusStore(store).taskSummary(stageId, attemptId, quantiles).get + val dist = new Distribution(values, 0, values.length).getQuantiles(quantiles.sorted) + + dist.zip(summary.executorRunTime).foreach { case (expected, actual) => + assert(expected === actual) + } + } + + private def newTaskData(i: Int): TaskDataWrapper = { + new TaskDataWrapper( + i, i, i, i, i, i, i.toString, i.toString, i.toString, i.toString, false, Nil, None, + i, i, i, i, i, i, i, i, i, i, + i, i, i, i, i, i, i, i, i, i, + i, i, i, i, stageId, attemptId) + } + +} diff --git a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala index 661d0d48d2f37..0aeddf730cd35 100644 --- a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala @@ -28,6 +28,7 @@ import org.apache.spark._ import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler._ import org.apache.spark.status.AppStatusStore +import org.apache.spark.status.config._ import org.apache.spark.ui.jobs.{StagePage, StagesTab} class StagePageSuite extends SparkFunSuite with LocalSparkContext { @@ -35,15 +36,13 @@ class StagePageSuite extends SparkFunSuite with LocalSparkContext { private val peakExecutionMemory = 10 test("peak execution memory should displayed") { - val conf = new 
SparkConf(false) - val html = renderStagePage(conf).toString().toLowerCase(Locale.ROOT) + val html = renderStagePage().toString().toLowerCase(Locale.ROOT) val targetString = "peak execution memory" assert(html.contains(targetString)) } test("SPARK-10543: peak execution memory should be per-task rather than cumulative") { - val conf = new SparkConf(false) - val html = renderStagePage(conf).toString().toLowerCase(Locale.ROOT) + val html = renderStagePage().toString().toLowerCase(Locale.ROOT) // verify min/25/50/75/max show task value not cumulative values assert(html.contains(s"$peakExecutionMemory.0 b" * 5)) } @@ -52,7 +51,8 @@ class StagePageSuite extends SparkFunSuite with LocalSparkContext { * Render a stage page started with the given conf and return the HTML. * This also runs a dummy stage to populate the page with useful content. */ - private def renderStagePage(conf: SparkConf): Seq[Node] = { + private def renderStagePage(): Seq[Node] = { + val conf = new SparkConf(false).set(LIVE_ENTITY_UPDATE_PERIOD, 0L) val statusStore = AppStatusStore.createLiveStore(conf) val listener = statusStore.listener.get diff --git a/scalastyle-config.xml b/scalastyle-config.xml index 7bdd3fac773a3..e2fa5754afaee 100644 --- a/scalastyle-config.xml +++ b/scalastyle-config.xml @@ -93,7 +93,7 @@ This file is divided into 3 sections: - +

From 79959890570d216c33069c8382b29d53977665b1 Mon Sep 17 00:00:00 2001
From: wuyi5
Date: Thu, 11 Jan 2018 22:17:15 +0900
Subject: [PATCH 065/566] [SPARK-22967][TESTS] Fix VersionsSuite's unit tests by changing Windows paths into URI paths

## What changes were proposed in this pull request?
Two unit tests fail on Windows because of Windows-format paths:

1. test(s"$version: read avro file containing decimal")
```
org.apache.hadoop.hive.ql.metadata.HiveException: MetaException(message:java.lang.IllegalArgumentException: Can not create a Path from an empty string);
```
2. test(s"$version: SPARK-17920: Insert into/overwrite avro table")
```
Unable to infer the schema. The schema specification is required to create the table `default`.`tab2`.;
org.apache.spark.sql.AnalysisException: Unable to infer the schema. The schema specification is required to create the table `default`.`tab2`.;
```

This PR fixes both tests by converting the Windows paths into URI paths.

## How was this patch tested?
Existing tests.

Author: wuyi5

Closes #20199 from Ngone51/SPARK-22967.
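The whole fix reduces to one pattern: hand Hive a URI string rather than a platform-specific file path. A minimal sketch of that conversion, mirroring the change in the diff below (the `avroDecimal` resource is the one used by the test; the surrounding code is illustrative only and assumes the resource is on the classpath):

```
import java.io.File

// The failing tests handed Hive a java.io.File or a canonical path; converting the
// File to a URI string (file:/...) gives a location Hive can parse on any platform.
val url = Thread.currentThread().getContextClassLoader.getResource("avroDecimal")
val location = new File(url.getFile).toURI.toString  // previously: new File(url.getFile)
```

The same `toURI.toString` substitution is applied to `schemaPath` and `srcLocation` in the second test.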
(cherry picked from commit 0552c36e02434c60dad82024334d291f6008b822) Signed-off-by: hyukjinkwon --- .../org/apache/spark/sql/hive/client/VersionsSuite.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index ff90e9dda5f7c..e64389e56b5a1 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -811,7 +811,7 @@ class VersionsSuite extends SparkFunSuite with Logging { test(s"$version: read avro file containing decimal") { val url = Thread.currentThread().getContextClassLoader.getResource("avroDecimal") - val location = new File(url.getFile) + val location = new File(url.getFile).toURI.toString val tableName = "tab1" val avroSchema = @@ -851,6 +851,8 @@ class VersionsSuite extends SparkFunSuite with Logging { } test(s"$version: SPARK-17920: Insert into/overwrite avro table") { + // skipped because it's failed in the condition on Windows + assume(!(Utils.isWindows && version == "0.12")) withTempDir { dir => val avroSchema = """ @@ -875,10 +877,10 @@ class VersionsSuite extends SparkFunSuite with Logging { val writer = new PrintWriter(schemaFile) writer.write(avroSchema) writer.close() - val schemaPath = schemaFile.getCanonicalPath + val schemaPath = schemaFile.toURI.toString val url = Thread.currentThread().getContextClassLoader.getResource("avroDecimal") - val srcLocation = new File(url.getFile).getCanonicalPath + val srcLocation = new File(url.getFile).toURI.toString val destTableName = "tab1" val srcTableName = "tab2" From 9ca0f6eaf6744c090cab4ac6720cf11c9b83915e Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Thu, 11 Jan 2018 21:32:36 +0800 Subject: [PATCH 066/566] [SPARK-23000][TEST-HADOOP2.6] Fix Flaky test suite DataSourceWithHiveMetastoreCatalogSuite ## What changes were proposed in this pull request? The Spark 2.3 branch still failed due to the flaky test suite `DataSourceWithHiveMetastoreCatalogSuite `. https://amplab.cs.berkeley.edu/jenkins/job/spark-branch-2.3-test-sbt-hadoop-2.6/ Although https://github.com/apache/spark/pull/20207 is unable to reproduce it in Spark 2.3, it sounds like the current DB of Spark's Catalog is changed based on the following stacktrace. Thus, we just need to reset it. 
``` [info] DataSourceWithHiveMetastoreCatalogSuite: 02:40:39.486 ERROR org.apache.hadoop.hive.ql.parse.CalcitePlanner: org.apache.hadoop.hive.ql.parse.SemanticException: Line 1:14 Table not found 't' at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.getMetaData(SemanticAnalyzer.java:1594) at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.getMetaData(SemanticAnalyzer.java:1545) at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genResolvedParseTree(SemanticAnalyzer.java:10077) at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:10128) at org.apache.hadoop.hive.ql.parse.CalcitePlanner.analyzeInternal(CalcitePlanner.java:209) at org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:227) at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:424) at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:308) at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:1122) at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1170) at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1059) at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1049) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$runHive$1.apply(HiveClientImpl.scala:694) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$runHive$1.apply(HiveClientImpl.scala:683) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$withHiveState$1.apply(HiveClientImpl.scala:272) at org.apache.spark.sql.hive.client.HiveClientImpl.liftedTree1$1(HiveClientImpl.scala:210) at org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:209) at org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:255) at org.apache.spark.sql.hive.client.HiveClientImpl.runHive(HiveClientImpl.scala:683) at org.apache.spark.sql.hive.client.HiveClientImpl.runSqlHive(HiveClientImpl.scala:673) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite$$anonfun$9$$anonfun$apply$1$$anonfun$apply$mcV$sp$3.apply$mcV$sp(HiveMetastoreCatalogSuite.scala:185) at org.apache.spark.sql.test.SQLTestUtilsBase$class.withTable(SQLTestUtils.scala:273) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite.withTable(HiveMetastoreCatalogSuite.scala:139) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite$$anonfun$9$$anonfun$apply$1.apply$mcV$sp(HiveMetastoreCatalogSuite.scala:163) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite$$anonfun$9$$anonfun$apply$1.apply(HiveMetastoreCatalogSuite.scala:163) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite$$anonfun$9$$anonfun$apply$1.apply(HiveMetastoreCatalogSuite.scala:163) at org.scalatest.OutcomeOf$class.outcomeOf(OutcomeOf.scala:85) at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104) at org.scalatest.Transformer.apply(Transformer.scala:22) at org.scalatest.Transformer.apply(Transformer.scala:20) at org.scalatest.FunSuiteLike$$anon$1.apply(FunSuiteLike.scala:186) at org.apache.spark.SparkFunSuite.withFixture(SparkFunSuite.scala:68) at org.scalatest.FunSuiteLike$class.invokeWithFixture$1(FunSuiteLike.scala:183) at org.scalatest.FunSuiteLike$$anonfun$runTest$1.apply(FunSuiteLike.scala:196) at org.scalatest.FunSuiteLike$$anonfun$runTest$1.apply(FunSuiteLike.scala:196) at org.scalatest.SuperEngine.runTestImpl(Engine.scala:289) at org.scalatest.FunSuiteLike$class.runTest(FunSuiteLike.scala:196) at org.scalatest.FunSuite.runTest(FunSuite.scala:1560) at 
org.scalatest.FunSuiteLike$$anonfun$runTests$1.apply(FunSuiteLike.scala:229) at org.scalatest.FunSuiteLike$$anonfun$runTests$1.apply(FunSuiteLike.scala:229) at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:396) at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:384) at scala.collection.immutable.List.foreach(List.scala:381) at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:384) at org.scalatest.SuperEngine.org$scalatest$SuperEngine$$runTestsInBranch(Engine.scala:379) at org.scalatest.SuperEngine.runTestsImpl(Engine.scala:461) at org.scalatest.FunSuiteLike$class.runTests(FunSuiteLike.scala:229) at org.scalatest.FunSuite.runTests(FunSuite.scala:1560) at org.scalatest.Suite$class.run(Suite.scala:1147) at org.scalatest.FunSuite.org$scalatest$FunSuiteLike$$super$run(FunSuite.scala:1560) at org.scalatest.FunSuiteLike$$anonfun$run$1.apply(FunSuiteLike.scala:233) at org.scalatest.FunSuiteLike$$anonfun$run$1.apply(FunSuiteLike.scala:233) at org.scalatest.SuperEngine.runImpl(Engine.scala:521) at org.scalatest.FunSuiteLike$class.run(FunSuiteLike.scala:233) at org.apache.spark.SparkFunSuite.org$scalatest$BeforeAndAfterAll$$super$run(SparkFunSuite.scala:31) at org.scalatest.BeforeAndAfterAll$class.liftedTree1$1(BeforeAndAfterAll.scala:213) at org.scalatest.BeforeAndAfterAll$class.run(BeforeAndAfterAll.scala:210) at org.apache.spark.SparkFunSuite.run(SparkFunSuite.scala:31) at org.scalatest.tools.Framework.org$scalatest$tools$Framework$$runSuite(Framework.scala:314) at org.scalatest.tools.Framework$ScalaTestTask.execute(Framework.scala:480) at sbt.ForkMain$Run$2.call(ForkMain.java:296) at sbt.ForkMain$Run$2.call(ForkMain.java:286) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) ``` ## How was this patch tested? N/A Author: gatorsmile Closes #20218 from gatorsmile/testFixAgain. (cherry picked from commit 76892bcf2c08efd7e9c5b16d377e623d82fe695e) Signed-off-by: gatorsmile --- .../org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala index cf4ce83124d88..ba9b944e4a055 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala @@ -148,6 +148,7 @@ class DataSourceWithHiveMetastoreCatalogSuite override def beforeAll(): Unit = { super.beforeAll() + sparkSession.sessionState.catalog.reset() sparkSession.metadataHive.reset() } From f624850fe8acce52240217f376316734a23be00b Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Thu, 11 Jan 2018 22:33:42 +0900 Subject: [PATCH 067/566] [SPARK-19732][FOLLOW-UP] Document behavior changes made in na.fill and fillna ## What changes were proposed in this pull request? https://github.com/apache/spark/pull/18164 introduces the behavior changes. We need to document it. ## How was this patch tested? N/A Author: gatorsmile Closes #20234 from gatorsmile/docBehaviorChange. 
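To make the documented change concrete, a hedged Scala sketch follows; it assumes the `DataFrameNaFunctions.fill(Boolean)` overload that the same PR added on the Scala side (the PySpark equivalent is `df.fillna(True)`, which older releases silently ignored).

```
// Hedged sketch of the behavior change being documented: filling null booleans.
import org.apache.spark.sql.SparkSession

object BooleanFillSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("boolean-fill-sketch")
      .getOrCreate()
    import spark.implicits._

    // A nullable boolean column: None becomes a null in "flag".
    val df = Seq(("a", Some(true)), ("b", None), ("c", Some(false))).toDF("id", "flag")

    // Since Spark 2.3 the null in "flag" is replaced with true; prior PySpark
    // releases ignored boolean fill values and returned the data unchanged.
    df.na.fill(true).show()

    spark.stop()
  }
}
```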
(cherry picked from commit b46e58b74c82dac37b7b92284ea3714919c5a886) Signed-off-by: hyukjinkwon --- docs/sql-programming-guide.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 72f79d6909ecc..258c769ff593b 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1788,12 +1788,10 @@ options. Note that, for DecimalType(38,0)*, the table above intentionally does not cover all other combinations of scales and precisions because currently we only infer decimal type like `BigInteger`/`BigInt`. For example, 1.1 is inferred as double type. - In PySpark, now we need Pandas 0.19.2 or upper if you want to use Pandas related functionalities, such as `toPandas`, `createDataFrame` from Pandas DataFrame, etc. - In PySpark, the behavior of timestamp values for Pandas related functionalities was changed to respect session timezone. If you want to use the old behavior, you need to set a configuration `spark.sql.execution.pandas.respectSessionTimeZone` to `False`. See [SPARK-22395](https://issues.apache.org/jira/browse/SPARK-22395) for details. - - - Since Spark 2.3, when either broadcast hash join or broadcast nested loop join is applicable, we prefer to broadcasting the table that is explicitly specified in a broadcast hint. For details, see the section [Broadcast Hint](#broadcast-hint-for-sql-queries) and [SPARK-22489](https://issues.apache.org/jira/browse/SPARK-22489). - - - Since Spark 2.3, when all inputs are binary, `functions.concat()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.concatBinaryAsString` to `true`. - - - Since Spark 2.3, when all inputs are binary, SQL `elt()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.eltOutputAsString` to `true`. + - In PySpark, `na.fill()` or `fillna` also accepts boolean and replaces nulls with booleans. In prior Spark versions, PySpark just ignores it and returns the original Dataset/DataFrame. + - Since Spark 2.3, when either broadcast hash join or broadcast nested loop join is applicable, we prefer to broadcasting the table that is explicitly specified in a broadcast hint. For details, see the section [Broadcast Hint](#broadcast-hint-for-sql-queries) and [SPARK-22489](https://issues.apache.org/jira/browse/SPARK-22489). + - Since Spark 2.3, when all inputs are binary, `functions.concat()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.concatBinaryAsString` to `true`. + - Since Spark 2.3, when all inputs are binary, SQL `elt()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.eltOutputAsString` to `true`. ## Upgrading From Spark SQL 2.1 to 2.2 From b94debd2b01b87ef1d2a34d48877e38ade0969e6 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 11 Jan 2018 10:37:35 -0800 Subject: [PATCH 068/566] [SPARK-22994][K8S] Use a single image for all Spark containers. 
This change allows a user to submit a Spark application on kubernetes having to provide a single image, instead of one image for each type of container. The image's entry point now takes an extra argument that identifies the process that is being started. The configuration still allows the user to provide different images for each container type if they so desire. On top of that, the entry point was simplified a bit to share more code; mainly, the same env variable is used to propagate the user-defined classpath to the different containers. Aside from being modified to match the new behavior, the 'build-push-docker-images.sh' script was renamed to 'docker-image-tool.sh' to more closely match its purpose; the old name was a little awkward and now also not entirely correct, since there is a single image. It was also moved to 'bin' since it's not necessarily an admin tool. Docs have been updated to match the new behavior. Tested locally with minikube. Author: Marcelo Vanzin Closes #20192 from vanzin/SPARK-22994. (cherry picked from commit 0b2eefb674151a0af64806728b38d9410da552ec) Signed-off-by: Marcelo Vanzin --- .../docker-image-tool.sh | 68 ++++++------- docs/running-on-kubernetes.md | 58 +++++------ .../org/apache/spark/deploy/k8s/Config.scala | 17 ++-- .../apache/spark/deploy/k8s/Constants.scala | 3 +- .../deploy/k8s/InitContainerBootstrap.scala | 1 + .../steps/BasicDriverConfigurationStep.scala | 3 +- .../cluster/k8s/ExecutorPodFactory.scala | 3 +- .../DriverConfigOrchestratorSuite.scala | 12 +-- .../BasicDriverConfigurationStepSuite.scala | 4 +- ...InitContainerConfigOrchestratorSuite.scala | 4 +- .../cluster/k8s/ExecutorPodFactorySuite.scala | 4 +- .../src/main/dockerfiles/driver/Dockerfile | 35 ------- .../src/main/dockerfiles/executor/Dockerfile | 35 ------- .../dockerfiles/init-container/Dockerfile | 24 ----- .../main/dockerfiles/spark-base/entrypoint.sh | 37 ------- .../{spark-base => spark}/Dockerfile | 10 +- .../src/main/dockerfiles/spark/entrypoint.sh | 97 +++++++++++++++++++ 17 files changed, 189 insertions(+), 226 deletions(-) rename sbin/build-push-docker-images.sh => bin/docker-image-tool.sh (63%) delete mode 100644 resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile delete mode 100644 resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile delete mode 100644 resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile delete mode 100755 resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh rename resource-managers/kubernetes/docker/src/main/dockerfiles/{spark-base => spark}/Dockerfile (87%) create mode 100755 resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh diff --git a/sbin/build-push-docker-images.sh b/bin/docker-image-tool.sh similarity index 63% rename from sbin/build-push-docker-images.sh rename to bin/docker-image-tool.sh index b9532597419a5..071406336d1b1 100755 --- a/sbin/build-push-docker-images.sh +++ b/bin/docker-image-tool.sh @@ -24,29 +24,11 @@ function error { exit 1 } -# Detect whether this is a git clone or a Spark distribution and adjust paths -# accordingly. if [ -z "${SPARK_HOME}" ]; then SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" fi . "${SPARK_HOME}/bin/load-spark-env.sh" -if [ -f "$SPARK_HOME/RELEASE" ]; then - IMG_PATH="kubernetes/dockerfiles" - SPARK_JARS="jars" -else - IMG_PATH="resource-managers/kubernetes/docker/src/main/dockerfiles" - SPARK_JARS="assembly/target/scala-$SPARK_SCALA_VERSION/jars" -fi - -if [ ! 
-d "$IMG_PATH" ]; then - error "Cannot find docker images. This script must be run from a runnable distribution of Apache Spark." -fi - -declare -A path=( [spark-driver]="$IMG_PATH/driver/Dockerfile" \ - [spark-executor]="$IMG_PATH/executor/Dockerfile" \ - [spark-init]="$IMG_PATH/init-container/Dockerfile" ) - function image_ref { local image="$1" local add_repo="${2:-1}" @@ -60,35 +42,49 @@ function image_ref { } function build { - docker build \ - --build-arg "spark_jars=$SPARK_JARS" \ - --build-arg "img_path=$IMG_PATH" \ - -t spark-base \ - -f "$IMG_PATH/spark-base/Dockerfile" . - for image in "${!path[@]}"; do - docker build -t "$(image_ref $image)" -f ${path[$image]} . - done + local BUILD_ARGS + local IMG_PATH + + if [ ! -f "$SPARK_HOME/RELEASE" ]; then + # Set image build arguments accordingly if this is a source repo and not a distribution archive. + IMG_PATH=resource-managers/kubernetes/docker/src/main/dockerfiles + BUILD_ARGS=( + --build-arg + img_path=$IMG_PATH + --build-arg + spark_jars=assembly/target/scala-$SPARK_SCALA_VERSION/jars + ) + else + # Not passed as an argument to docker, but used to validate the Spark directory. + IMG_PATH="kubernetes/dockerfiles" + fi + + if [ ! -d "$IMG_PATH" ]; then + error "Cannot find docker image. This script must be run from a runnable distribution of Apache Spark." + fi + + docker build "${BUILD_ARGS[@]}" \ + -t $(image_ref spark) \ + -f "$IMG_PATH/spark/Dockerfile" . } function push { - for image in "${!path[@]}"; do - docker push "$(image_ref $image)" - done + docker push "$(image_ref spark)" } function usage { cat < -t my-tag build - ./sbin/build-push-docker-images.sh -r -t my-tag push - -Docker files are under the `kubernetes/dockerfiles/` directory and can be customized further before -building using the supplied script, or manually. + ./bin/docker-image-tool.sh -r -t my-tag build + ./bin/docker-image-tool.sh -r -t my-tag push ## Cluster Mode @@ -79,8 +76,7 @@ $ bin/spark-submit \ --name spark-pi \ --class org.apache.spark.examples.SparkPi \ --conf spark.executor.instances=5 \ - --conf spark.kubernetes.driver.container.image= \ - --conf spark.kubernetes.executor.container.image= \ + --conf spark.kubernetes.container.image= \ local:///path/to/examples.jar ``` @@ -126,13 +122,7 @@ Those dependencies can be added to the classpath by referencing them with `local ### Using Remote Dependencies When there are application dependencies hosted in remote locations like HDFS or HTTP servers, the driver and executor pods need a Kubernetes [init-container](https://kubernetes.io/docs/concepts/workloads/pods/init-containers/) for downloading -the dependencies so the driver and executor containers can use them locally. This requires users to specify the container -image for the init-container using the configuration property `spark.kubernetes.initContainer.image`. For example, users -simply add the following option to the `spark-submit` command to specify the init-container image: - -``` ---conf spark.kubernetes.initContainer.image= -``` +the dependencies so the driver and executor containers can use them locally. The init-container handles remote dependencies specified in `spark.jars` (or the `--jars` option of `spark-submit`) and `spark.files` (or the `--files` option of `spark-submit`). 
It also handles remotely hosted main application resources, e.g., @@ -147,9 +137,7 @@ $ bin/spark-submit \ --jars https://path/to/dependency1.jar,https://path/to/dependency2.jar --files hdfs://host:port/path/to/file1,hdfs://host:port/path/to/file2 --conf spark.executor.instances=5 \ - --conf spark.kubernetes.driver.container.image= \ - --conf spark.kubernetes.executor.container.image= \ - --conf spark.kubernetes.initContainer.image= + --conf spark.kubernetes.container.image= \ https://path/to/examples.jar ``` @@ -322,21 +310,27 @@ specific to Spark on Kubernetes. - spark.kubernetes.driver.container.image + spark.kubernetes.container.image (none) - Container image to use for the driver. - This is usually of the form example.com/repo/spark-driver:v1.0.0. - This configuration is required and must be provided by the user. + Container image to use for the Spark application. + This is usually of the form example.com/repo/spark:v1.0.0. + This configuration is required and must be provided by the user, unless explicit + images are provided for each different container type. + + + + spark.kubernetes.driver.container.image + (value of spark.kubernetes.container.image) + + Custom container image to use for the driver. spark.kubernetes.executor.container.image - (none) + (value of spark.kubernetes.container.image) - Container image to use for the executors. - This is usually of the form example.com/repo/spark-executor:v1.0.0. - This configuration is required and must be provided by the user. + Custom container image to use for executors. @@ -643,9 +637,9 @@ specific to Spark on Kubernetes. spark.kubernetes.initContainer.image - (none) + (value of spark.kubernetes.container.image) - Container image for the init-container of the driver and executors for downloading dependencies. This is usually of the form example.com/repo/spark-init:v1.0.0. This configuration is optional and must be provided by the user if any non-container local dependency is used and must be downloaded remotely. + Custom container image for the init container of both driver and executors. diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala index e5d79d9a9d9da..471196ac0e3f6 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala @@ -29,17 +29,23 @@ private[spark] object Config extends Logging { .stringConf .createWithDefault("default") + val CONTAINER_IMAGE = + ConfigBuilder("spark.kubernetes.container.image") + .doc("Container image to use for Spark containers. Individual container types " + + "(e.g. 
driver or executor) can also be configured to use different images if desired, " + + "by setting the container type-specific image name.") + .stringConf + .createOptional + val DRIVER_CONTAINER_IMAGE = ConfigBuilder("spark.kubernetes.driver.container.image") .doc("Container image to use for the driver.") - .stringConf - .createOptional + .fallbackConf(CONTAINER_IMAGE) val EXECUTOR_CONTAINER_IMAGE = ConfigBuilder("spark.kubernetes.executor.container.image") .doc("Container image to use for the executors.") - .stringConf - .createOptional + .fallbackConf(CONTAINER_IMAGE) val CONTAINER_IMAGE_PULL_POLICY = ConfigBuilder("spark.kubernetes.container.image.pullPolicy") @@ -148,8 +154,7 @@ private[spark] object Config extends Logging { val INIT_CONTAINER_IMAGE = ConfigBuilder("spark.kubernetes.initContainer.image") .doc("Image for the driver and executor's init-container for downloading dependencies.") - .stringConf - .createOptional + .fallbackConf(CONTAINER_IMAGE) val INIT_CONTAINER_MOUNT_TIMEOUT = ConfigBuilder("spark.kubernetes.mountDependencies.timeout") diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala index 111cb2a3b75e5..9411956996843 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala @@ -60,10 +60,9 @@ private[spark] object Constants { val ENV_APPLICATION_ID = "SPARK_APPLICATION_ID" val ENV_EXECUTOR_ID = "SPARK_EXECUTOR_ID" val ENV_EXECUTOR_POD_IP = "SPARK_EXECUTOR_POD_IP" - val ENV_EXECUTOR_EXTRA_CLASSPATH = "SPARK_EXECUTOR_EXTRA_CLASSPATH" val ENV_MOUNTED_CLASSPATH = "SPARK_MOUNTED_CLASSPATH" val ENV_JAVA_OPT_PREFIX = "SPARK_JAVA_OPT_" - val ENV_SUBMIT_EXTRA_CLASSPATH = "SPARK_SUBMIT_EXTRA_CLASSPATH" + val ENV_CLASSPATH = "SPARK_CLASSPATH" val ENV_DRIVER_MAIN_CLASS = "SPARK_DRIVER_CLASS" val ENV_DRIVER_ARGS = "SPARK_DRIVER_ARGS" val ENV_DRIVER_JAVA_OPTS = "SPARK_DRIVER_JAVA_OPTS" diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala index dfeccf9e2bd1c..f6a57dfe00171 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala @@ -77,6 +77,7 @@ private[spark] class InitContainerBootstrap( .withMountPath(INIT_CONTAINER_PROPERTIES_FILE_DIR) .endVolumeMount() .addToVolumeMounts(sharedVolumeMounts: _*) + .addToArgs("init") .addToArgs(INIT_CONTAINER_PROPERTIES_FILE_PATH) .build() diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala index eca46b84c6066..164e2e5594778 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala @@ -66,7 +66,7 @@ private[spark] class BasicDriverConfigurationStep( override def configureDriver(driverSpec: 
KubernetesDriverSpec): KubernetesDriverSpec = { val driverExtraClasspathEnv = driverExtraClasspath.map { classPath => new EnvVarBuilder() - .withName(ENV_SUBMIT_EXTRA_CLASSPATH) + .withName(ENV_CLASSPATH) .withValue(classPath) .build() } @@ -133,6 +133,7 @@ private[spark] class BasicDriverConfigurationStep( .addToLimits("memory", driverMemoryLimitQuantity) .addToLimits(maybeCpuLimitQuantity.toMap.asJava) .endResources() + .addToArgs("driver") .build() val baseDriverPod = new PodBuilder(driverSpec.driverPod) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala index bcacb3934d36a..141bd2827e7c5 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala @@ -128,7 +128,7 @@ private[spark] class ExecutorPodFactory( .build() val executorExtraClasspathEnv = executorExtraClasspath.map { cp => new EnvVarBuilder() - .withName(ENV_EXECUTOR_EXTRA_CLASSPATH) + .withName(ENV_CLASSPATH) .withValue(cp) .build() } @@ -181,6 +181,7 @@ private[spark] class ExecutorPodFactory( .endResources() .addAllToEnv(executorEnv.asJava) .withPorts(requiredPorts.asJava) + .addToArgs("executor") .build() val executorPod = new PodBuilder() diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala index f193b1f4d3664..65274c6f50e01 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala @@ -34,8 +34,7 @@ class DriverConfigOrchestratorSuite extends SparkFunSuite { private val SECRET_MOUNT_PATH = "/etc/secrets/driver" test("Base submission steps with a main app resource.") { - val sparkConf = new SparkConf(false) - .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) + val sparkConf = new SparkConf(false).set(CONTAINER_IMAGE, DRIVER_IMAGE) val mainAppResource = JavaMainAppResource("local:///var/apps/jars/main.jar") val orchestrator = new DriverConfigOrchestrator( APP_ID, @@ -55,8 +54,7 @@ class DriverConfigOrchestratorSuite extends SparkFunSuite { } test("Base submission steps without a main app resource.") { - val sparkConf = new SparkConf(false) - .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) + val sparkConf = new SparkConf(false).set(CONTAINER_IMAGE, DRIVER_IMAGE) val orchestrator = new DriverConfigOrchestrator( APP_ID, LAUNCH_TIME, @@ -75,8 +73,8 @@ class DriverConfigOrchestratorSuite extends SparkFunSuite { test("Submission steps with an init-container.") { val sparkConf = new SparkConf(false) - .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) - .set(INIT_CONTAINER_IMAGE, IC_IMAGE) + .set(CONTAINER_IMAGE, DRIVER_IMAGE) + .set(INIT_CONTAINER_IMAGE.key, IC_IMAGE) .set("spark.jars", "hdfs://localhost:9000/var/apps/jars/jar1.jar") val mainAppResource = JavaMainAppResource("local:///var/apps/jars/main.jar") val orchestrator = new DriverConfigOrchestrator( @@ -98,7 +96,7 @@ class DriverConfigOrchestratorSuite extends SparkFunSuite { test("Submission steps with driver secrets to mount") 
{ val sparkConf = new SparkConf(false) - .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) + .set(CONTAINER_IMAGE, DRIVER_IMAGE) .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_FOO", SECRET_MOUNT_PATH) .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_BAR", SECRET_MOUNT_PATH) val mainAppResource = JavaMainAppResource("local:///var/apps/jars/main.jar") diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala index 8ee629ac8ddc1..b136f2c02ffba 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala @@ -47,7 +47,7 @@ class BasicDriverConfigurationStepSuite extends SparkFunSuite { .set(KUBERNETES_DRIVER_LIMIT_CORES, "4") .set(org.apache.spark.internal.config.DRIVER_MEMORY.key, "256M") .set(org.apache.spark.internal.config.DRIVER_MEMORY_OVERHEAD, 200L) - .set(DRIVER_CONTAINER_IMAGE, "spark-driver:latest") + .set(CONTAINER_IMAGE, "spark-driver:latest") .set(s"$KUBERNETES_DRIVER_ANNOTATION_PREFIX$CUSTOM_ANNOTATION_KEY", CUSTOM_ANNOTATION_VALUE) .set(s"$KUBERNETES_DRIVER_ENV_KEY$DRIVER_CUSTOM_ENV_KEY1", "customDriverEnv1") .set(s"$KUBERNETES_DRIVER_ENV_KEY$DRIVER_CUSTOM_ENV_KEY2", "customDriverEnv2") @@ -79,7 +79,7 @@ class BasicDriverConfigurationStepSuite extends SparkFunSuite { .asScala .map(env => (env.getName, env.getValue)) .toMap - assert(envs(ENV_SUBMIT_EXTRA_CLASSPATH) === "/opt/spark/spark-examples.jar") + assert(envs(ENV_CLASSPATH) === "/opt/spark/spark-examples.jar") assert(envs(ENV_DRIVER_MEMORY) === "256M") assert(envs(ENV_DRIVER_MAIN_CLASS) === MAIN_CLASS) assert(envs(ENV_DRIVER_ARGS) === "arg1 arg2 \"arg 3\"") diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala index 20f2e5bc15df3..09b42e4484d86 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala @@ -40,7 +40,7 @@ class InitContainerConfigOrchestratorSuite extends SparkFunSuite { test("including basic configuration step") { val sparkConf = new SparkConf(true) - .set(INIT_CONTAINER_IMAGE, DOCKER_IMAGE) + .set(CONTAINER_IMAGE, DOCKER_IMAGE) .set(s"$KUBERNETES_DRIVER_LABEL_PREFIX$CUSTOM_LABEL_KEY", CUSTOM_LABEL_VALUE) val orchestrator = new InitContainerConfigOrchestrator( @@ -59,7 +59,7 @@ class InitContainerConfigOrchestratorSuite extends SparkFunSuite { test("including step to mount user-specified secrets") { val sparkConf = new SparkConf(false) - .set(INIT_CONTAINER_IMAGE, DOCKER_IMAGE) + .set(CONTAINER_IMAGE, DOCKER_IMAGE) .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_FOO", SECRET_MOUNT_PATH) .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_BAR", SECRET_MOUNT_PATH) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala 
b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala index 7cfbe54c95390..a3c615be031d2 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala @@ -54,7 +54,7 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef baseConf = new SparkConf() .set(KUBERNETES_DRIVER_POD_NAME, driverPodName) .set(KUBERNETES_EXECUTOR_POD_NAME_PREFIX, executorPrefix) - .set(EXECUTOR_CONTAINER_IMAGE, executorImage) + .set(CONTAINER_IMAGE, executorImage) } test("basic executor pod has reasonable defaults") { @@ -107,7 +107,7 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef checkEnv(executor, Map("SPARK_JAVA_OPT_0" -> "foo=bar", - "SPARK_EXECUTOR_EXTRA_CLASSPATH" -> "bar=baz", + ENV_CLASSPATH -> "bar=baz", "qux" -> "quux")) checkOwnerReferences(executor, driverPodUid) } diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile deleted file mode 100644 index 45fbcd9cd0deb..0000000000000 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile +++ /dev/null @@ -1,35 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -FROM spark-base - -# Before building the docker image, first build and make a Spark distribution following -# the instructions in http://spark.apache.org/docs/latest/building-spark.html. -# If this docker file is being used in the context of building your images from a Spark -# distribution, the docker build command should be invoked from the top level directory -# of the Spark distribution. E.g.: -# docker build -t spark-driver:latest -f kubernetes/dockerfiles/driver/Dockerfile . - -COPY examples /opt/spark/examples - -CMD SPARK_CLASSPATH="${SPARK_HOME}/jars/*" && \ - env | grep SPARK_JAVA_OPT_ | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt && \ - readarray -t SPARK_DRIVER_JAVA_OPTS < /tmp/java_opts.txt && \ - if ! [ -z ${SPARK_MOUNTED_CLASSPATH+x} ]; then SPARK_CLASSPATH="$SPARK_MOUNTED_CLASSPATH:$SPARK_CLASSPATH"; fi && \ - if ! [ -z ${SPARK_SUBMIT_EXTRA_CLASSPATH+x} ]; then SPARK_CLASSPATH="$SPARK_SUBMIT_EXTRA_CLASSPATH:$SPARK_CLASSPATH"; fi && \ - if ! [ -z ${SPARK_MOUNTED_FILES_DIR+x} ]; then cp -R "$SPARK_MOUNTED_FILES_DIR/." 
.; fi && \ - ${JAVA_HOME}/bin/java "${SPARK_DRIVER_JAVA_OPTS[@]}" -cp "$SPARK_CLASSPATH" -Xms$SPARK_DRIVER_MEMORY -Xmx$SPARK_DRIVER_MEMORY -Dspark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS $SPARK_DRIVER_CLASS $SPARK_DRIVER_ARGS diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile deleted file mode 100644 index 0f806cf7e148e..0000000000000 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile +++ /dev/null @@ -1,35 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -FROM spark-base - -# Before building the docker image, first build and make a Spark distribution following -# the instructions in http://spark.apache.org/docs/latest/building-spark.html. -# If this docker file is being used in the context of building your images from a Spark -# distribution, the docker build command should be invoked from the top level directory -# of the Spark distribution. E.g.: -# docker build -t spark-executor:latest -f kubernetes/dockerfiles/executor/Dockerfile . - -COPY examples /opt/spark/examples - -CMD SPARK_CLASSPATH="${SPARK_HOME}/jars/*" && \ - env | grep SPARK_JAVA_OPT_ | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt && \ - readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt && \ - if ! [ -z ${SPARK_MOUNTED_CLASSPATH}+x} ]; then SPARK_CLASSPATH="$SPARK_MOUNTED_CLASSPATH:$SPARK_CLASSPATH"; fi && \ - if ! [ -z ${SPARK_EXECUTOR_EXTRA_CLASSPATH+x} ]; then SPARK_CLASSPATH="$SPARK_EXECUTOR_EXTRA_CLASSPATH:$SPARK_CLASSPATH"; fi && \ - if ! [ -z ${SPARK_MOUNTED_FILES_DIR+x} ]; then cp -R "$SPARK_MOUNTED_FILES_DIR/." .; fi && \ - ${JAVA_HOME}/bin/java "${SPARK_EXECUTOR_JAVA_OPTS[@]}" -Xms$SPARK_EXECUTOR_MEMORY -Xmx$SPARK_EXECUTOR_MEMORY -cp "$SPARK_CLASSPATH" org.apache.spark.executor.CoarseGrainedExecutorBackend --driver-url $SPARK_DRIVER_URL --executor-id $SPARK_EXECUTOR_ID --cores $SPARK_EXECUTOR_CORES --app-id $SPARK_APPLICATION_ID --hostname $SPARK_EXECUTOR_POD_IP diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile deleted file mode 100644 index 047056ab2633b..0000000000000 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile +++ /dev/null @@ -1,24 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -FROM spark-base - -# If this docker file is being used in the context of building your images from a Spark distribution, the docker build -# command should be invoked from the top level directory of the Spark distribution. E.g.: -# docker build -t spark-init:latest -f kubernetes/dockerfiles/init-container/Dockerfile . - -ENTRYPOINT [ "/opt/entrypoint.sh", "/opt/spark/bin/spark-class", "org.apache.spark.deploy.k8s.SparkPodInitContainer" ] diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh deleted file mode 100755 index 82559889f4beb..0000000000000 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# echo commands to the terminal output -set -ex - -# Check whether there is a passwd entry for the container UID -myuid=$(id -u) -mygid=$(id -g) -uidentry=$(getent passwd $myuid) - -# If there is no passwd entry for the container UID, attempt to create one -if [ -z "$uidentry" ] ; then - if [ -w /etc/passwd ] ; then - echo "$myuid:x:$myuid:$mygid:anonymous uid:$SPARK_HOME:/bin/false" >> /etc/passwd - else - echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" - fi -fi - -# Execute the container CMD under tini for better hygiene -/sbin/tini -s -- "$@" diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile similarity index 87% rename from resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile rename to resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile index da1d6b9e161cc..491b7cf692478 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile @@ -17,15 +17,15 @@ FROM openjdk:8-alpine -ARG spark_jars -ARG img_path +ARG spark_jars=jars +ARG img_path=kubernetes/dockerfiles # Before building the docker image, first build and make a Spark distribution following # the instructions in http://spark.apache.org/docs/latest/building-spark.html. 
# If this docker file is being used in the context of building your images from a Spark # distribution, the docker build command should be invoked from the top level directory # of the Spark distribution. E.g.: -# docker build -t spark-base:latest -f kubernetes/dockerfiles/spark-base/Dockerfile . +# docker build -t spark:latest -f kubernetes/dockerfiles/spark/Dockerfile . RUN set -ex && \ apk upgrade --no-cache && \ @@ -41,7 +41,9 @@ COPY ${spark_jars} /opt/spark/jars COPY bin /opt/spark/bin COPY sbin /opt/spark/sbin COPY conf /opt/spark/conf -COPY ${img_path}/spark-base/entrypoint.sh /opt/ +COPY ${img_path}/spark/entrypoint.sh /opt/ +COPY examples /opt/spark/examples +COPY data /opt/spark/data ENV SPARK_HOME /opt/spark diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh new file mode 100755 index 0000000000000..0c28c75857871 --- /dev/null +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# echo commands to the terminal output +set -ex + +# Check whether there is a passwd entry for the container UID +myuid=$(id -u) +mygid=$(id -g) +uidentry=$(getent passwd $myuid) + +# If there is no passwd entry for the container UID, attempt to create one +if [ -z "$uidentry" ] ; then + if [ -w /etc/passwd ] ; then + echo "$myuid:x:$myuid:$mygid:anonymous uid:$SPARK_HOME:/bin/false" >> /etc/passwd + else + echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" + fi +fi + +SPARK_K8S_CMD="$1" +if [ -z "$SPARK_K8S_CMD" ]; then + echo "No command to execute has been provided." 1>&2 + exit 1 +fi +shift 1 + +SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" +env | grep SPARK_JAVA_OPT_ | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt +readarray -t SPARK_DRIVER_JAVA_OPTS < /tmp/java_opts.txt +if [ -n "$SPARK_MOUNTED_CLASSPATH" ]; then + SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_MOUNTED_CLASSPATH" +fi +if [ -n "$SPARK_MOUNTED_FILES_DIR" ]; then + cp -R "$SPARK_MOUNTED_FILES_DIR/." . 
+fi + +case "$SPARK_K8S_CMD" in + driver) + CMD=( + ${JAVA_HOME}/bin/java + "${SPARK_DRIVER_JAVA_OPTS[@]}" + -cp "$SPARK_CLASSPATH" + -Xms$SPARK_DRIVER_MEMORY + -Xmx$SPARK_DRIVER_MEMORY + -Dspark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS + $SPARK_DRIVER_CLASS + $SPARK_DRIVER_ARGS + ) + ;; + + executor) + CMD=( + ${JAVA_HOME}/bin/java + "${SPARK_EXECUTOR_JAVA_OPTS[@]}" + -Xms$SPARK_EXECUTOR_MEMORY + -Xmx$SPARK_EXECUTOR_MEMORY + -cp "$SPARK_CLASSPATH" + org.apache.spark.executor.CoarseGrainedExecutorBackend + --driver-url $SPARK_DRIVER_URL + --executor-id $SPARK_EXECUTOR_ID + --cores $SPARK_EXECUTOR_CORES + --app-id $SPARK_APPLICATION_ID + --hostname $SPARK_EXECUTOR_POD_IP + ) + ;; + + init) + CMD=( + "$SPARK_HOME/bin/spark-class" + "org.apache.spark.deploy.k8s.SparkPodInitContainer" + "$@" + ) + ;; + + *) + echo "Unknown command: $SPARK_K8S_CMD" 1>&2 + exit 1 +esac + +# Execute the container CMD under tini for better hygiene +exec /sbin/tini -s -- "${CMD[@]}" From f891ee3249e04576dd579cbab6f8f1632550e6bd Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Thu, 11 Jan 2018 10:52:12 -0800 Subject: [PATCH 069/566] [SPARK-22908] Add kafka source and sink for continuous processing. ## What changes were proposed in this pull request? Add kafka source and sink for continuous processing. This involves two small changes to the execution engine: * Bring data reader close() into the normal data reader thread to avoid thread safety issues. * Fix up the semantics of the RECONFIGURING StreamExecution state. State updates are now atomic, and we don't have to deal with swallowing an exception. ## How was this patch tested? new unit tests Author: Jose Torres Closes #20096 from jose-torres/continuous-kafka. (cherry picked from commit 6f7aaed805070d29dcba32e04ca7a1f581fa54b9) Signed-off-by: Tathagata Das --- .../sql/kafka010/KafkaContinuousReader.scala | 232 +++++++++ .../sql/kafka010/KafkaContinuousWriter.scala | 119 +++++ .../sql/kafka010/KafkaOffsetReader.scala | 21 +- .../spark/sql/kafka010/KafkaSource.scala | 17 +- .../sql/kafka010/KafkaSourceOffset.scala | 7 +- .../sql/kafka010/KafkaSourceProvider.scala | 105 +++- .../spark/sql/kafka010/KafkaWriteTask.scala | 71 ++- .../spark/sql/kafka010/KafkaWriter.scala | 5 +- .../kafka010/KafkaContinuousSinkSuite.scala | 474 ++++++++++++++++++ .../kafka010/KafkaContinuousSourceSuite.scala | 96 ++++ .../sql/kafka010/KafkaContinuousTest.scala | 64 +++ .../spark/sql/kafka010/KafkaSourceSuite.scala | 470 +++++++++-------- .../apache/spark/sql/DataFrameReader.scala | 32 +- .../apache/spark/sql/DataFrameWriter.scala | 25 +- .../datasources/v2/WriteToDataSourceV2.scala | 8 +- .../execution/streaming/StreamExecution.scala | 15 +- .../ContinuousDataSourceRDDIter.scala | 3 +- .../continuous/ContinuousExecution.scala | 67 ++- .../continuous/EpochCoordinator.scala | 21 +- .../sql/streaming/DataStreamWriter.scala | 26 +- .../spark/sql/streaming/StreamTest.scala | 36 +- 21 files changed, 1531 insertions(+), 383 deletions(-) create mode 100644 external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala create mode 100644 external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala create mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala create mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala create mode 100644 
external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala new file mode 100644 index 0000000000000..928379544758c --- /dev/null +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala @@ -0,0 +1,232 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import java.{util => ju} + +import org.apache.kafka.clients.consumer.ConsumerRecord +import org.apache.kafka.common.TopicPartition + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter} +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.kafka010.KafkaSource.{INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE, INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE} +import org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousDataReader, ContinuousReader, Offset, PartitionOffset} +import org.apache.spark.sql.types.StructType +import org.apache.spark.unsafe.types.UTF8String + +/** + * A [[ContinuousReader]] for data from kafka. + * + * @param offsetReader a reader used to get kafka offsets. Note that the actual data will be + * read by per-task consumers generated later. + * @param kafkaParams String params for per-task Kafka consumers. + * @param sourceOptions The [[org.apache.spark.sql.sources.v2.DataSourceV2Options]] params which + * are not Kafka consumer params. + * @param metadataPath Path to a directory this reader can use for writing metadata. + * @param initialOffsets The Kafka offsets to start reading data at. + * @param failOnDataLoss Flag indicating whether reading should fail in data loss + * scenarios, where some offsets after the specified initial ones can't be + * properly read. + */ +class KafkaContinuousReader( + offsetReader: KafkaOffsetReader, + kafkaParams: ju.Map[String, Object], + sourceOptions: Map[String, String], + metadataPath: String, + initialOffsets: KafkaOffsetRangeLimit, + failOnDataLoss: Boolean) + extends ContinuousReader with SupportsScanUnsafeRow with Logging { + + private lazy val session = SparkSession.getActiveSession.get + private lazy val sc = session.sparkContext + + // Initialized when creating read tasks. If this diverges from the partitions at the latest + // offsets, we need to reconfigure. + // Exposed outside this object only for unit tests. 
+ private[sql] var knownPartitions: Set[TopicPartition] = _ + + override def readSchema: StructType = KafkaOffsetReader.kafkaSchema + + private var offset: Offset = _ + override def setOffset(start: ju.Optional[Offset]): Unit = { + offset = start.orElse { + val offsets = initialOffsets match { + case EarliestOffsetRangeLimit => KafkaSourceOffset(offsetReader.fetchEarliestOffsets()) + case LatestOffsetRangeLimit => KafkaSourceOffset(offsetReader.fetchLatestOffsets()) + case SpecificOffsetRangeLimit(p) => offsetReader.fetchSpecificOffsets(p, reportDataLoss) + } + logInfo(s"Initial offsets: $offsets") + offsets + } + } + + override def getStartOffset(): Offset = offset + + override def deserializeOffset(json: String): Offset = { + KafkaSourceOffset(JsonUtils.partitionOffsets(json)) + } + + override def createUnsafeRowReadTasks(): ju.List[ReadTask[UnsafeRow]] = { + import scala.collection.JavaConverters._ + + val oldStartPartitionOffsets = KafkaSourceOffset.getPartitionOffsets(offset) + + val currentPartitionSet = offsetReader.fetchEarliestOffsets().keySet + val newPartitions = currentPartitionSet.diff(oldStartPartitionOffsets.keySet) + val newPartitionOffsets = offsetReader.fetchEarliestOffsets(newPartitions.toSeq) + + val deletedPartitions = oldStartPartitionOffsets.keySet.diff(currentPartitionSet) + if (deletedPartitions.nonEmpty) { + reportDataLoss(s"Some partitions were deleted: $deletedPartitions") + } + + val startOffsets = newPartitionOffsets ++ + oldStartPartitionOffsets.filterKeys(!deletedPartitions.contains(_)) + knownPartitions = startOffsets.keySet + + startOffsets.toSeq.map { + case (topicPartition, start) => + KafkaContinuousReadTask( + topicPartition, start, kafkaParams, failOnDataLoss) + .asInstanceOf[ReadTask[UnsafeRow]] + }.asJava + } + + /** Stop this source and free any resources it has allocated. */ + def stop(): Unit = synchronized { + offsetReader.close() + } + + override def commit(end: Offset): Unit = {} + + override def mergeOffsets(offsets: Array[PartitionOffset]): Offset = { + val mergedMap = offsets.map { + case KafkaSourcePartitionOffset(p, o) => Map(p -> o) + }.reduce(_ ++ _) + KafkaSourceOffset(mergedMap) + } + + override def needsReconfiguration(): Boolean = { + knownPartitions != null && offsetReader.fetchLatestOffsets().keySet != knownPartitions + } + + override def toString(): String = s"KafkaSource[$offsetReader]" + + /** + * If `failOnDataLoss` is true, this method will throw an `IllegalStateException`. + * Otherwise, just log a warning. + */ + private def reportDataLoss(message: String): Unit = { + if (failOnDataLoss) { + throw new IllegalStateException(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE") + } else { + logWarning(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE") + } + } +} + +/** + * A read task for continuous Kafka processing. This will be serialized and transformed into a + * full reader on executors. + * + * @param topicPartition The (topic, partition) pair this task is responsible for. + * @param startOffset The offset to start reading from within the partition. + * @param kafkaParams Kafka consumer params to use. + * @param failOnDataLoss Flag indicating whether data reader should fail if some offsets + * are skipped. 
+ */ +case class KafkaContinuousReadTask( + topicPartition: TopicPartition, + startOffset: Long, + kafkaParams: ju.Map[String, Object], + failOnDataLoss: Boolean) extends ReadTask[UnsafeRow] { + override def createDataReader(): KafkaContinuousDataReader = { + new KafkaContinuousDataReader(topicPartition, startOffset, kafkaParams, failOnDataLoss) + } +} + +/** + * A per-task data reader for continuous Kafka processing. + * + * @param topicPartition The (topic, partition) pair this data reader is responsible for. + * @param startOffset The offset to start reading from within the partition. + * @param kafkaParams Kafka consumer params to use. + * @param failOnDataLoss Flag indicating whether data reader should fail if some offsets + * are skipped. + */ +class KafkaContinuousDataReader( + topicPartition: TopicPartition, + startOffset: Long, + kafkaParams: ju.Map[String, Object], + failOnDataLoss: Boolean) extends ContinuousDataReader[UnsafeRow] { + private val topic = topicPartition.topic + private val kafkaPartition = topicPartition.partition + private val consumer = CachedKafkaConsumer.createUncached(topic, kafkaPartition, kafkaParams) + + private val sharedRow = new UnsafeRow(7) + private val bufferHolder = new BufferHolder(sharedRow) + private val rowWriter = new UnsafeRowWriter(bufferHolder, 7) + + private var nextKafkaOffset = startOffset + private var currentRecord: ConsumerRecord[Array[Byte], Array[Byte]] = _ + + override def next(): Boolean = { + var r: ConsumerRecord[Array[Byte], Array[Byte]] = null + while (r == null) { + r = consumer.get( + nextKafkaOffset, + untilOffset = Long.MaxValue, + pollTimeoutMs = Long.MaxValue, + failOnDataLoss) + } + nextKafkaOffset = r.offset + 1 + currentRecord = r + true + } + + override def get(): UnsafeRow = { + bufferHolder.reset() + + if (currentRecord.key == null) { + rowWriter.setNullAt(0) + } else { + rowWriter.write(0, currentRecord.key) + } + rowWriter.write(1, currentRecord.value) + rowWriter.write(2, UTF8String.fromString(currentRecord.topic)) + rowWriter.write(3, currentRecord.partition) + rowWriter.write(4, currentRecord.offset) + rowWriter.write(5, + DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(currentRecord.timestamp))) + rowWriter.write(6, currentRecord.timestampType.id) + sharedRow.setTotalSize(bufferHolder.totalSize) + sharedRow + } + + override def getOffset(): KafkaSourcePartitionOffset = { + KafkaSourcePartitionOffset(topicPartition, nextKafkaOffset) + } + + override def close(): Unit = { + consumer.close() + } +} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala new file mode 100644 index 0000000000000..9843f469c5b25 --- /dev/null +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import org.apache.kafka.clients.producer.{Callback, ProducerRecord, RecordMetadata} +import scala.collection.JavaConverters._ + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal, UnsafeProjection} +import org.apache.spark.sql.kafka010.KafkaSourceProvider.{kafkaParamsForProducer, TOPIC_OPTION_KEY} +import org.apache.spark.sql.kafka010.KafkaWriter.validateQuery +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter +import org.apache.spark.sql.sources.v2.writer._ +import org.apache.spark.sql.streaming.OutputMode +import org.apache.spark.sql.types.{BinaryType, StringType, StructType} + +/** + * Dummy commit message. The DataSourceV2 framework requires a commit message implementation but we + * don't need to really send one. + */ +case object KafkaWriterCommitMessage extends WriterCommitMessage + +/** + * A [[ContinuousWriter]] for Kafka writing. Responsible for generating the writer factory. + * @param topic The topic this writer is responsible for. If None, topic will be inferred from + * a `topic` field in the incoming data. + * @param producerParams Parameters for Kafka producers in each task. + * @param schema The schema of the input data. + */ +class KafkaContinuousWriter( + topic: Option[String], producerParams: Map[String, String], schema: StructType) + extends ContinuousWriter with SupportsWriteInternalRow { + + validateQuery(schema.toAttributes, producerParams.toMap[String, Object].asJava, topic) + + override def createInternalRowWriterFactory(): KafkaContinuousWriterFactory = + KafkaContinuousWriterFactory(topic, producerParams, schema) + + override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} + override def abort(messages: Array[WriterCommitMessage]): Unit = {} +} + +/** + * A [[DataWriterFactory]] for Kafka writing. Will be serialized and sent to executors to generate + * the per-task data writers. + * @param topic The topic that should be written to. If None, topic will be inferred from + * a `topic` field in the incoming data. + * @param producerParams Parameters for Kafka producers in each task. + * @param schema The schema of the input data. + */ +case class KafkaContinuousWriterFactory( + topic: Option[String], producerParams: Map[String, String], schema: StructType) + extends DataWriterFactory[InternalRow] { + + override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[InternalRow] = { + new KafkaContinuousDataWriter(topic, producerParams, schema.toAttributes) + } +} + +/** + * A [[DataWriter]] for Kafka writing. One data writer will be created in each partition to + * process incoming rows. + * + * @param targetTopic The topic that this data writer is targeting. If None, topic will be inferred + * from a `topic` field in the incoming data. + * @param producerParams Parameters to use for the Kafka producer. + * @param inputSchema The attributes in the input data. 
+ */ +class KafkaContinuousDataWriter( + targetTopic: Option[String], producerParams: Map[String, String], inputSchema: Seq[Attribute]) + extends KafkaRowWriter(inputSchema, targetTopic) with DataWriter[InternalRow] { + import scala.collection.JavaConverters._ + + private lazy val producer = CachedKafkaProducer.getOrCreate( + new java.util.HashMap[String, Object](producerParams.asJava)) + + def write(row: InternalRow): Unit = { + checkForErrors() + sendRow(row, producer) + } + + def commit(): WriterCommitMessage = { + // Send is asynchronous, but we can't commit until all rows are actually in Kafka. + // This requires flushing and then checking that no callbacks produced errors. + // We also check for errors before to fail as soon as possible - the check is cheap. + checkForErrors() + producer.flush() + checkForErrors() + KafkaWriterCommitMessage + } + + def abort(): Unit = {} + + def close(): Unit = { + checkForErrors() + if (producer != null) { + producer.flush() + checkForErrors() + CachedKafkaProducer.close(new java.util.HashMap[String, Object](producerParams.asJava)) + } + } +} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala index 3e65949a6fd1b..551641cfdbca8 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala @@ -117,10 +117,14 @@ private[kafka010] class KafkaOffsetReader( * Resolves the specific offsets based on Kafka seek positions. * This method resolves offset value -1 to the latest and -2 to the * earliest Kafka seek position. + * + * @param partitionOffsets the specific offsets to resolve + * @param reportDataLoss callback to either report or log data loss depending on setting */ def fetchSpecificOffsets( - partitionOffsets: Map[TopicPartition, Long]): Map[TopicPartition, Long] = - runUninterruptibly { + partitionOffsets: Map[TopicPartition, Long], + reportDataLoss: String => Unit): KafkaSourceOffset = { + val fetched = runUninterruptibly { withRetriesWithoutInterrupt { // Poll to get the latest assigned partitions consumer.poll(0) @@ -145,6 +149,19 @@ private[kafka010] class KafkaOffsetReader( } } + partitionOffsets.foreach { + case (tp, off) if off != KafkaOffsetRangeLimit.LATEST && + off != KafkaOffsetRangeLimit.EARLIEST => + if (fetched(tp) != off) { + reportDataLoss( + s"startingOffsets for $tp was $off but consumer reset to ${fetched(tp)}") + } + case _ => + // no real way to check that beginning or end is reasonable + } + KafkaSourceOffset(fetched) + } + /** * Fetch the earliest offsets for the topic partitions that are indicated * in the [[ConsumerStrategy]]. 
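For context, a minimal caller sketch (not part of the patch) of the reworked fetchSpecificOffsets above; the reader value, topic name and offsets are illustrative assumptions, and the callback stands in for the reportDataLoss handler that replaces the fetchAndVerify helper removed from KafkaSource below. Per the scaladoc, -1L resolves to the latest and -2L to the earliest seek position, and only concrete offsets are verified against what the consumer actually fetched:

    // assumes: import org.apache.kafka.common.TopicPartition and an already-constructed KafkaOffsetReader
    val requested = Map(
      new TopicPartition("events", 0) -> 42L,  // concrete offset: compared with the fetched position
      new TopicPartition("events", 1) -> -2L)  // -2 = earliest: accepted as-is, no verification
    val resolved: KafkaSourceOffset =
      kafkaOffsetReader.fetchSpecificOffsets(requested, msg => logWarning(msg))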
diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index e9cff04ba5f2e..27da76068a66f 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -130,7 +130,7 @@ private[kafka010] class KafkaSource( val offsets = startingOffsets match { case EarliestOffsetRangeLimit => KafkaSourceOffset(kafkaReader.fetchEarliestOffsets()) case LatestOffsetRangeLimit => KafkaSourceOffset(kafkaReader.fetchLatestOffsets()) - case SpecificOffsetRangeLimit(p) => fetchAndVerify(p) + case SpecificOffsetRangeLimit(p) => kafkaReader.fetchSpecificOffsets(p, reportDataLoss) } metadataLog.add(0, offsets) logInfo(s"Initial offsets: $offsets") @@ -138,21 +138,6 @@ private[kafka010] class KafkaSource( }.partitionToOffsets } - private def fetchAndVerify(specificOffsets: Map[TopicPartition, Long]) = { - val result = kafkaReader.fetchSpecificOffsets(specificOffsets) - specificOffsets.foreach { - case (tp, off) if off != KafkaOffsetRangeLimit.LATEST && - off != KafkaOffsetRangeLimit.EARLIEST => - if (result(tp) != off) { - reportDataLoss( - s"startingOffsets for $tp was $off but consumer reset to ${result(tp)}") - } - case _ => - // no real way to check that beginning or end is reasonable - } - KafkaSourceOffset(result) - } - private var currentPartitionOffsets: Option[Map[TopicPartition, Long]] = None override def schema: StructType = KafkaOffsetReader.kafkaSchema diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala index b5da415b3097e..c82154cfbad7f 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala @@ -20,17 +20,22 @@ package org.apache.spark.sql.kafka010 import org.apache.kafka.common.TopicPartition import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset} +import org.apache.spark.sql.sources.v2.streaming.reader.{Offset => OffsetV2, PartitionOffset} /** * An [[Offset]] for the [[KafkaSource]]. This one tracks all partitions of subscribed topics and * their offsets. 
*/ private[kafka010] -case class KafkaSourceOffset(partitionToOffsets: Map[TopicPartition, Long]) extends Offset { +case class KafkaSourceOffset(partitionToOffsets: Map[TopicPartition, Long]) extends OffsetV2 { override val json = JsonUtils.partitionOffsets(partitionToOffsets) } +private[kafka010] +case class KafkaSourcePartitionOffset(topicPartition: TopicPartition, partitionOffset: Long) + extends PartitionOffset + /** Companion object of the [[KafkaSourceOffset]] */ private[kafka010] object KafkaSourceOffset { diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala index 3cb4d8cad12cc..3914370a96595 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.kafka010 import java.{util => ju} -import java.util.{Locale, UUID} +import java.util.{Locale, Optional, UUID} import scala.collection.JavaConverters._ @@ -27,9 +27,12 @@ import org.apache.kafka.clients.producer.ProducerConfig import org.apache.kafka.common.serialization.{ByteArrayDeserializer, ByteArraySerializer} import org.apache.spark.internal.Logging -import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} -import org.apache.spark.sql.execution.streaming.{Sink, Source} +import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SparkSession, SQLContext} +import org.apache.spark.sql.execution.streaming.{Offset, Sink, Source} import org.apache.spark.sql.sources._ +import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, ContinuousWriteSupport} +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType @@ -43,6 +46,8 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister with StreamSinkProvider with RelationProvider with CreatableRelationProvider + with ContinuousWriteSupport + with ContinuousReadSupport with Logging { import KafkaSourceProvider._ @@ -101,6 +106,43 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister failOnDataLoss(caseInsensitiveParams)) } + override def createContinuousReader( + schema: Optional[StructType], + metadataPath: String, + options: DataSourceV2Options): KafkaContinuousReader = { + val parameters = options.asMap().asScala.toMap + validateStreamOptions(parameters) + // Each running query should use its own group id. Otherwise, the query may be only assigned + // partial data since Kafka will assign partitions to multiple consumers having the same group + // id. Hence, we should generate a unique id for each query. 
+ val uniqueGroupId = s"spark-kafka-source-${UUID.randomUUID}-${metadataPath.hashCode}" + + val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } + val specifiedKafkaParams = + parameters + .keySet + .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) + .map { k => k.drop(6).toString -> parameters(k) } + .toMap + + val startingStreamOffsets = KafkaSourceProvider.getKafkaOffsetRangeLimit(caseInsensitiveParams, + STARTING_OFFSETS_OPTION_KEY, LatestOffsetRangeLimit) + + val kafkaOffsetReader = new KafkaOffsetReader( + strategy(caseInsensitiveParams), + kafkaParamsForDriver(specifiedKafkaParams), + parameters, + driverGroupIdPrefix = s"$uniqueGroupId-driver") + + new KafkaContinuousReader( + kafkaOffsetReader, + kafkaParamsForExecutors(specifiedKafkaParams, uniqueGroupId), + parameters, + metadataPath, + startingStreamOffsets, + failOnDataLoss(caseInsensitiveParams)) + } + /** * Returns a new base relation with the given parameters. * @@ -181,26 +223,22 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister } } - private def kafkaParamsForProducer(parameters: Map[String, String]): Map[String, String] = { - val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } - if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}")) { - throw new IllegalArgumentException( - s"Kafka option '${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}' is not supported as keys " - + "are serialized with ByteArraySerializer.") - } + override def createContinuousWriter( + queryId: String, + schema: StructType, + mode: OutputMode, + options: DataSourceV2Options): Optional[ContinuousWriter] = { + import scala.collection.JavaConverters._ - if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}")) - { - throw new IllegalArgumentException( - s"Kafka option '${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}' is not supported as " - + "value are serialized with ByteArraySerializer.") - } - parameters - .keySet - .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) - .map { k => k.drop(6).toString -> parameters(k) } - .toMap + (ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, - ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName) + val spark = SparkSession.getActiveSession.get + val topic = Option(options.get(TOPIC_OPTION_KEY).orElse(null)).map(_.trim) + // We convert the options argument from V2 -> Java map -> scala mutable -> scala immutable. 
+ val producerParams = kafkaParamsForProducer(options.asMap.asScala.toMap) + + KafkaWriter.validateQuery( + schema.toAttributes, new java.util.HashMap[String, Object](producerParams.asJava), topic) + + Optional.of(new KafkaContinuousWriter(topic, producerParams, schema)) } private def strategy(caseInsensitiveParams: Map[String, String]) = @@ -450,4 +488,27 @@ private[kafka010] object KafkaSourceProvider extends Logging { def build(): ju.Map[String, Object] = map } + + private[kafka010] def kafkaParamsForProducer( + parameters: Map[String, String]): Map[String, String] = { + val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } + if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}")) { + throw new IllegalArgumentException( + s"Kafka option '${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}' is not supported as keys " + + "are serialized with ByteArraySerializer.") + } + + if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}")) + { + throw new IllegalArgumentException( + s"Kafka option '${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}' is not supported as " + + "values are serialized with ByteArraySerializer.") + } + parameters + .keySet + .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) + .map { k => k.drop(6).toString -> parameters(k) } + .toMap + (ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, + ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName) + } } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala index 6fd333e2f43ba..baa60febf661d 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala @@ -33,10 +33,8 @@ import org.apache.spark.sql.types.{BinaryType, StringType} private[kafka010] class KafkaWriteTask( producerConfiguration: ju.Map[String, Object], inputSchema: Seq[Attribute], - topic: Option[String]) { + topic: Option[String]) extends KafkaRowWriter(inputSchema, topic) { // used to synchronize with Kafka callbacks - @volatile private var failedWrite: Exception = null - private val projection = createProjection private var producer: KafkaProducer[Array[Byte], Array[Byte]] = _ /** @@ -46,23 +44,7 @@ private[kafka010] class KafkaWriteTask( producer = CachedKafkaProducer.getOrCreate(producerConfiguration) while (iterator.hasNext && failedWrite == null) { val currentRow = iterator.next() - val projectedRow = projection(currentRow) - val topic = projectedRow.getUTF8String(0) - val key = projectedRow.getBinary(1) - val value = projectedRow.getBinary(2) - if (topic == null) { - throw new NullPointerException(s"null topic present in the data. 
Use the " + - s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a default topic.") - } - val record = new ProducerRecord[Array[Byte], Array[Byte]](topic.toString, key, value) - val callback = new Callback() { - override def onCompletion(recordMetadata: RecordMetadata, e: Exception): Unit = { - if (failedWrite == null && e != null) { - failedWrite = e - } - } - } - producer.send(record, callback) + sendRow(currentRow, producer) } } @@ -74,8 +56,49 @@ private[kafka010] class KafkaWriteTask( producer = null } } +} + +private[kafka010] abstract class KafkaRowWriter( + inputSchema: Seq[Attribute], topic: Option[String]) { + + // used to synchronize with Kafka callbacks + @volatile protected var failedWrite: Exception = _ + protected val projection = createProjection + + private val callback = new Callback() { + override def onCompletion(recordMetadata: RecordMetadata, e: Exception): Unit = { + if (failedWrite == null && e != null) { + failedWrite = e + } + } + } - private def createProjection: UnsafeProjection = { + /** + * Send the specified row to the producer, with a callback that will save any exception + * to failedWrite. Note that send is asynchronous; subclasses must flush() their producer before + * assuming the row is in Kafka. + */ + protected def sendRow( + row: InternalRow, producer: KafkaProducer[Array[Byte], Array[Byte]]): Unit = { + val projectedRow = projection(row) + val topic = projectedRow.getUTF8String(0) + val key = projectedRow.getBinary(1) + val value = projectedRow.getBinary(2) + if (topic == null) { + throw new NullPointerException(s"null topic present in the data. Use the " + + s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a default topic.") + } + val record = new ProducerRecord[Array[Byte], Array[Byte]](topic.toString, key, value) + producer.send(record, callback) + } + + protected def checkForErrors(): Unit = { + if (failedWrite != null) { + throw failedWrite + } + } + + private def createProjection = { val topicExpression = topic.map(Literal(_)).orElse { inputSchema.find(_.name == KafkaWriter.TOPIC_ATTRIBUTE_NAME) }.getOrElse { @@ -112,11 +135,5 @@ private[kafka010] class KafkaWriteTask( Seq(topicExpression, Cast(keyExpression, BinaryType), Cast(valueExpression, BinaryType)), inputSchema) } - - private def checkForErrors(): Unit = { - if (failedWrite != null) { - throw failedWrite - } - } } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala index 5e9ae35b3f008..15cd44812cb0c 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala @@ -43,10 +43,9 @@ private[kafka010] object KafkaWriter extends Logging { override def toString: String = "KafkaWriter" def validateQuery( - queryExecution: QueryExecution, + schema: Seq[Attribute], kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { - val schema = queryExecution.analyzed.output schema.find(_.name == TOPIC_ATTRIBUTE_NAME).getOrElse( if (topic.isEmpty) { throw new AnalysisException(s"topic option required when no " + @@ -84,7 +83,7 @@ private[kafka010] object KafkaWriter extends Logging { kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { val schema = queryExecution.analyzed.output - validateQuery(queryExecution, kafkaParameters, topic) + validateQuery(schema, 
kafkaParameters, topic) queryExecution.toRdd.foreachPartition { iter => val writeTask = new KafkaWriteTask(kafkaParameters, schema, topic) Utils.tryWithSafeFinally(block = writeTask.execute(iter))( diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala new file mode 100644 index 0000000000000..dfc97b1c38bb5 --- /dev/null +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala @@ -0,0 +1,474 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import java.util.Locale +import java.util.concurrent.atomic.AtomicInteger + +import org.apache.kafka.clients.producer.ProducerConfig +import org.apache.kafka.common.serialization.ByteArraySerializer +import org.scalatest.time.SpanSugar._ +import scala.collection.JavaConverters._ + +import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SaveMode} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, SpecificInternalRow, UnsafeProjection} +import org.apache.spark.sql.execution.streaming.MemoryStream +import org.apache.spark.sql.streaming._ +import org.apache.spark.sql.types.{BinaryType, DataType} +import org.apache.spark.util.Utils + +/** + * This is a temporary port of KafkaSinkSuite, since we do not yet have a V2 memory stream. + * Once we have one, this will be changed to a specialization of KafkaSinkSuite and we won't have + * to duplicate all the code. 
+ */ +class KafkaContinuousSinkSuite extends KafkaContinuousTest { + import testImplicits._ + + override val streamingTimeout = 30.seconds + + override def beforeAll(): Unit = { + super.beforeAll() + testUtils = new KafkaTestUtils( + withBrokerProps = Map("auto.create.topics.enable" -> "false")) + testUtils.setup() + } + + override def afterAll(): Unit = { + if (testUtils != null) { + testUtils.teardown() + testUtils = null + } + super.afterAll() + } + + test("streaming - write to kafka with topic field") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + + val topic = newTopic() + testUtils.createTopic(topic) + + val writer = createKafkaWriter( + input.toDF(), + withTopic = None, + withOutputMode = Some(OutputMode.Append))( + withSelectExpr = s"'$topic' as topic", "value") + + val reader = createKafkaReader(topic) + .selectExpr("CAST(key as STRING) key", "CAST(value as STRING) value") + .selectExpr("CAST(key as INT) key", "CAST(value as INT) value") + .as[(Int, Int)] + .map(_._2) + + try { + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + failAfter(streamingTimeout) { + writer.processAllAvailable() + } + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) + testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) + failAfter(streamingTimeout) { + writer.processAllAvailable() + } + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + } finally { + writer.stop() + } + } + + test("streaming - write w/o topic field, with topic option") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + + val topic = newTopic() + testUtils.createTopic(topic) + + val writer = createKafkaWriter( + input.toDF(), + withTopic = Some(topic), + withOutputMode = Some(OutputMode.Append()))() + + val reader = createKafkaReader(topic) + .selectExpr("CAST(key as STRING) key", "CAST(value as STRING) value") + .selectExpr("CAST(key as INT) key", "CAST(value as INT) value") + .as[(Int, Int)] + .map(_._2) + + try { + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + failAfter(streamingTimeout) { + writer.processAllAvailable() + } + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) + testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) + failAfter(streamingTimeout) { + writer.processAllAvailable() + } + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + } finally { + writer.stop() + } + } + + test("streaming - topic field and topic option") { + /* The purpose of this test is to ensure that the topic option + * overrides the topic field. We begin by writing some data that + * includes a topic field and value (e.g., 'foo') along with a topic + * option. 
Then when we read from the topic specified in the option + * we should see the data i.e., the data was written to the topic + * option, and not to the topic in the data e.g., foo + */ + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + + val topic = newTopic() + testUtils.createTopic(topic) + + val writer = createKafkaWriter( + input.toDF(), + withTopic = Some(topic), + withOutputMode = Some(OutputMode.Append()))( + withSelectExpr = "'foo' as topic", "CAST(value as STRING) value") + + val reader = createKafkaReader(topic) + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .selectExpr("CAST(key AS INT)", "CAST(value AS INT)") + .as[(Int, Int)] + .map(_._2) + + try { + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + failAfter(streamingTimeout) { + writer.processAllAvailable() + } + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) + testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) + failAfter(streamingTimeout) { + writer.processAllAvailable() + } + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + } finally { + writer.stop() + } + } + + test("null topic attribute") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + val topic = newTopic() + testUtils.createTopic(topic) + + /* No topic field or topic option */ + var writer: StreamingQuery = null + var ex: Exception = null + try { + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter(input.toDF())( + withSelectExpr = "CAST(null as STRING) as topic", "value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + writer.processAllAvailable() + } + } finally { + writer.stop() + } + assert(ex.getCause.getCause.getMessage + .toLowerCase(Locale.ROOT) + .contains("null topic present in the data.")) + } + + test("streaming - write data with bad schema") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + val topic = newTopic() + testUtils.createTopic(topic) + + /* No topic field or topic option */ + var writer: StreamingQuery = null + var ex: Exception = null + try { + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter(input.toDF())( + withSelectExpr = "value as key", "value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + writer.processAllAvailable() + } + } finally { + writer.stop() + } + assert(ex.getMessage + .toLowerCase(Locale.ROOT) + .contains("topic option required when no 'topic' attribute is present")) + + try { + /* No value field */ + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter(input.toDF())( + withSelectExpr = s"'$topic' as topic", "value as key" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + writer.processAllAvailable() + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "required attribute 'value' 
not found")) + } + + test("streaming - write data with valid schema but wrong types") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + .selectExpr("CAST(value as STRING) value") + val topic = newTopic() + testUtils.createTopic(topic) + + var writer: StreamingQuery = null + var ex: Exception = null + try { + /* topic field wrong type */ + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter(input.toDF())( + withSelectExpr = s"CAST('1' as INT) as topic", "value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + writer.processAllAvailable() + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains("topic type must be a string")) + + try { + /* value field wrong type */ + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter(input.toDF())( + withSelectExpr = s"'$topic' as topic", "CAST(value as INT) as value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + writer.processAllAvailable() + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "value attribute type must be a string or binarytype")) + + try { + ex = intercept[StreamingQueryException] { + /* key field wrong type */ + writer = createKafkaWriter(input.toDF())( + withSelectExpr = s"'$topic' as topic", "CAST(value as INT) as key", "value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + writer.processAllAvailable() + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "key attribute type must be a string or binarytype")) + } + + test("streaming - write to non-existing topic") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + val topic = newTopic() + + var writer: StreamingQuery = null + var ex: Exception = null + try { + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter(input.toDF(), withTopic = Some(topic))() + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + } + throw writer.exception.get + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains("job aborted")) + } + + test("streaming - exception on config serializer") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + testUtils.sendMessages(inputTopic, Array("0")) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .load() + var writer: StreamingQuery = null + var ex: Exception = null + try { + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter( + input.toDF(), + withOptions = Map("kafka.key.serializer" -> "foo"))() + writer.processAllAvailable() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "kafka option 'key.serializer' is not supported")) + } finally { + writer.stop() + } + + try { + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter( 
+ input.toDF(), + withOptions = Map("kafka.value.serializer" -> "foo"))() + writer.processAllAvailable() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "kafka option 'value.serializer' is not supported")) + } finally { + writer.stop() + } + } + + test("generic - write big data with small producer buffer") { + /* This test ensures that we understand the semantics of Kafka when + * is comes to blocking on a call to send when the send buffer is full. + * This test will configure the smallest possible producer buffer and + * indicate that we should block when it is full. Thus, no exception should + * be thrown in the case of a full buffer. + */ + val topic = newTopic() + testUtils.createTopic(topic, 1) + val options = new java.util.HashMap[String, String] + options.put("bootstrap.servers", testUtils.brokerAddress) + options.put("buffer.memory", "16384") // min buffer size + options.put("block.on.buffer.full", "true") + options.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[ByteArraySerializer].getName) + options.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[ByteArraySerializer].getName) + val inputSchema = Seq(AttributeReference("value", BinaryType)()) + val data = new Array[Byte](15000) // large value + val writeTask = new KafkaContinuousDataWriter(Some(topic), options.asScala.toMap, inputSchema) + try { + val fieldTypes: Array[DataType] = Array(BinaryType) + val converter = UnsafeProjection.create(fieldTypes) + val row = new SpecificInternalRow(fieldTypes) + row.update(0, data) + val iter = Seq.fill(1000)(converter.apply(row)).iterator + iter.foreach(writeTask.write(_)) + writeTask.commit() + } finally { + writeTask.close() + } + } + + private def createKafkaReader(topic: String): DataFrame = { + spark.read + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("startingOffsets", "earliest") + .option("endingOffsets", "latest") + .option("subscribe", topic) + .load() + } + + private def createKafkaWriter( + input: DataFrame, + withTopic: Option[String] = None, + withOutputMode: Option[OutputMode] = None, + withOptions: Map[String, String] = Map[String, String]()) + (withSelectExpr: String*): StreamingQuery = { + var stream: DataStreamWriter[Row] = null + val checkpointDir = Utils.createTempDir() + var df = input.toDF() + if (withSelectExpr.length > 0) { + df = df.selectExpr(withSelectExpr: _*) + } + stream = df.writeStream + .format("kafka") + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + // We need to reduce blocking time to efficiently test non-existent partition behavior. + .option("kafka.max.block.ms", "1000") + .trigger(Trigger.Continuous(1000)) + .queryName("kafkaStream") + withTopic.foreach(stream.option("topic", _)) + withOutputMode.foreach(stream.outputMode(_)) + withOptions.foreach(opt => stream.option(opt._1, opt._2)) + stream.start() + } +} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala new file mode 100644 index 0000000000000..b3dade414f625 --- /dev/null +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import java.util.Properties +import java.util.concurrent.atomic.AtomicInteger + +import org.scalatest.time.SpanSugar._ +import scala.collection.mutable +import scala.util.Random + +import org.apache.spark.SparkContext +import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row} +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.execution.streaming.StreamExecution +import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution +import org.apache.spark.sql.streaming.{StreamTest, Trigger} +import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession} + +// Run tests in KafkaSourceSuiteBase in continuous execution mode. +class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest + +class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest { + import testImplicits._ + + override val brokerProps = Map("auto.create.topics.enable" -> "false") + + test("subscribing topic by pattern with topic deletions") { + val topicPrefix = newTopic() + val topic = topicPrefix + "-seems" + val topic2 = topicPrefix + "-bad" + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("subscribePattern", s"$topicPrefix-.*") + .option("failOnDataLoss", "false") + + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped = kafka.map(kv => kv._2.toInt + 1) + + testStream(mapped)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + Execute { query => + testUtils.deleteTopic(topic) + testUtils.createTopic(topic2, partitions = 5) + eventually(timeout(streamingTimeout)) { + assert( + query.lastExecution.logical.collectFirst { + case DataSourceV2Relation(_, r: KafkaContinuousReader) => r + }.exists { r => + // Ensure the new topic is present and the old topic is gone. 
+ r.knownPartitions.exists(_.topic == topic2) + }, + s"query never reconfigured to new topic $topic2") + } + }, + AddKafkaData(Set(topic2), 4, 5, 6), + CheckAnswer(2, 3, 4, 5, 6, 7) + ) + } +} + +class KafkaContinuousSourceStressForDontFailOnDataLossSuite + extends KafkaSourceStressForDontFailOnDataLossSuite { + override protected def startStream(ds: Dataset[Int]) = { + ds.writeStream + .format("memory") + .queryName("memory") + .start() + } +} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala new file mode 100644 index 0000000000000..e713e6695d2bd --- /dev/null +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import org.apache.spark.SparkContext +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.execution.streaming.StreamExecution +import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution +import org.apache.spark.sql.streaming.Trigger +import org.apache.spark.sql.test.TestSparkSession + +// Trait to configure StreamTest for kafka continuous execution tests. +trait KafkaContinuousTest extends KafkaSourceTest { + override val defaultTrigger = Trigger.Continuous(1000) + override val defaultUseV2Sink = true + + // We need more than the default local[2] to be able to schedule all partitions simultaneously. + override protected def createSparkSession = new TestSparkSession( + new SparkContext( + "local[10]", + "continuous-stream-test-sql-context", + sparkConf.set("spark.sql.testkey", "true"))) + + // In addition to setting the partitions in Kafka, we have to wait until the query has + // reconfigured to the new count so the test framework can hook in properly. 
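+ // Concretely: add the partitions through testUtils, then poll lastExecution until the plan's + // KafkaContinuousReader reports knownPartitions.size == newCount.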
+ override protected def setTopicPartitions( + topic: String, newCount: Int, query: StreamExecution) = { + testUtils.addPartitions(topic, newCount) + eventually(timeout(streamingTimeout)) { + assert( + query.lastExecution.logical.collectFirst { + case DataSourceV2Relation(_, r: KafkaContinuousReader) => r + }.exists(_.knownPartitions.size == newCount), + s"query never reconfigured to $newCount partitions") + } + } + + test("ensure continuous stream is being used") { + val query = spark.readStream + .format("rate") + .option("numPartitions", "1") + .option("rowsPerSecond", "1") + .load() + + testStream(query)( + Execute(q => assert(q.isInstanceOf[ContinuousExecution])) + ) + } +} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index 2034b9be07f24..d66908f86ccc7 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -34,11 +34,14 @@ import org.scalatest.concurrent.PatienceConfiguration.Timeout import org.scalatest.time.SpanSugar._ import org.apache.spark.SparkContext -import org.apache.spark.sql.ForeachWriter +import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row} +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, WriteToDataSourceV2Exec} import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution +import org.apache.spark.sql.execution.streaming.sources.ContinuousMemoryWriter import org.apache.spark.sql.functions.{count, window} import org.apache.spark.sql.kafka010.KafkaSourceProvider._ -import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest} +import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest, Trigger} import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession} import org.apache.spark.util.Utils @@ -49,9 +52,11 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { override val streamingTimeout = 30.seconds + protected val brokerProps = Map[String, Object]() + override def beforeAll(): Unit = { super.beforeAll() - testUtils = new KafkaTestUtils + testUtils = new KafkaTestUtils(brokerProps) testUtils.setup() } @@ -59,18 +64,25 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { if (testUtils != null) { testUtils.teardown() testUtils = null - super.afterAll() } + super.afterAll() } protected def makeSureGetOffsetCalled = AssertOnQuery { q => // Because KafkaSource's initialPartitionOffsets is set lazily, we need to make sure - // its "getOffset" is called before pushing any data. Otherwise, because of the race contion, + // its "getOffset" is called before pushing any data. Otherwise, because of the race condition, // we don't know which data should be fetched when `startingOffsets` is latest. - q.processAllAvailable() + q match { + case c: ContinuousExecution => c.awaitEpoch(0) + case m: MicroBatchExecution => m.processAllAvailable() + } true } + protected def setTopicPartitions(topic: String, newCount: Int, query: StreamExecution) : Unit = { + testUtils.addPartitions(topic, newCount) + } + /** * Add data to Kafka. 
* @@ -82,7 +94,7 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { message: String = "", topicAction: (String, Option[Int]) => Unit = (_, _) => {}) extends AddData { - override def addData(query: Option[StreamExecution]): (Source, Offset) = { + override def addData(query: Option[StreamExecution]): (BaseStreamingSource, Offset) = { if (query.get.isActive) { // Make sure no Spark job is running when deleting a topic query.get.processAllAvailable() @@ -97,16 +109,18 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { topicAction(existingTopicPartitions._1, Some(existingTopicPartitions._2)) } - // Read all topics again in case some topics are delete. - val allTopics = testUtils.getAllTopicsAndPartitionSize().toMap.keys require( query.nonEmpty, "Cannot add data when there is no query for finding the active kafka source") val sources = query.get.logicalPlan.collect { - case StreamingExecutionRelation(source, _) if source.isInstanceOf[KafkaSource] => - source.asInstanceOf[KafkaSource] - } + case StreamingExecutionRelation(source: KafkaSource, _) => source + } ++ (query.get.lastExecution match { + case null => Seq() + case e => e.logical.collect { + case DataSourceV2Relation(_, reader: KafkaContinuousReader) => reader + } + }) if (sources.isEmpty) { throw new Exception( "Could not find Kafka source in the StreamExecution logical plan to add data to") @@ -137,14 +151,158 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { override def toString: String = s"AddKafkaData(topics = $topics, data = $data, message = $message)" } -} + private val topicId = new AtomicInteger(0) + protected def newTopic(): String = s"topic-${topicId.getAndIncrement()}" +} -class KafkaSourceSuite extends KafkaSourceTest { +class KafkaMicroBatchSourceSuite extends KafkaSourceSuiteBase { import testImplicits._ - private val topicId = new AtomicInteger(0) + test("(de)serialization of initial offsets") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 5) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", topic) + + testStream(reader.load)( + makeSureGetOffsetCalled, + StopStream, + StartStream(), + StopStream) + } + + test("maxOffsetsPerTrigger") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 3) + testUtils.sendMessages(topic, (100 to 200).map(_.toString).toArray, Some(0)) + testUtils.sendMessages(topic, (10 to 20).map(_.toString).toArray, Some(1)) + testUtils.sendMessages(topic, Array("1"), Some(2)) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("maxOffsetsPerTrigger", 10) + .option("subscribe", topic) + .option("startingOffsets", "earliest") + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped: org.apache.spark.sql.Dataset[_] = kafka.map(kv => kv._2.toInt) + + val clock = new StreamManualClock + + val waitUntilBatchProcessed = AssertOnQuery { q => + eventually(Timeout(streamingTimeout)) { + if (!q.exception.isDefined) { + assert(clock.isStreamWaitingAt(clock.getTimeMillis())) + } + } + if (q.exception.isDefined) { + throw q.exception.get + } + true + } + + testStream(mapped)( + StartStream(ProcessingTime(100), clock), + waitUntilBatchProcessed, + // 1 from smallest, 1 from middle, 8 from biggest + CheckAnswer(1, 10, 100, 
101, 102, 103, 104, 105, 106, 107), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116 + ), + StopStream, + StartStream(ProcessingTime(100), clock), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 12, 117, 118, 119, 120, 121, 122, 123, 124, 125 + ), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 12, 117, 118, 119, 120, 121, 122, 123, 124, 125, + 13, 126, 127, 128, 129, 130, 131, 132, 133, 134 + ) + ) + } + + test("input row metrics") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val kafka = spark + .readStream + .format("kafka") + .option("subscribe", topic) + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + + val mapped = kafka.map(kv => kv._2.toInt + 1) + testStream(mapped)( + StartStream(trigger = ProcessingTime(1)), + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + AssertOnQuery { query => + val recordsRead = query.recentProgress.map(_.numInputRows).sum + recordsRead == 3 + } + ) + } + + test("subscribing topic by pattern with topic deletions") { + val topicPrefix = newTopic() + val topic = topicPrefix + "-seems" + val topic2 = topicPrefix + "-bad" + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("subscribePattern", s"$topicPrefix-.*") + .option("failOnDataLoss", "false") + + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped = kafka.map(kv => kv._2.toInt + 1) + + testStream(mapped)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + Assert { + testUtils.deleteTopic(topic) + testUtils.createTopic(topic2, partitions = 5) + true + }, + AddKafkaData(Set(topic2), 4, 5, 6), + CheckAnswer(2, 3, 4, 5, 6, 7) + ) + } testWithUninterruptibleThread( "deserialization of initial offset with Spark 2.1.0") { @@ -237,86 +395,51 @@ class KafkaSourceSuite extends KafkaSourceTest { } } - test("(de)serialization of initial offsets") { - val topic = newTopic() - testUtils.createTopic(topic, partitions = 64) - - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", topic) - - testStream(reader.load)( - makeSureGetOffsetCalled, - StopStream, - StartStream(), - StopStream) - } - - test("maxOffsetsPerTrigger") { + test("KafkaSource with watermark") { + val now = System.currentTimeMillis() val topic = newTopic() - testUtils.createTopic(topic, partitions = 3) - testUtils.sendMessages(topic, (100 to 200).map(_.toString).toArray, Some(0)) - 
testUtils.sendMessages(topic, (10 to 20).map(_.toString).toArray, Some(1)) - testUtils.sendMessages(topic, Array("1"), Some(2)) + testUtils.createTopic(newTopic(), partitions = 1) + testUtils.sendMessages(topic, Array(1).map(_.toString)) - val reader = spark + val kafka = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", testUtils.brokerAddress) .option("kafka.metadata.max.age.ms", "1") - .option("maxOffsetsPerTrigger", 10) + .option("startingOffsets", s"earliest") .option("subscribe", topic) - .option("startingOffsets", "earliest") - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - val mapped: org.apache.spark.sql.Dataset[_] = kafka.map(kv => kv._2.toInt) - - val clock = new StreamManualClock + .load() - val waitUntilBatchProcessed = AssertOnQuery { q => - eventually(Timeout(streamingTimeout)) { - if (!q.exception.isDefined) { - assert(clock.isStreamWaitingAt(clock.getTimeMillis())) - } - } - if (q.exception.isDefined) { - throw q.exception.get - } - true - } + val windowedAggregation = kafka + .withWatermark("timestamp", "10 seconds") + .groupBy(window($"timestamp", "5 seconds") as 'window) + .agg(count("*") as 'count) + .select($"window".getField("start") as 'window, $"count") - testStream(mapped)( - StartStream(ProcessingTime(100), clock), - waitUntilBatchProcessed, - // 1 from smallest, 1 from middle, 8 from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107), - AdvanceManualClock(100), - waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116 - ), - StopStream, - StartStream(ProcessingTime(100), clock), - waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, - 12, 117, 118, 119, 120, 121, 122, 123, 124, 125 - ), - AdvanceManualClock(100), - waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, - 12, 117, 118, 119, 120, 121, 122, 123, 124, 125, - 13, 126, 127, 128, 129, 130, 131, 132, 133, 134 - ) - ) + val query = windowedAggregation + .writeStream + .format("memory") + .outputMode("complete") + .queryName("kafkaWatermark") + .start() + query.processAllAvailable() + val rows = spark.table("kafkaWatermark").collect() + assert(rows.length === 1, s"Unexpected results: ${rows.toList}") + val row = rows(0) + // We cannot check the exact window start time as it depands on the time that messages were + // inserted by the producer. So here we just use a low bound to make sure the internal + // conversion works. 
+ assert( + row.getAs[java.sql.Timestamp]("window").getTime >= now - 5 * 1000, + s"Unexpected results: $row") + assert(row.getAs[Int]("count") === 1, s"Unexpected results: $row") + query.stop() } +} + +class KafkaSourceSuiteBase extends KafkaSourceTest { + + import testImplicits._ test("cannot stop Kafka stream") { val topic = newTopic() @@ -328,7 +451,7 @@ class KafkaSourceSuite extends KafkaSourceTest { .format("kafka") .option("kafka.bootstrap.servers", testUtils.brokerAddress) .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", s"topic-.*") + .option("subscribePattern", s"$topic.*") val kafka = reader.load() .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") @@ -422,65 +545,6 @@ class KafkaSourceSuite extends KafkaSourceTest { } } - test("subscribing topic by pattern with topic deletions") { - val topicPrefix = newTopic() - val topic = topicPrefix + "-seems" - val topic2 = topicPrefix + "-bad" - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, Array("-1")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", s"$topicPrefix-.*") - .option("failOnDataLoss", "false") - - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - val mapped = kafka.map(kv => kv._2.toInt + 1) - - testStream(mapped)( - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(2, 3, 4), - Assert { - testUtils.deleteTopic(topic) - testUtils.createTopic(topic2, partitions = 5) - true - }, - AddKafkaData(Set(topic2), 4, 5, 6), - CheckAnswer(2, 3, 4, 5, 6, 7) - ) - } - - test("starting offset is latest by default") { - val topic = newTopic() - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, Array("0")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", topic) - - val kafka = reader.load() - .selectExpr("CAST(value AS STRING)") - .as[String] - val mapped = kafka.map(_.toInt) - - testStream(mapped)( - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(1, 2, 3) // should not have 0 - ) - } - test("bad source options") { def testBadOptions(options: (String, String)*)(expectedMsgs: String*): Unit = { val ex = intercept[IllegalArgumentException] { @@ -540,34 +604,6 @@ class KafkaSourceSuite extends KafkaSourceTest { testUnsupportedConfig("kafka.auto.offset.reset", "latest") } - test("input row metrics") { - val topic = newTopic() - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, Array("-1")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - - val kafka = spark - .readStream - .format("kafka") - .option("subscribe", topic) - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - - val mapped = kafka.map(kv => kv._2.toInt + 1) - testStream(mapped)( - StartStream(trigger = ProcessingTime(1)), - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(2, 3, 4), - AssertOnQuery { query => - val recordsRead = query.recentProgress.map(_.numInputRows).sum - recordsRead == 3 - } - ) - } - test("delete a topic when a Spark job is running") { 
KafkaSourceSuite.collectedData.clear() @@ -629,8 +665,6 @@ class KafkaSourceSuite extends KafkaSourceTest { } } - private def newTopic(): String = s"topic-${topicId.getAndIncrement()}" - private def assignString(topic: String, partitions: Iterable[Int]): String = { JsonUtils.partitions(partitions.map(p => new TopicPartition(topic, p))) } @@ -676,6 +710,10 @@ class KafkaSourceSuite extends KafkaSourceTest { testStream(mapped)( makeSureGetOffsetCalled, + Execute { q => + // wait to reach the last offset in every partition + q.awaitOffset(0, KafkaSourceOffset(partitionOffsets.mapValues(_ => 3L))) + }, CheckAnswer(-20, -21, -22, 0, 1, 2, 11, 12, 22), StopStream, StartStream(), @@ -706,6 +744,7 @@ class KafkaSourceSuite extends KafkaSourceTest { .format("memory") .outputMode("append") .queryName("kafkaColumnTypes") + .trigger(defaultTrigger) .start() query.processAllAvailable() val rows = spark.table("kafkaColumnTypes").collect() @@ -723,47 +762,6 @@ class KafkaSourceSuite extends KafkaSourceTest { query.stop() } - test("KafkaSource with watermark") { - val now = System.currentTimeMillis() - val topic = newTopic() - testUtils.createTopic(newTopic(), partitions = 1) - testUtils.sendMessages(topic, Array(1).map(_.toString)) - - val kafka = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("startingOffsets", s"earliest") - .option("subscribe", topic) - .load() - - val windowedAggregation = kafka - .withWatermark("timestamp", "10 seconds") - .groupBy(window($"timestamp", "5 seconds") as 'window) - .agg(count("*") as 'count) - .select($"window".getField("start") as 'window, $"count") - - val query = windowedAggregation - .writeStream - .format("memory") - .outputMode("complete") - .queryName("kafkaWatermark") - .start() - query.processAllAvailable() - val rows = spark.table("kafkaWatermark").collect() - assert(rows.length === 1, s"Unexpected results: ${rows.toList}") - val row = rows(0) - // We cannot check the exact window start time as it depands on the time that messages were - // inserted by the producer. So here we just use a low bound to make sure the internal - // conversion works. 
- assert( - row.getAs[java.sql.Timestamp]("window").getTime >= now - 5 * 1000, - s"Unexpected results: $row") - assert(row.getAs[Int]("count") === 1, s"Unexpected results: $row") - query.stop() - } - private def testFromLatestOffsets( topic: String, addPartitions: Boolean, @@ -800,9 +798,7 @@ class KafkaSourceSuite extends KafkaSourceTest { AddKafkaData(Set(topic), 7, 8), CheckAnswer(2, 3, 4, 5, 6, 7, 8, 9), AssertOnQuery("Add partitions") { query: StreamExecution => - if (addPartitions) { - testUtils.addPartitions(topic, 10) - } + if (addPartitions) setTopicPartitions(topic, 10, query) true }, AddKafkaData(Set(topic), 9, 10, 11, 12, 13, 14, 15, 16), @@ -843,9 +839,7 @@ class KafkaSourceSuite extends KafkaSourceTest { StartStream(), CheckAnswer(2, 3, 4, 5, 6, 7, 8, 9), AssertOnQuery("Add partitions") { query: StreamExecution => - if (addPartitions) { - testUtils.addPartitions(topic, 10) - } + if (addPartitions) setTopicPartitions(topic, 10, query) true }, AddKafkaData(Set(topic), 9, 10, 11, 12, 13, 14, 15, 16), @@ -977,20 +971,8 @@ class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with Shared } } - test("stress test for failOnDataLoss=false") { - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", "failOnDataLoss.*") - .option("startingOffsets", "earliest") - .option("failOnDataLoss", "false") - .option("fetchOffset.retryIntervalMs", "3000") - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - val query = kafka.map(kv => kv._2.toInt).writeStream.foreach(new ForeachWriter[Int] { + protected def startStream(ds: Dataset[Int]) = { + ds.writeStream.foreach(new ForeachWriter[Int] { override def open(partitionId: Long, version: Long): Boolean = { true @@ -1004,6 +986,22 @@ class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with Shared override def close(errorOrNull: Throwable): Unit = { } }).start() + } + + test("stress test for failOnDataLoss=false") { + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("subscribePattern", "failOnDataLoss.*") + .option("startingOffsets", "earliest") + .option("failOnDataLoss", "false") + .option("fetchOffset.retryIntervalMs", "3000") + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val query = startStream(kafka.map(kv => kv._2.toInt)) val testTime = 1.minutes val startTime = System.currentTimeMillis() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index e8d683a578f35..b714a46b5f786 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -191,6 +191,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { ds = ds.asInstanceOf[DataSourceV2], conf = sparkSession.sessionState.conf)).asJava) + // Streaming also uses the data source V2 API. So it may be that the data source implements + // v2, but has no v2 implementation for batch reads. In that case, we fall back to loading + // the dataframe as a v1 source. 
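For orientation, the fallback described in this comment can be condensed into the following sketch. It is illustrative only and not part of the diff: the `ReadSupport` arm and the exact `createReader` signatures are assumed from the DataSourceV2 read API, and the handling of a user-specified schema on a plain `ReadSupport` source is omitted, while `loadV1Source`, `options`, and `DataSourceV2Relation` are taken from the surrounding hunks.

    // Sketch: prefer a v2 batch reader when the source can provide one,
    // otherwise fall through to the v1 resolveRelation() code path.
    val v2Reader = (ds, userSpecifiedSchema) match {
      case (s: ReadSupportWithSchema, Some(schema)) => s.createReader(schema, options)
      case (s: ReadSupport, None) => s.createReader(options)
      case _ => null // the source implements v2 but not batch reads
    }
    if (v2Reader == null) {
      loadV1Source(paths: _*) // fall back to the v1 code path
    } else {
      Dataset.ofRows(sparkSession, DataSourceV2Relation(v2Reader))
    }

The write side mirrors the same shape: the DataFrameWriter hunk further down falls back to saveToV1Source() when the v2 source has no batch write support.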
val reader = (ds, userSpecifiedSchema) match { case (ds: ReadSupportWithSchema, Some(schema)) => ds.createReader(schema, options) @@ -208,23 +211,30 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } reader - case _ => - throw new AnalysisException(s"$cls does not support data reading.") + case _ => null // fall back to v1 } - Dataset.ofRows(sparkSession, DataSourceV2Relation(reader)) + if (reader == null) { + loadV1Source(paths: _*) + } else { + Dataset.ofRows(sparkSession, DataSourceV2Relation(reader)) + } } else { - // Code path for data source v1. - sparkSession.baseRelationToDataFrame( - DataSource.apply( - sparkSession, - paths = paths, - userSpecifiedSchema = userSpecifiedSchema, - className = source, - options = extraOptions.toMap).resolveRelation()) + loadV1Source(paths: _*) } } + private def loadV1Source(paths: String*) = { + // Code path for data source v1. + sparkSession.baseRelationToDataFrame( + DataSource.apply( + sparkSession, + paths = paths, + userSpecifiedSchema = userSpecifiedSchema, + className = source, + options = extraOptions.toMap).resolveRelation()) + } + /** * Construct a `DataFrame` representing the database table accessible via JDBC URL * url named table and connection properties. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 3304f368e1050..97f12ff625c42 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -255,17 +255,24 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } } - case _ => throw new AnalysisException(s"$cls does not support data writing.") + // Streaming also uses the data source V2 API. So it may be that the data source implements + // v2, but has no v2 implementation for batch writes. In that case, we fall back to saving + // as though it's a V1 source. + case _ => saveToV1Source() } } else { - // Code path for data source v1. - runCommand(df.sparkSession, "save") { - DataSource( - sparkSession = df.sparkSession, - className = source, - partitionColumns = partitioningColumns.getOrElse(Nil), - options = extraOptions.toMap).planForWriting(mode, AnalysisBarrier(df.logicalPlan)) - } + saveToV1Source() + } + } + + private def saveToV1Source(): Unit = { + // Code path for data source v1. 
+ runCommand(df.sparkSession, "save") { + DataSource( + sparkSession = df.sparkSession, + className = source, + partitionColumns = partitioningColumns.getOrElse(Nil), + options = extraOptions.toMap).planForWriting(mode, AnalysisBarrier(df.logicalPlan)) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala index f0bdf84bb7a84..a4a857f2d4d9b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala @@ -81,9 +81,11 @@ case class WriteToDataSourceV2Exec(writer: DataSourceV2Writer, query: SparkPlan) (index, message: WriterCommitMessage) => messages(index) = message ) - logInfo(s"Data source writer $writer is committing.") - writer.commit(messages) - logInfo(s"Data source writer $writer committed.") + if (!writer.isInstanceOf[ContinuousWriter]) { + logInfo(s"Data source writer $writer is committing.") + writer.commit(messages) + logInfo(s"Data source writer $writer committed.") + } } catch { case _: InterruptedException if writer.isInstanceOf[ContinuousWriter] => // Interruption is how continuous queries are ended, so accept and ignore the exception. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 24a8b000df0c1..cf27e1a70650a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -142,7 +142,8 @@ abstract class StreamExecution( override val id: UUID = UUID.fromString(streamMetadata.id) - override val runId: UUID = UUID.randomUUID + override def runId: UUID = currentRunId + protected var currentRunId = UUID.randomUUID /** * Pretty identified string of printing in logs. Format is @@ -418,11 +419,17 @@ abstract class StreamExecution( * Blocks the current thread until processing for data from the given `source` has reached at * least the given `Offset`. This method is intended for use primarily when writing tests. */ - private[sql] def awaitOffset(source: BaseStreamingSource, newOffset: Offset): Unit = { + private[sql] def awaitOffset(sourceIndex: Int, newOffset: Offset): Unit = { assertAwaitThread() def notDone = { val localCommittedOffsets = committedOffsets - !localCommittedOffsets.contains(source) || localCommittedOffsets(source) != newOffset + if (sources == null) { + // sources might not be initialized yet + false + } else { + val source = sources(sourceIndex) + !localCommittedOffsets.contains(source) || localCommittedOffsets(source) != newOffset + } } while (notDone) { @@ -436,7 +443,7 @@ abstract class StreamExecution( awaitProgressLock.unlock() } } - logDebug(s"Unblocked at $newOffset for $source") + logDebug(s"Unblocked at $newOffset for ${sources(sourceIndex)}") } /** A flag to indicate that a batch has completed with no new data available. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala index d79e4bd65f563..e700aa4f9aea7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala @@ -77,7 +77,6 @@ class ContinuousDataSourceRDD( dataReaderThread.start() context.addTaskCompletionListener(_ => { - reader.close() dataReaderThread.interrupt() epochPollExecutor.shutdown() }) @@ -201,6 +200,8 @@ class DataReaderThread( failedFlag.set(true) // Don't rethrow the exception in this thread. It's not needed, and the default Spark // exception handler will kill the executor. + } finally { + reader.close() } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index 9657b5e26d770..667410ef9f1c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -17,7 +17,9 @@ package org.apache.spark.sql.execution.streaming.continuous +import java.util.UUID import java.util.concurrent.TimeUnit +import java.util.function.UnaryOperator import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, Map => MutableMap} @@ -52,7 +54,7 @@ class ContinuousExecution( sparkSession, name, checkpointRoot, analyzedPlan, sink, trigger, triggerClock, outputMode, deleteCheckpointOnStop) { - @volatile protected var continuousSources: Seq[ContinuousReader] = Seq.empty + @volatile protected var continuousSources: Seq[ContinuousReader] = _ override protected def sources: Seq[BaseStreamingSource] = continuousSources override lazy val logicalPlan: LogicalPlan = { @@ -78,15 +80,17 @@ class ContinuousExecution( } override protected def runActivatedStream(sparkSessionForStream: SparkSession): Unit = { - do { - try { - runContinuous(sparkSessionForStream) - } catch { - case _: InterruptedException if state.get().equals(RECONFIGURING) => - // swallow exception and run again - state.set(ACTIVE) + val stateUpdate = new UnaryOperator[State] { + override def apply(s: State) = s match { + // If we ended the query to reconfigure, reset the state to active. + case RECONFIGURING => ACTIVE + case _ => s } - } while (state.get() == ACTIVE) + } + + do { + runContinuous(sparkSessionForStream) + } while (state.updateAndGet(stateUpdate) == ACTIVE) } /** @@ -120,12 +124,16 @@ class ContinuousExecution( } committedOffsets = nextOffsets.toStreamProgress(sources) - // Forcibly align commit and offset logs by slicing off any spurious offset logs from - // a previous run. We can't allow commits to an epoch that a previous run reached but - // this run has not. - offsetLog.purgeAfter(latestEpochId) + // Get to an epoch ID that has definitely never been sent to a sink before. Since sink + // commit happens between offset log write and commit log write, this means an epoch ID + // which is not in the offset log. + val (latestOffsetEpoch, _) = offsetLog.getLatest().getOrElse { + throw new IllegalStateException( + s"Offset log had no latest element. 
This shouldn't be possible because nextOffsets is" + + s"an element.") + } + currentBatchId = latestOffsetEpoch + 1 - currentBatchId = latestEpochId + 1 logDebug(s"Resuming at epoch $currentBatchId with committed offsets $committedOffsets") nextOffsets case None => @@ -141,6 +149,7 @@ class ContinuousExecution( * @param sparkSessionForQuery Isolated [[SparkSession]] to run the continuous query with. */ private def runContinuous(sparkSessionForQuery: SparkSession): Unit = { + currentRunId = UUID.randomUUID // A list of attributes that will need to be updated. val replacements = new ArrayBuffer[(Attribute, Attribute)] // Translate from continuous relation to the underlying data source. @@ -225,13 +234,11 @@ class ContinuousExecution( triggerExecutor.execute(() => { startTrigger() - if (reader.needsReconfiguration()) { - state.set(RECONFIGURING) + if (reader.needsReconfiguration() && state.compareAndSet(ACTIVE, RECONFIGURING)) { stopSources() if (queryExecutionThread.isAlive) { sparkSession.sparkContext.cancelJobGroup(runId.toString) queryExecutionThread.interrupt() - // No need to join - this thread is about to end anyway. } false } else if (isActive) { @@ -259,6 +266,7 @@ class ContinuousExecution( sparkSessionForQuery, lastExecution)(lastExecution.toRdd) } } finally { + epochEndpoint.askSync[Unit](StopContinuousExecutionWrites) SparkEnv.get.rpcEnv.stop(epochEndpoint) epochUpdateThread.interrupt() @@ -273,17 +281,22 @@ class ContinuousExecution( epoch: Long, reader: ContinuousReader, partitionOffsets: Seq[PartitionOffset]): Unit = { assert(continuousSources.length == 1, "only one continuous source supported currently") - if (partitionOffsets.contains(null)) { - // If any offset is null, that means the corresponding partition hasn't seen any data yet, so - // there's nothing meaningful to add to the offset log. - } val globalOffset = reader.mergeOffsets(partitionOffsets.toArray) - synchronized { - if (queryExecutionThread.isAlive) { - offsetLog.add(epoch, OffsetSeq.fill(globalOffset)) - } else { - return - } + val oldOffset = synchronized { + offsetLog.add(epoch, OffsetSeq.fill(globalOffset)) + offsetLog.get(epoch - 1) + } + + // If offset hasn't changed since last epoch, there's been no new data. + if (oldOffset.contains(OffsetSeq.fill(globalOffset))) { + noNewData = true + } + + awaitProgressLock.lock() + try { + awaitProgressLockCondition.signalAll() + } finally { + awaitProgressLock.unlock() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala index 98017c3ac6a33..40dcbecade814 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala @@ -39,6 +39,15 @@ private[continuous] sealed trait EpochCoordinatorMessage extends Serializable */ private[sql] case object IncrementAndGetEpoch extends EpochCoordinatorMessage +/** + * The RpcEndpoint stop() will wait to clear out the message queue before terminating the + * object. This can lead to a race condition where the query restarts at epoch n, a new + * EpochCoordinator starts at epoch n, and then the old epoch coordinator commits epoch n + 1. + * The framework doesn't provide a handle to wait on the message queue, so we use a synchronous + * message to stop any writes to the ContinuousExecution object. 
+ */ +private[sql] case object StopContinuousExecutionWrites extends EpochCoordinatorMessage + // Init messages /** * Set the reader and writer partition counts. Tasks may not be started until the coordinator @@ -116,6 +125,8 @@ private[continuous] class EpochCoordinator( override val rpcEnv: RpcEnv) extends ThreadSafeRpcEndpoint with Logging { + private var queryWritesStopped: Boolean = false + private var numReaderPartitions: Int = _ private var numWriterPartitions: Int = _ @@ -147,12 +158,16 @@ private[continuous] class EpochCoordinator( partitionCommits.remove(k) } for (k <- partitionOffsets.keys.filter { case (e, _) => e < epoch }) { - partitionCommits.remove(k) + partitionOffsets.remove(k) } } } override def receive: PartialFunction[Any, Unit] = { + // If we just drop these messages, we won't do any writes to the query. The lame duck tasks + // won't shed errors or anything. + case _ if queryWritesStopped => () + case CommitPartitionEpoch(partitionId, epoch, message) => logDebug(s"Got commit from partition $partitionId at epoch $epoch: $message") if (!partitionCommits.isDefinedAt((epoch, partitionId))) { @@ -188,5 +203,9 @@ private[continuous] class EpochCoordinator( case SetWriterPartitions(numPartitions) => numWriterPartitions = numPartitions context.reply(()) + + case StopContinuousExecutionWrites => + queryWritesStopped = true + context.reply(()) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index db588ae282f38..b5b4a05ab4973 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger import org.apache.spark.sql.execution.streaming.sources.{MemoryPlanV2, MemorySinkV2} +import org.apache.spark.sql.sources.v2.streaming.ContinuousWriteSupport /** * Interface used to write a streaming `Dataset` to external storage systems (e.g. 
file systems, @@ -279,18 +280,29 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { useTempCheckpointLocation = true, trigger = trigger) } else { - val dataSource = - DataSource( - df.sparkSession, - className = source, - options = extraOptions.toMap, - partitionColumns = normalizedParCols.getOrElse(Nil)) + val sink = trigger match { + case _: ContinuousTrigger => + val ds = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf) + ds.newInstance() match { + case w: ContinuousWriteSupport => w + case _ => throw new AnalysisException( + s"Data source $source does not support continuous writing") + } + case _ => + val ds = DataSource( + df.sparkSession, + className = source, + options = extraOptions.toMap, + partitionColumns = normalizedParCols.getOrElse(Nil)) + ds.createSink(outputMode) + } + df.sparkSession.sessionState.streamingQueryManager.startQuery( extraOptions.get("queryName"), extraOptions.get("checkpointLocation"), df, extraOptions.toMap, - dataSource.createSink(outputMode), + sink, outputMode, useTempCheckpointLocation = source == "console", recoverFromCheckpointLocation = true, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index d46461fa9bf6d..0762895fdc620 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -38,8 +38,9 @@ import org.apache.spark.sql.{Dataset, Encoder, QueryTest, Row} import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, EpochCoordinatorRef, IncrementAndGetEpoch} +import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, ContinuousTrigger, EpochCoordinatorRef, IncrementAndGetEpoch} import org.apache.spark.sql.execution.streaming.sources.MemorySinkV2 import org.apache.spark.sql.execution.streaming.state.StateStore import org.apache.spark.sql.streaming.StreamingQueryListener._ @@ -80,6 +81,9 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be StateStore.stop() // stop the state store maintenance thread and unload store providers } + protected val defaultTrigger = Trigger.ProcessingTime(0) + protected val defaultUseV2Sink = false + /** How long to wait for an active stream to catch up when checking a result. */ val streamingTimeout = 10.seconds @@ -189,7 +193,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be /** Starts the stream, resuming if data has already been processed. It must not be running. 
*/ case class StartStream( - trigger: Trigger = Trigger.ProcessingTime(0), + trigger: Trigger = defaultTrigger, triggerClock: Clock = new SystemClock, additionalConfs: Map[String, String] = Map.empty, checkpointLocation: String = null) @@ -276,7 +280,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def testStream( _stream: Dataset[_], outputMode: OutputMode = OutputMode.Append, - useV2Sink: Boolean = false)(actions: StreamAction*): Unit = synchronized { + useV2Sink: Boolean = defaultUseV2Sink)(actions: StreamAction*): Unit = synchronized { import org.apache.spark.sql.streaming.util.StreamManualClock // `synchronized` is added to prevent the user from calling multiple `testStream`s concurrently @@ -403,18 +407,11 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def fetchStreamAnswer(currentStream: StreamExecution, lastOnly: Boolean) = { verify(currentStream != null, "stream not running") - // Get the map of source index to the current source objects - val indexToSource = currentStream - .logicalPlan - .collect { case StreamingExecutionRelation(s, _) => s } - .zipWithIndex - .map(_.swap) - .toMap // Block until all data added has been processed for all the source awaiting.foreach { case (sourceIndex, offset) => failAfter(streamingTimeout) { - currentStream.awaitOffset(indexToSource(sourceIndex), offset) + currentStream.awaitOffset(sourceIndex, offset) } } @@ -473,6 +470,12 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be // after starting the query. try { currentStream.awaitInitialization(streamingTimeout.toMillis) + currentStream match { + case s: ContinuousExecution => eventually("IncrementalExecution was not created") { + s.lastExecution.executedPlan // will fail if lastExecution is null + } + case _ => + } } catch { case _: StreamingQueryException => // Ignore the exception. `StopStream` or `ExpectFailure` will catch it as well. @@ -600,7 +603,10 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def findSourceIndex(plan: LogicalPlan): Option[Int] = { plan - .collect { case StreamingExecutionRelation(s, _) => s } + .collect { + case StreamingExecutionRelation(s, _) => s + case DataSourceV2Relation(_, r) => r + } .zipWithIndex .find(_._1 == source) .map(_._2) @@ -613,9 +619,13 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be findSourceIndex(query.logicalPlan) }.orElse { findSourceIndex(stream.logicalPlan) + }.orElse { + queryToUse.flatMap { q => + findSourceIndex(q.lastExecution.logical) + } }.getOrElse { throw new IllegalArgumentException( - "Could find index of the source to which data was added") + "Could not find index of the source to which data was added") } // Store the expected offset of added data to wait for it later From 2ec302658c98038962c9b7a90fd2cff751a35ffa Mon Sep 17 00:00:00 2001 From: Bago Amirbekian Date: Thu, 11 Jan 2018 13:57:15 -0800 Subject: [PATCH 070/566] [SPARK-23046][ML][SPARKR] Have RFormula include VectorSizeHint in pipeline ## What changes were proposed in this pull request? Including VectorSizeHint in RFormula piplelines will allow them to be applied to streaming dataframes. ## How was this patch tested? Unit tests. Author: Bago Amirbekian Closes #20238 from MrBago/rFormulaVectorSize. (cherry picked from commit 186bf8fb2e9ff8a80f3f6bcb5f2a0327fa79a1c9) Signed-off-by: Joseph K. 
Bradley --- R/pkg/R/mllib_utils.R | 1 + .../apache/spark/ml/feature/RFormula.scala | 18 +++++++-- .../spark/ml/feature/RFormulaSuite.scala | 37 ++++++++++++++++--- 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/R/pkg/R/mllib_utils.R b/R/pkg/R/mllib_utils.R index a53c92c2c4815..23dda42c325be 100644 --- a/R/pkg/R/mllib_utils.R +++ b/R/pkg/R/mllib_utils.R @@ -130,3 +130,4 @@ read.ml <- function(path) { stop("Unsupported model: ", jobj) } } + diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 7da3339f8b487..f384ffbf578bc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -25,7 +25,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer} import org.apache.spark.ml.attribute.AttributeGroup -import org.apache.spark.ml.linalg.VectorUDT +import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasHandleInvalid, HasLabelCol} import org.apache.spark.ml.util._ @@ -210,8 +210,8 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) // First we index each string column referenced by the input terms. val indexed: Map[String, String] = resolvedFormula.terms.flatten.distinct.map { term => - dataset.schema(term) match { - case column if column.dataType == StringType => + dataset.schema(term).dataType match { + case _: StringType => val indexCol = tmpColumn("stridx") encoderStages += new StringIndexer() .setInputCol(term) @@ -220,6 +220,18 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) .setHandleInvalid($(handleInvalid)) prefixesToRewrite(indexCol + "_") = term + "_" (term, indexCol) + case _: VectorUDT => + val group = AttributeGroup.fromStructField(dataset.schema(term)) + val size = if (group.size < 0) { + dataset.select(term).first().getAs[Vector](0).size + } else { + group.size + } + encoderStages += new VectorSizeHint(uid) + .setHandleInvalid("optimistic") + .setInputCol(term) + .setSize(size) + (term, term) case _ => (term, term) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala index 5d09c90ec6dfa..f3f4b5a3d0233 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala @@ -17,15 +17,15 @@ package org.apache.spark.ml.feature -import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.SparkException import org.apache.spark.ml.attribute._ -import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} +import org.apache.spark.sql.{DataFrame, Encoder, Row} import org.apache.spark.sql.types.DoubleType -class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class RFormulaSuite extends MLTest with DefaultReadWriteTest { import 
testImplicits._ @@ -548,4 +548,31 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul assert(result3.collect() === expected3.collect()) assert(result4.collect() === expected4.collect()) } + + test("Use Vectors as inputs to formula.") { + val original = Seq( + (1, 4, Vectors.dense(0.0, 0.0, 4.0)), + (2, 4, Vectors.dense(1.0, 0.0, 4.0)), + (3, 5, Vectors.dense(1.0, 0.0, 5.0)), + (4, 5, Vectors.dense(0.0, 1.0, 5.0)) + ).toDF("id", "a", "b") + val formula = new RFormula().setFormula("id ~ a + b") + val (first +: rest) = Seq("id", "a", "b", "features", "label") + testTransformer[(Int, Int, Vector)](original, formula.fit(original), first, rest: _*) { + case Row(id: Int, a: Int, b: Vector, features: Vector, label: Double) => + assert(label === id) + assert(features.toArray === a +: b.toArray) + } + + val group = new AttributeGroup("b", 3) + val vectorColWithMetadata = original("b").as("b", group.toMetadata()) + val dfWithMetadata = original.withColumn("b", vectorColWithMetadata) + val model = formula.fit(dfWithMetadata) + // model should work even when applied to dataframe without metadata. + testTransformer[(Int, Int, Vector)](original, model, first, rest: _*) { + case Row(id: Int, a: Int, b: Vector, features: Vector, label: Double) => + assert(label === id) + assert(features.toArray === a +: b.toArray) + } + } } From 964cc2e31b2862bca0bd968b3e9e2cbf8d3ba5ea Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Thu, 11 Jan 2018 15:23:10 -0800 Subject: [PATCH 071/566] Preparing Spark release v2.3.0-rc1 --- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 2 +- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 40 files changed, 40 insertions(+), 40 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index b3b4239771bc3..2ca9ab66462a8 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index cf93d41cd77cf..404c744f98261 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 18cbdadd224ab..3c0b5288862fa 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 
@@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 9968480ab7658..fe3bcfd68f988 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index ec2db6e5bb88c..90ca4016ecc79 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 2d59c71cc3757..8bb2c38f7d097 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index f7e586ee777e1..0d5d4b1661012 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index a3772a2620088..d6136e6c29a84 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 0a5bd958fc9c5..364aaab74e67b 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index dcc211204d766..aabf3151890f9 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,7 +14,7 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.0-SNAPSHOT +SPARK_VERSION: 2.3.0 SPARK_VERSION_SHORT: 2.3.0 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" diff --git a/examples/pom.xml b/examples/pom.xml index 1791dbaad775e..28de7a180c2e7 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 485b562dce990..9b74a4d327136 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 71016bc645ca7..1da013353775c 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 12630840e79dc..da8ce5928b8bb 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 87a09642405a7..65a9f9972d398 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index d6f97316b326a..f5a2dbbef3b6f 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 0c9f0aa765a39..e9b8e4e18d6df 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 6eb7ba5f0092d..46a5bd1611d30 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 786349474389b..72446ecb5cd82 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 849c8b465f99e..e49c8281c6ddb 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 48783d65826aa..7ad92b07094f2 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 40a751a652fa9..126624ecd8922 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git 
a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 36d555066b181..a8bcff8cdc14c 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index cb30e4a4af4bc..6e60fbac9ef5a 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index aa36dd4774d86..7e09e304cb1b7 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index e9b46c4cf0ffa..d8c1792f5b0dd 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 043d13609fd26..d9380ed2c6d4d 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index a906c9e02cd4c..9cb3ed2e7ae1d 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/pom.xml b/pom.xml index 1b37164376460..39525a5219192 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 12dd53b9d2902..8065b058f8c0e 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.0.dev0" +__version__ = "2.3.0" diff --git a/repl/pom.xml b/repl/pom.xml index 1cb0098d0eca3..c21a1cd108a8a 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 7d35aea8a4142..544e08b333aaf 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 70d0c1750b14e..3dc6955ce312c 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 43a7ce95bd3de..7e1397ef9ba79 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 9e2ced30407d4..6b82f57c9e48d 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 93010c606cf45..b221d9ec9cbc5 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 3135a8a275dae..7ade122666a83 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 66fad85ea0263..da34cb8253272 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index fea882ad11230..adbafac9bc458 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 37427e8da62d8..4e005623e2b26 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.0 ../pom.xml From 6bb22961c0c9df1a1f22e9491894895b297f5288 Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Thu, 11 Jan 2018 15:23:17 -0800 Subject: [PATCH 072/566] Preparing development version 2.3.1-SNAPSHOT --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- 
external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 6d46c31906260..29a8a0062873f 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.0 +Version: 2.3.1 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index 2ca9ab66462a8..5c5a8e9b8232e 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 404c744f98261..2a625da682cdc 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 3c0b5288862fa..adb1890439094 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index fe3bcfd68f988..4cdcfa2a4da99 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 90ca4016ecc79..494137f8e9ec1 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 8bb2c38f7d097..32bebf5eeca99 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 0d5d4b1661012..8dc4ddeea3ec0 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index d6136e6c29a84..5613f5e81384c 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 364aaab74e67b..c34c9df39063b 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index aabf3151890f9..213579037f51a 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.0 -SPARK_VERSION_SHORT: 2.3.0 +SPARK_VERSION: 2.3.1-SNAPSHOT +SPARK_VERSION_SHORT: 2.3.1 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 28de7a180c2e7..e584feed1b337 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 9b74a4d327136..6c4031b392f65 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 1da013353775c..6935ade33546a 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index da8ce5928b8bb..60aa5fb1a1031 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 65a9f9972d398..d8ed3364c4bd4 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index f5a2dbbef3b6f..15d550d3bb445 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index e9b8e4e18d6df..7ee7156efe22c 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 46a5bd1611d30..1f28798fd245a 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 72446ecb5cd82..ad3cd0059e183 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index e49c8281c6ddb..53d903fc58aee 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 7ad92b07094f2..4206a1ac4ad99 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 126624ecd8922..7e838812bdc2e 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index a8bcff8cdc14c..ee7eb98e3dfc8 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 6e60fbac9ef5a..2ea2e4a7adbdb 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 7e09e304cb1b7..35ddfac1d6624 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index d8c1792f5b0dd..29c1e53f480a3 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index d9380ed2c6d4d..61eb9dbcf6f7a 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 9cb3ed2e7ae1d..edba0a72ee6a4 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 39525a5219192..a8e448a80c13e 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 8065b058f8c0e..b09fa9822184c 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.0" +__version__ = "2.3.1.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index c21a1cd108a8a..3b445bfaf773d 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 544e08b333aaf..ae1c69f305589 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 3dc6955ce312c..ea31b9c121433 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 7e1397ef9ba79..d42bfff851572 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 6b82f57c9e48d..d053007755dfd 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index b221d9ec9cbc5..ac5efeae37f33 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 7ade122666a83..6e04f528a1c2f 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index da34cb8253272..da50c9bd60fa6 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index adbafac9bc458..6e8e034fc4de4 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 4e005623e2b26..9523085cf5513 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml From 55695c7127cb2f357dfdf677cab4d21fc840aa3d Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Thu, 11 Jan 2018 16:20:30 -0800 Subject: [PATCH 073/566] [SPARK-23008][ML] OnehotEncoderEstimator python API ## What changes were proposed in this pull request? OnehotEncoderEstimator python API. ## How was this patch tested? doctest Author: WeichenXu Closes #20209 from WeichenXu123/ohe_py. (cherry picked from commit b5042d75c2faa5f15bc1e160d75f06dfdd6eea37) Signed-off-by: Joseph K. 
Bradley --- python/pyspark/ml/feature.py | 113 ++++++++++++++++++ .../ml/param/_shared_params_code_gen.py | 1 + python/pyspark/ml/param/shared.py | 23 ++++ 3 files changed, 137 insertions(+) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 13bf95cce40be..b963e45dd7cff 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -45,6 +45,7 @@ 'NGram', 'Normalizer', 'OneHotEncoder', + 'OneHotEncoderEstimator', 'OneHotEncoderModel', 'PCA', 'PCAModel', 'PolynomialExpansion', 'QuantileDiscretizer', @@ -1641,6 +1642,118 @@ def getDropLast(self): return self.getOrDefault(self.dropLast) +@inherit_doc +class OneHotEncoderEstimator(JavaEstimator, HasInputCols, HasOutputCols, HasHandleInvalid, + JavaMLReadable, JavaMLWritable): + """ + A one-hot encoder that maps a column of category indices to a column of binary vectors, with + at most a single one-value per row that indicates the input category index. + For example with 5 categories, an input value of 2.0 would map to an output vector of + `[0.0, 0.0, 1.0, 0.0]`. + The last category is not included by default (configurable via `dropLast`), + because it makes the vector entries sum up to one, and hence linearly dependent. + So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`. + + Note: This is different from scikit-learn's OneHotEncoder, which keeps all categories. + The output vectors are sparse. + + When `handleInvalid` is configured to 'keep', an extra "category" indicating invalid values is + added as last category. So when `dropLast` is true, invalid values are encoded as all-zeros + vector. + + Note: When encoding multi-column by using `inputCols` and `outputCols` params, input/output + cols come in pairs, specified by the order in the arrays, and each pair is treated + independently. + + See `StringIndexer` for converting categorical values into category indices + + >>> from pyspark.ml.linalg import Vectors + >>> df = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], ["input"]) + >>> ohe = OneHotEncoderEstimator(inputCols=["input"], outputCols=["output"]) + >>> model = ohe.fit(df) + >>> model.transform(df).head().output + SparseVector(2, {0: 1.0}) + >>> ohePath = temp_path + "/oheEstimator" + >>> ohe.save(ohePath) + >>> loadedOHE = OneHotEncoderEstimator.load(ohePath) + >>> loadedOHE.getInputCols() == ohe.getInputCols() + True + >>> modelPath = temp_path + "/ohe-model" + >>> model.save(modelPath) + >>> loadedModel = OneHotEncoderModel.load(modelPath) + >>> loadedModel.categorySizes == model.categorySizes + True + + .. versionadded:: 2.3.0 + """ + + handleInvalid = Param(Params._dummy(), "handleInvalid", "How to handle invalid data during " + + "transform(). Options are 'keep' (invalid data presented as an extra " + + "categorical feature) or error (throw an error). 
Note that this Param " + + "is only used during transform; during fitting, invalid data will " + + "result in an error.", + typeConverter=TypeConverters.toString) + + dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category", + typeConverter=TypeConverters.toBoolean) + + @keyword_only + def __init__(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True): + """ + __init__(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True) + """ + super(OneHotEncoderEstimator, self).__init__() + self._java_obj = self._new_java_obj( + "org.apache.spark.ml.feature.OneHotEncoderEstimator", self.uid) + self._setDefault(handleInvalid="error", dropLast=True) + kwargs = self._input_kwargs + self.setParams(**kwargs) + + @keyword_only + @since("2.3.0") + def setParams(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True): + """ + setParams(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True) + Sets params for this OneHotEncoderEstimator. + """ + kwargs = self._input_kwargs + return self._set(**kwargs) + + @since("2.3.0") + def setDropLast(self, value): + """ + Sets the value of :py:attr:`dropLast`. + """ + return self._set(dropLast=value) + + @since("2.3.0") + def getDropLast(self): + """ + Gets the value of dropLast or its default value. + """ + return self.getOrDefault(self.dropLast) + + def _create_model(self, java_model): + return OneHotEncoderModel(java_model) + + +class OneHotEncoderModel(JavaModel, JavaMLReadable, JavaMLWritable): + """ + Model fitted by :py:class:`OneHotEncoderEstimator`. + + .. versionadded:: 2.3.0 + """ + + @property + @since("2.3.0") + def categorySizes(self): + """ + Original number of categories for each feature being encoded. + The array contains one value for each input column, in order. + """ + return self._call_java("categorySizes") + + @inherit_doc class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 1d0f60acc6983..db951d81de1e7 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -119,6 +119,7 @@ def get$Name(self): ("inputCol", "input column name.", None, "TypeConverters.toString"), ("inputCols", "input column names.", None, "TypeConverters.toListString"), ("outputCol", "output column name.", "self.uid + '__output'", "TypeConverters.toString"), + ("outputCols", "output column names.", None, "TypeConverters.toListString"), ("numFeatures", "number of features.", None, "TypeConverters.toInt"), ("checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). " + "E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: " + diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index 813f7a59f3fd1..474c38764e5a1 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -256,6 +256,29 @@ def getOutputCol(self): return self.getOrDefault(self.outputCol) +class HasOutputCols(Params): + """ + Mixin for param outputCols: output column names. + """ + + outputCols = Param(Params._dummy(), "outputCols", "output column names.", typeConverter=TypeConverters.toListString) + + def __init__(self): + super(HasOutputCols, self).__init__() + + def setOutputCols(self, value): + """ + Sets the value of :py:attr:`outputCols`. 
+ """ + return self._set(outputCols=value) + + def getOutputCols(self): + """ + Gets the value of outputCols or its default value. + """ + return self.getOrDefault(self.outputCols) + + class HasNumFeatures(Params): """ Mixin for param numFeatures: number of features. From 3ae3e1bb71aa88be1c963b4416986ef679d7c8a2 Mon Sep 17 00:00:00 2001 From: ho3rexqj Date: Fri, 12 Jan 2018 15:27:00 +0800 Subject: [PATCH 074/566] [SPARK-22986][CORE] Use a cache to avoid instantiating multiple instances of broadcast variable values When resources happen to be constrained on an executor the first time a broadcast variable is instantiated it is persisted to disk by the BlockManager. Consequently, every subsequent call to TorrentBroadcast::readBroadcastBlock from other instances of that broadcast variable spawns another instance of the underlying value. That is, broadcast variables are spawned once per executor **unless** memory is constrained, in which case every instance of a broadcast variable is provided with a unique copy of the underlying value. This patch fixes the above by explicitly caching the underlying values using weak references in a ReferenceMap. Author: ho3rexqj Closes #20183 from ho3rexqj/fix/cache-broadcast-values. (cherry picked from commit cbe7c6fbf9dc2fc422b93b3644c40d449a869eea) Signed-off-by: Wenchen Fan --- .../spark/broadcast/BroadcastManager.scala | 6 ++ .../spark/broadcast/TorrentBroadcast.scala | 72 +++++++++++-------- .../spark/broadcast/BroadcastSuite.scala | 34 +++++++++ 3 files changed, 83 insertions(+), 29 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala b/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala index e88988fe03b2e..8d7a4a353a792 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala @@ -21,6 +21,8 @@ import java.util.concurrent.atomic.AtomicLong import scala.reflect.ClassTag +import org.apache.commons.collections.map.{AbstractReferenceMap, ReferenceMap} + import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.internal.Logging @@ -52,6 +54,10 @@ private[spark] class BroadcastManager( private val nextBroadcastId = new AtomicLong(0) + private[broadcast] val cachedValues = { + new ReferenceMap(AbstractReferenceMap.HARD, AbstractReferenceMap.WEAK) + } + def newBroadcast[T: ClassTag](value_ : T, isLocal: Boolean): Broadcast[T] = { broadcastFactory.newBroadcast[T](value_, isLocal, nextBroadcastId.getAndIncrement()) } diff --git a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala index 7aecd3c9668ea..e125095cf4777 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala @@ -206,36 +206,50 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long) private def readBroadcastBlock(): T = Utils.tryOrIOException { TorrentBroadcast.synchronized { - setConf(SparkEnv.get.conf) - val blockManager = SparkEnv.get.blockManager - blockManager.getLocalValues(broadcastId) match { - case Some(blockResult) => - if (blockResult.data.hasNext) { - val x = blockResult.data.next().asInstanceOf[T] - releaseLock(broadcastId) - x - } else { - throw new SparkException(s"Failed to get locally stored broadcast data: $broadcastId") - } - case None => - logInfo("Started reading broadcast variable " + id) - val 
startTimeMs = System.currentTimeMillis() - val blocks = readBlocks() - logInfo("Reading broadcast variable " + id + " took" + Utils.getUsedTimeMs(startTimeMs)) - - try { - val obj = TorrentBroadcast.unBlockifyObject[T]( - blocks.map(_.toInputStream()), SparkEnv.get.serializer, compressionCodec) - // Store the merged copy in BlockManager so other tasks on this executor don't - // need to re-fetch it. - val storageLevel = StorageLevel.MEMORY_AND_DISK - if (!blockManager.putSingle(broadcastId, obj, storageLevel, tellMaster = false)) { - throw new SparkException(s"Failed to store $broadcastId in BlockManager") + val broadcastCache = SparkEnv.get.broadcastManager.cachedValues + + Option(broadcastCache.get(broadcastId)).map(_.asInstanceOf[T]).getOrElse { + setConf(SparkEnv.get.conf) + val blockManager = SparkEnv.get.blockManager + blockManager.getLocalValues(broadcastId) match { + case Some(blockResult) => + if (blockResult.data.hasNext) { + val x = blockResult.data.next().asInstanceOf[T] + releaseLock(broadcastId) + + if (x != null) { + broadcastCache.put(broadcastId, x) + } + + x + } else { + throw new SparkException(s"Failed to get locally stored broadcast data: $broadcastId") } - obj - } finally { - blocks.foreach(_.dispose()) - } + case None => + logInfo("Started reading broadcast variable " + id) + val startTimeMs = System.currentTimeMillis() + val blocks = readBlocks() + logInfo("Reading broadcast variable " + id + " took" + Utils.getUsedTimeMs(startTimeMs)) + + try { + val obj = TorrentBroadcast.unBlockifyObject[T]( + blocks.map(_.toInputStream()), SparkEnv.get.serializer, compressionCodec) + // Store the merged copy in BlockManager so other tasks on this executor don't + // need to re-fetch it. + val storageLevel = StorageLevel.MEMORY_AND_DISK + if (!blockManager.putSingle(broadcastId, obj, storageLevel, tellMaster = false)) { + throw new SparkException(s"Failed to store $broadcastId in BlockManager") + } + + if (obj != null) { + broadcastCache.put(broadcastId, obj) + } + + obj + } finally { + blocks.foreach(_.dispose()) + } + } } } } diff --git a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala index 159629825c677..9ad2e9a5e74ac 100644 --- a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala +++ b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala @@ -153,6 +153,40 @@ class BroadcastSuite extends SparkFunSuite with LocalSparkContext with Encryptio assert(broadcast.value.sum === 10) } + test("One broadcast value instance per executor") { + val conf = new SparkConf() + .setMaster("local[4]") + .setAppName("test") + + sc = new SparkContext(conf) + val list = List[Int](1, 2, 3, 4) + val broadcast = sc.broadcast(list) + val instances = sc.parallelize(1 to 10) + .map(x => System.identityHashCode(broadcast.value)) + .collect() + .toSet + + assert(instances.size === 1) + } + + test("One broadcast value instance per executor when memory is constrained") { + val conf = new SparkConf() + .setMaster("local[4]") + .setAppName("test") + .set("spark.memory.useLegacyMode", "true") + .set("spark.storage.memoryFraction", "0.0") + + sc = new SparkContext(conf) + val list = List[Int](1, 2, 3, 4) + val broadcast = sc.broadcast(list) + val instances = sc.parallelize(1 to 10) + .map(x => System.identityHashCode(broadcast.value)) + .collect() + .toSet + + assert(instances.size === 1) + } + /** * Verify the persistence of state associated with a TorrentBroadcast in a local-cluster. 
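
As an aside on the caching change in this patch: below is a minimal, hypothetical Java sketch (class and method names are illustrative, not from the patch) of the hard-key / weak-value `ReferenceMap` pattern that the commit adds to `BroadcastManager` and consults in `TorrentBroadcast.readBroadcastBlock`. The point of the WEAK value reference is that a materialized broadcast value is shared by all tasks on an executor while in use, yet can still be reclaimed by the GC once nothing references it.

```java
import org.apache.commons.collections.map.AbstractReferenceMap;
import org.apache.commons.collections.map.ReferenceMap;

// Illustrative sketch only; the real code lives in BroadcastManager/TorrentBroadcast.
public class WeakValueCacheSketch {
  // Keys are held with hard references, values with weak references.
  private final ReferenceMap cachedValues =
      new ReferenceMap(AbstractReferenceMap.HARD, AbstractReferenceMap.WEAK);

  // Return the cached value if it is still alive; otherwise run the expensive
  // load path (in the patch, the BlockManager / remote-fetch path) and cache it.
  public synchronized Object getOrLoad(Object key, java.util.function.Supplier<Object> load) {
    Object cached = cachedValues.get(key);
    if (cached != null) {
      return cached;
    }
    Object value = load.get();
    if (value != null) {
      cachedValues.put(key, value);
    }
    return value;
  }
}
```
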
* From d512d873b3f445845bd113272d7158388427f8a6 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Fri, 12 Jan 2018 11:27:02 +0200 Subject: [PATCH 075/566] [SPARK-23008][ML][FOLLOW-UP] mark OneHotEncoder python API deprecated ## What changes were proposed in this pull request? mark OneHotEncoder python API deprecated ## How was this patch tested? N/A Author: WeichenXu Closes #20241 from WeichenXu123/mark_ohe_deprecated. (cherry picked from commit a7d98d53ceaf69cabaecc6c9113f17438c4e61f6) Signed-off-by: Nick Pentreath --- python/pyspark/ml/feature.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index b963e45dd7cff..eb79b193103e2 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1578,6 +1578,9 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, .. note:: This is different from scikit-learn's OneHotEncoder, which keeps all categories. The output vectors are sparse. + .. note:: Deprecated in 2.3.0. :py:class:`OneHotEncoderEstimator` will be renamed to + :py:class:`OneHotEncoder` and this :py:class:`OneHotEncoder` will be removed in 3.0.0. + .. seealso:: :py:class:`StringIndexer` for converting categorical values into From 6152da3893a05b3f8dc0f13895af9be9548e5895 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 12 Jan 2018 18:04:44 +0800 Subject: [PATCH 076/566] [SPARK-23025][SQL] Support Null type in scala reflection ## What changes were proposed in this pull request? Add support for `Null` type in the `schemaFor` method for Scala reflection. ## How was this patch tested? Added UT Author: Marco Gaido Closes #20219 from mgaido91/SPARK-23025. (cherry picked from commit 505086806997b4331d4a8c2fc5e08345d869a23c) Signed-off-by: gatorsmile --- .../org/apache/spark/sql/catalyst/ScalaReflection.scala | 4 ++++ .../apache/spark/sql/catalyst/ScalaReflectionSuite.scala | 9 +++++++++ .../test/scala/org/apache/spark/sql/DatasetSuite.scala | 5 +++++ 3 files changed, 18 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 65040f1af4b04..9a4bf0075a178 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -63,6 +63,7 @@ object ScalaReflection extends ScalaReflection { private def dataTypeFor(tpe: `Type`): DataType = cleanUpReflectionObjects { tpe.dealias match { + case t if t <:< definitions.NullTpe => NullType case t if t <:< definitions.IntTpe => IntegerType case t if t <:< definitions.LongTpe => LongType case t if t <:< definitions.DoubleTpe => DoubleType @@ -712,6 +713,9 @@ object ScalaReflection extends ScalaReflection { /** Returns a catalyst DataType and its nullability for the given Scala Type using reflection. 
*/ def schemaFor(tpe: `Type`): Schema = cleanUpReflectionObjects { tpe.dealias match { + // this must be the first case, since all objects in scala are instances of Null, therefore + // Null type would wrongly match the first of them, which is Option as of now + case t if t <:< definitions.NullTpe => Schema(NullType, nullable = true) case t if t.typeSymbol.annotations.exists(_.tree.tpe =:= typeOf[SQLUserDefinedType]) => val udt = getClassFromType(t).getAnnotation(classOf[SQLUserDefinedType]).udt().newInstance() Schema(udt, nullable = true) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala index 23e866cdf4917..8c3db48a01f12 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala @@ -356,4 +356,13 @@ class ScalaReflectionSuite extends SparkFunSuite { assert(deserializerFor[Int].isInstanceOf[AssertNotNull]) assert(!deserializerFor[String].isInstanceOf[AssertNotNull]) } + + test("SPARK-23025: schemaFor should support Null type") { + val schema = schemaFor[(Int, Null)] + assert(schema === Schema( + StructType(Seq( + StructField("_1", IntegerType, nullable = false), + StructField("_2", NullType, nullable = true))), + nullable = true)) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index d535896723bd5..54893c184642b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -1441,6 +1441,11 @@ class DatasetSuite extends QueryTest with SharedSQLContext { assert(e.getCause.isInstanceOf[NullPointerException]) } } + + test("SPARK-23025: Add support for null type in scala reflection") { + val data = Seq(("a", null)) + checkDataset(data.toDS(), data: _*) + } } case class SingleData(id: Int) From db27a93652780f234f3c5fe750ef07bc5525d177 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 12 Jan 2018 10:18:42 -0800 Subject: [PATCH 077/566] [MINOR][BUILD] Fix Java linter errors ## What changes were proposed in this pull request? This PR cleans up the java-lint errors (for v2.3.0-rc1 tag). Hopefully, this will be the final one. ``` $ dev/lint-java Using `mvn` from path: /usr/local/bin/mvn Checkstyle checks failed at following occurrences: [ERROR] src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java:[85] (sizes) LineLength: Line is longer than 100 characters (found 101). [ERROR] src/main/java/org/apache/spark/launcher/InProcessAppHandle.java:[20,8] (imports) UnusedImports: Unused import - java.io.IOException. [ERROR] src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java:[41,9] (modifier) ModifierOrder: 'private' modifier out of order with the JLS suggestions. [ERROR] src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java:[464] (sizes) LineLength: Line is longer than 100 characters (found 102). ``` ## How was this patch tested? Manual. ``` $ dev/lint-java Using `mvn` from path: /usr/local/bin/mvn Checkstyle checks passed. ``` Author: Dongjoon Hyun Closes #20242 from dongjoon-hyun/fix_lint_java_2.3_rc1. 
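
For readers unfamiliar with the checkstyle rules named above, here is a small hypothetical illustration (not taken from the Spark sources) of the two non-mechanical patterns fixed in the diff below, ModifierOrder and the 100-character LineLength limit:

```java
public class CheckstyleExamples {
  // ModifierOrder: checkstyle expects the JLS modifier order, e.g. `private final`;
  // `final private boolean isTimestamp;` is flagged.
  private final boolean isTimestamp = true;

  // LineLength: lines over 100 characters are flagged, so long string concatenations
  // and call chains are wrapped across lines, as in the HeapMemoryAllocator and
  // JavaDataFrameSuite hunks below.
  public String describe() {
    return "TMM-allocated pages must first be freed via TMM.freePage(), not directly in " +
        "allocator free()";
  }
}
```
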
(cherry picked from commit 7bd14cfd40500a0b6462cda647bdbb686a430328) Signed-off-by: Sameer Agarwal --- .../org/apache/spark/unsafe/memory/HeapMemoryAllocator.java | 3 ++- .../java/org/apache/spark/launcher/InProcessAppHandle.java | 1 - .../spark/sql/execution/datasources/orc/OrcColumnVector.java | 2 +- .../java/test/org/apache/spark/sql/JavaDataFrameSuite.java | 3 ++- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java index 3acfe3696cb1e..a9603c1aba051 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java @@ -82,7 +82,8 @@ public void free(MemoryBlock memory) { "page has already been freed"; assert ((memory.pageNumber == MemoryBlock.NO_PAGE_NUMBER) || (memory.pageNumber == MemoryBlock.FREED_IN_TMM_PAGE_NUMBER)) : - "TMM-allocated pages must first be freed via TMM.freePage(), not directly in allocator free()"; + "TMM-allocated pages must first be freed via TMM.freePage(), not directly in allocator " + + "free()"; final long size = memory.size(); if (MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED) { diff --git a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java index 0d6a73a3da3ed..acd64c962604f 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java @@ -17,7 +17,6 @@ package org.apache.spark.launcher; -import java.io.IOException; import java.lang.reflect.Method; import java.util.concurrent.atomic.AtomicLong; import java.util.logging.Level; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java index f94c55d860304..b6e792274da11 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java @@ -38,7 +38,7 @@ public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVecto private BytesColumnVector bytesData; private DecimalColumnVector decimalData; private TimestampColumnVector timestampData; - final private boolean isTimestamp; + private final boolean isTimestamp; private int batchSize; diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index 4f8a31f185724..69a2904f5f3fe 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -461,7 +461,8 @@ public void testCircularReferenceBean() { public void testUDF() { UserDefinedFunction foo = udf((Integer i, String s) -> i.toString() + s, DataTypes.StringType); Dataset df = spark.table("testData").select(foo.apply(col("key"), col("value"))); - String[] result = df.collectAsList().stream().map(row -> row.getString(0)).toArray(String[]::new); + String[] result = df.collectAsList().stream().map(row -> row.getString(0)) + .toArray(String[]::new); String[] expected = spark.table("testData").collectAsList().stream() .map(row -> 
row.get(0).toString() + row.getString(1)).toArray(String[]::new); Assert.assertArrayEquals(expected, result); From 02176f4c2f60342068669b215485ffd443346aed Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 12 Jan 2018 11:25:37 -0800 Subject: [PATCH 078/566] [SPARK-22975][SS] MetricsReporter should not throw exception when there was no progress reported ## What changes were proposed in this pull request? `MetricsReporter ` assumes that there has been some progress for the query, ie. `lastProgress` is not null. If this is not true, as it might happen in particular conditions, a `NullPointerException` can be thrown. The PR checks whether there is a `lastProgress` and if this is not true, it returns a default value for the metrics. ## How was this patch tested? added UT Author: Marco Gaido Closes #20189 from mgaido91/SPARK-22975. (cherry picked from commit 54277398afbde92a38ba2802f4a7a3e5910533de) Signed-off-by: Shixiong Zhu --- .../execution/streaming/MetricsReporter.scala | 21 ++++++++--------- .../sql/streaming/StreamingQuerySuite.scala | 23 +++++++++++++++++++ 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetricsReporter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetricsReporter.scala index b84e6ce64c611..66b11ecddf233 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetricsReporter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetricsReporter.scala @@ -17,15 +17,11 @@ package org.apache.spark.sql.execution.streaming -import java.{util => ju} - -import scala.collection.mutable - import com.codahale.metrics.{Gauge, MetricRegistry} import org.apache.spark.internal.Logging import org.apache.spark.metrics.source.{Source => CodahaleSource} -import org.apache.spark.util.Clock +import org.apache.spark.sql.streaming.StreamingQueryProgress /** * Serves metrics from a [[org.apache.spark.sql.streaming.StreamingQuery]] to @@ -39,14 +35,17 @@ class MetricsReporter( // Metric names should not have . 
in them, so that all the metrics of a query are identified // together in Ganglia as a single metric group - registerGauge("inputRate-total", () => stream.lastProgress.inputRowsPerSecond) - registerGauge("processingRate-total", () => stream.lastProgress.processedRowsPerSecond) - registerGauge("latency", () => stream.lastProgress.durationMs.get("triggerExecution").longValue()) - - private def registerGauge[T](name: String, f: () => T)(implicit num: Numeric[T]): Unit = { + registerGauge("inputRate-total", _.inputRowsPerSecond, 0.0) + registerGauge("processingRate-total", _.processedRowsPerSecond, 0.0) + registerGauge("latency", _.durationMs.get("triggerExecution").longValue(), 0L) + + private def registerGauge[T]( + name: String, + f: StreamingQueryProgress => T, + default: T): Unit = { synchronized { metricRegistry.register(name, new Gauge[T] { - override def getValue: T = f() + override def getValue: T = Option(stream.lastProgress).map(f).getOrElse(default) }) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index 2fa4595dab376..76201c63a2701 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -424,6 +424,29 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging wi } } + test("SPARK-22975: MetricsReporter defaults when there was no progress reported") { + withSQLConf("spark.sql.streaming.metricsEnabled" -> "true") { + BlockingSource.latch = new CountDownLatch(1) + withTempDir { tempDir => + val sq = spark.readStream + .format("org.apache.spark.sql.streaming.util.BlockingSource") + .load() + .writeStream + .format("org.apache.spark.sql.streaming.util.BlockingSource") + .option("checkpointLocation", tempDir.toString) + .start() + .asInstanceOf[StreamingQueryWrapper] + .streamingQuery + + val gauges = sq.streamMetrics.metricRegistry.getGauges + assert(gauges.get("latency").getValue.asInstanceOf[Long] == 0) + assert(gauges.get("processingRate-total").getValue.asInstanceOf[Double] == 0.0) + assert(gauges.get("inputRate-total").getValue.asInstanceOf[Double] == 0.0) + sq.stop() + } + } + } + test("input row calculation with mixed batch and streaming sources") { val streamingTriggerDF = spark.createDataset(1 to 10).toDF val streamingInputDF = createSingleTriggerStreamingDF(streamingTriggerDF).toDF("value") From 60bcb4685022c29a6ddcf707b505369687ec7da6 Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Fri, 12 Jan 2018 15:07:14 -0800 Subject: [PATCH 079/566] Revert "[SPARK-22908] Add kafka source and sink for continuous processing." This reverts commit f891ee3249e04576dd579cbab6f8f1632550e6bd. 
--- .../sql/kafka010/KafkaContinuousReader.scala | 232 --------- .../sql/kafka010/KafkaContinuousWriter.scala | 119 ----- .../sql/kafka010/KafkaOffsetReader.scala | 21 +- .../spark/sql/kafka010/KafkaSource.scala | 17 +- .../sql/kafka010/KafkaSourceOffset.scala | 7 +- .../sql/kafka010/KafkaSourceProvider.scala | 105 +--- .../spark/sql/kafka010/KafkaWriteTask.scala | 71 +-- .../spark/sql/kafka010/KafkaWriter.scala | 5 +- .../kafka010/KafkaContinuousSinkSuite.scala | 474 ------------------ .../kafka010/KafkaContinuousSourceSuite.scala | 96 ---- .../sql/kafka010/KafkaContinuousTest.scala | 64 --- .../spark/sql/kafka010/KafkaSourceSuite.scala | 470 ++++++++--------- .../apache/spark/sql/DataFrameReader.scala | 32 +- .../apache/spark/sql/DataFrameWriter.scala | 25 +- .../datasources/v2/WriteToDataSourceV2.scala | 8 +- .../execution/streaming/StreamExecution.scala | 15 +- .../ContinuousDataSourceRDDIter.scala | 3 +- .../continuous/ContinuousExecution.scala | 67 +-- .../continuous/EpochCoordinator.scala | 21 +- .../sql/streaming/DataStreamWriter.scala | 26 +- .../spark/sql/streaming/StreamTest.scala | 36 +- 21 files changed, 383 insertions(+), 1531 deletions(-) delete mode 100644 external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala delete mode 100644 external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala delete mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala delete mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala delete mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala deleted file mode 100644 index 928379544758c..0000000000000 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.kafka010 - -import java.{util => ju} - -import org.apache.kafka.clients.consumer.ConsumerRecord -import org.apache.kafka.common.TopicPartition - -import org.apache.spark.internal.Logging -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.expressions.UnsafeRow -import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter} -import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.kafka010.KafkaSource.{INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE, INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE} -import org.apache.spark.sql.sources.v2.reader._ -import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousDataReader, ContinuousReader, Offset, PartitionOffset} -import org.apache.spark.sql.types.StructType -import org.apache.spark.unsafe.types.UTF8String - -/** - * A [[ContinuousReader]] for data from kafka. - * - * @param offsetReader a reader used to get kafka offsets. Note that the actual data will be - * read by per-task consumers generated later. - * @param kafkaParams String params for per-task Kafka consumers. - * @param sourceOptions The [[org.apache.spark.sql.sources.v2.DataSourceV2Options]] params which - * are not Kafka consumer params. - * @param metadataPath Path to a directory this reader can use for writing metadata. - * @param initialOffsets The Kafka offsets to start reading data at. - * @param failOnDataLoss Flag indicating whether reading should fail in data loss - * scenarios, where some offsets after the specified initial ones can't be - * properly read. - */ -class KafkaContinuousReader( - offsetReader: KafkaOffsetReader, - kafkaParams: ju.Map[String, Object], - sourceOptions: Map[String, String], - metadataPath: String, - initialOffsets: KafkaOffsetRangeLimit, - failOnDataLoss: Boolean) - extends ContinuousReader with SupportsScanUnsafeRow with Logging { - - private lazy val session = SparkSession.getActiveSession.get - private lazy val sc = session.sparkContext - - // Initialized when creating read tasks. If this diverges from the partitions at the latest - // offsets, we need to reconfigure. - // Exposed outside this object only for unit tests. 
- private[sql] var knownPartitions: Set[TopicPartition] = _ - - override def readSchema: StructType = KafkaOffsetReader.kafkaSchema - - private var offset: Offset = _ - override def setOffset(start: ju.Optional[Offset]): Unit = { - offset = start.orElse { - val offsets = initialOffsets match { - case EarliestOffsetRangeLimit => KafkaSourceOffset(offsetReader.fetchEarliestOffsets()) - case LatestOffsetRangeLimit => KafkaSourceOffset(offsetReader.fetchLatestOffsets()) - case SpecificOffsetRangeLimit(p) => offsetReader.fetchSpecificOffsets(p, reportDataLoss) - } - logInfo(s"Initial offsets: $offsets") - offsets - } - } - - override def getStartOffset(): Offset = offset - - override def deserializeOffset(json: String): Offset = { - KafkaSourceOffset(JsonUtils.partitionOffsets(json)) - } - - override def createUnsafeRowReadTasks(): ju.List[ReadTask[UnsafeRow]] = { - import scala.collection.JavaConverters._ - - val oldStartPartitionOffsets = KafkaSourceOffset.getPartitionOffsets(offset) - - val currentPartitionSet = offsetReader.fetchEarliestOffsets().keySet - val newPartitions = currentPartitionSet.diff(oldStartPartitionOffsets.keySet) - val newPartitionOffsets = offsetReader.fetchEarliestOffsets(newPartitions.toSeq) - - val deletedPartitions = oldStartPartitionOffsets.keySet.diff(currentPartitionSet) - if (deletedPartitions.nonEmpty) { - reportDataLoss(s"Some partitions were deleted: $deletedPartitions") - } - - val startOffsets = newPartitionOffsets ++ - oldStartPartitionOffsets.filterKeys(!deletedPartitions.contains(_)) - knownPartitions = startOffsets.keySet - - startOffsets.toSeq.map { - case (topicPartition, start) => - KafkaContinuousReadTask( - topicPartition, start, kafkaParams, failOnDataLoss) - .asInstanceOf[ReadTask[UnsafeRow]] - }.asJava - } - - /** Stop this source and free any resources it has allocated. */ - def stop(): Unit = synchronized { - offsetReader.close() - } - - override def commit(end: Offset): Unit = {} - - override def mergeOffsets(offsets: Array[PartitionOffset]): Offset = { - val mergedMap = offsets.map { - case KafkaSourcePartitionOffset(p, o) => Map(p -> o) - }.reduce(_ ++ _) - KafkaSourceOffset(mergedMap) - } - - override def needsReconfiguration(): Boolean = { - knownPartitions != null && offsetReader.fetchLatestOffsets().keySet != knownPartitions - } - - override def toString(): String = s"KafkaSource[$offsetReader]" - - /** - * If `failOnDataLoss` is true, this method will throw an `IllegalStateException`. - * Otherwise, just log a warning. - */ - private def reportDataLoss(message: String): Unit = { - if (failOnDataLoss) { - throw new IllegalStateException(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE") - } else { - logWarning(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE") - } - } -} - -/** - * A read task for continuous Kafka processing. This will be serialized and transformed into a - * full reader on executors. - * - * @param topicPartition The (topic, partition) pair this task is responsible for. - * @param startOffset The offset to start reading from within the partition. - * @param kafkaParams Kafka consumer params to use. - * @param failOnDataLoss Flag indicating whether data reader should fail if some offsets - * are skipped. 
- */ -case class KafkaContinuousReadTask( - topicPartition: TopicPartition, - startOffset: Long, - kafkaParams: ju.Map[String, Object], - failOnDataLoss: Boolean) extends ReadTask[UnsafeRow] { - override def createDataReader(): KafkaContinuousDataReader = { - new KafkaContinuousDataReader(topicPartition, startOffset, kafkaParams, failOnDataLoss) - } -} - -/** - * A per-task data reader for continuous Kafka processing. - * - * @param topicPartition The (topic, partition) pair this data reader is responsible for. - * @param startOffset The offset to start reading from within the partition. - * @param kafkaParams Kafka consumer params to use. - * @param failOnDataLoss Flag indicating whether data reader should fail if some offsets - * are skipped. - */ -class KafkaContinuousDataReader( - topicPartition: TopicPartition, - startOffset: Long, - kafkaParams: ju.Map[String, Object], - failOnDataLoss: Boolean) extends ContinuousDataReader[UnsafeRow] { - private val topic = topicPartition.topic - private val kafkaPartition = topicPartition.partition - private val consumer = CachedKafkaConsumer.createUncached(topic, kafkaPartition, kafkaParams) - - private val sharedRow = new UnsafeRow(7) - private val bufferHolder = new BufferHolder(sharedRow) - private val rowWriter = new UnsafeRowWriter(bufferHolder, 7) - - private var nextKafkaOffset = startOffset - private var currentRecord: ConsumerRecord[Array[Byte], Array[Byte]] = _ - - override def next(): Boolean = { - var r: ConsumerRecord[Array[Byte], Array[Byte]] = null - while (r == null) { - r = consumer.get( - nextKafkaOffset, - untilOffset = Long.MaxValue, - pollTimeoutMs = Long.MaxValue, - failOnDataLoss) - } - nextKafkaOffset = r.offset + 1 - currentRecord = r - true - } - - override def get(): UnsafeRow = { - bufferHolder.reset() - - if (currentRecord.key == null) { - rowWriter.setNullAt(0) - } else { - rowWriter.write(0, currentRecord.key) - } - rowWriter.write(1, currentRecord.value) - rowWriter.write(2, UTF8String.fromString(currentRecord.topic)) - rowWriter.write(3, currentRecord.partition) - rowWriter.write(4, currentRecord.offset) - rowWriter.write(5, - DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(currentRecord.timestamp))) - rowWriter.write(6, currentRecord.timestampType.id) - sharedRow.setTotalSize(bufferHolder.totalSize) - sharedRow - } - - override def getOffset(): KafkaSourcePartitionOffset = { - KafkaSourcePartitionOffset(topicPartition, nextKafkaOffset) - } - - override def close(): Unit = { - consumer.close() - } -} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala deleted file mode 100644 index 9843f469c5b25..0000000000000 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.kafka010 - -import org.apache.kafka.clients.producer.{Callback, ProducerRecord, RecordMetadata} -import scala.collection.JavaConverters._ - -import org.apache.spark.internal.Logging -import org.apache.spark.sql.{Row, SparkSession} -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal, UnsafeProjection} -import org.apache.spark.sql.kafka010.KafkaSourceProvider.{kafkaParamsForProducer, TOPIC_OPTION_KEY} -import org.apache.spark.sql.kafka010.KafkaWriter.validateQuery -import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter -import org.apache.spark.sql.sources.v2.writer._ -import org.apache.spark.sql.streaming.OutputMode -import org.apache.spark.sql.types.{BinaryType, StringType, StructType} - -/** - * Dummy commit message. The DataSourceV2 framework requires a commit message implementation but we - * don't need to really send one. - */ -case object KafkaWriterCommitMessage extends WriterCommitMessage - -/** - * A [[ContinuousWriter]] for Kafka writing. Responsible for generating the writer factory. - * @param topic The topic this writer is responsible for. If None, topic will be inferred from - * a `topic` field in the incoming data. - * @param producerParams Parameters for Kafka producers in each task. - * @param schema The schema of the input data. - */ -class KafkaContinuousWriter( - topic: Option[String], producerParams: Map[String, String], schema: StructType) - extends ContinuousWriter with SupportsWriteInternalRow { - - validateQuery(schema.toAttributes, producerParams.toMap[String, Object].asJava, topic) - - override def createInternalRowWriterFactory(): KafkaContinuousWriterFactory = - KafkaContinuousWriterFactory(topic, producerParams, schema) - - override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} - override def abort(messages: Array[WriterCommitMessage]): Unit = {} -} - -/** - * A [[DataWriterFactory]] for Kafka writing. Will be serialized and sent to executors to generate - * the per-task data writers. - * @param topic The topic that should be written to. If None, topic will be inferred from - * a `topic` field in the incoming data. - * @param producerParams Parameters for Kafka producers in each task. - * @param schema The schema of the input data. - */ -case class KafkaContinuousWriterFactory( - topic: Option[String], producerParams: Map[String, String], schema: StructType) - extends DataWriterFactory[InternalRow] { - - override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[InternalRow] = { - new KafkaContinuousDataWriter(topic, producerParams, schema.toAttributes) - } -} - -/** - * A [[DataWriter]] for Kafka writing. One data writer will be created in each partition to - * process incoming rows. - * - * @param targetTopic The topic that this data writer is targeting. If None, topic will be inferred - * from a `topic` field in the incoming data. - * @param producerParams Parameters to use for the Kafka producer. - * @param inputSchema The attributes in the input data. 
- */ -class KafkaContinuousDataWriter( - targetTopic: Option[String], producerParams: Map[String, String], inputSchema: Seq[Attribute]) - extends KafkaRowWriter(inputSchema, targetTopic) with DataWriter[InternalRow] { - import scala.collection.JavaConverters._ - - private lazy val producer = CachedKafkaProducer.getOrCreate( - new java.util.HashMap[String, Object](producerParams.asJava)) - - def write(row: InternalRow): Unit = { - checkForErrors() - sendRow(row, producer) - } - - def commit(): WriterCommitMessage = { - // Send is asynchronous, but we can't commit until all rows are actually in Kafka. - // This requires flushing and then checking that no callbacks produced errors. - // We also check for errors before to fail as soon as possible - the check is cheap. - checkForErrors() - producer.flush() - checkForErrors() - KafkaWriterCommitMessage - } - - def abort(): Unit = {} - - def close(): Unit = { - checkForErrors() - if (producer != null) { - producer.flush() - checkForErrors() - CachedKafkaProducer.close(new java.util.HashMap[String, Object](producerParams.asJava)) - } - } -} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala index 551641cfdbca8..3e65949a6fd1b 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala @@ -117,14 +117,10 @@ private[kafka010] class KafkaOffsetReader( * Resolves the specific offsets based on Kafka seek positions. * This method resolves offset value -1 to the latest and -2 to the * earliest Kafka seek position. - * - * @param partitionOffsets the specific offsets to resolve - * @param reportDataLoss callback to either report or log data loss depending on setting */ def fetchSpecificOffsets( - partitionOffsets: Map[TopicPartition, Long], - reportDataLoss: String => Unit): KafkaSourceOffset = { - val fetched = runUninterruptibly { + partitionOffsets: Map[TopicPartition, Long]): Map[TopicPartition, Long] = + runUninterruptibly { withRetriesWithoutInterrupt { // Poll to get the latest assigned partitions consumer.poll(0) @@ -149,19 +145,6 @@ private[kafka010] class KafkaOffsetReader( } } - partitionOffsets.foreach { - case (tp, off) if off != KafkaOffsetRangeLimit.LATEST && - off != KafkaOffsetRangeLimit.EARLIEST => - if (fetched(tp) != off) { - reportDataLoss( - s"startingOffsets for $tp was $off but consumer reset to ${fetched(tp)}") - } - case _ => - // no real way to check that beginning or end is reasonable - } - KafkaSourceOffset(fetched) - } - /** * Fetch the earliest offsets for the topic partitions that are indicated * in the [[ConsumerStrategy]]. 
diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index 27da76068a66f..e9cff04ba5f2e 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -130,7 +130,7 @@ private[kafka010] class KafkaSource( val offsets = startingOffsets match { case EarliestOffsetRangeLimit => KafkaSourceOffset(kafkaReader.fetchEarliestOffsets()) case LatestOffsetRangeLimit => KafkaSourceOffset(kafkaReader.fetchLatestOffsets()) - case SpecificOffsetRangeLimit(p) => kafkaReader.fetchSpecificOffsets(p, reportDataLoss) + case SpecificOffsetRangeLimit(p) => fetchAndVerify(p) } metadataLog.add(0, offsets) logInfo(s"Initial offsets: $offsets") @@ -138,6 +138,21 @@ private[kafka010] class KafkaSource( }.partitionToOffsets } + private def fetchAndVerify(specificOffsets: Map[TopicPartition, Long]) = { + val result = kafkaReader.fetchSpecificOffsets(specificOffsets) + specificOffsets.foreach { + case (tp, off) if off != KafkaOffsetRangeLimit.LATEST && + off != KafkaOffsetRangeLimit.EARLIEST => + if (result(tp) != off) { + reportDataLoss( + s"startingOffsets for $tp was $off but consumer reset to ${result(tp)}") + } + case _ => + // no real way to check that beginning or end is reasonable + } + KafkaSourceOffset(result) + } + private var currentPartitionOffsets: Option[Map[TopicPartition, Long]] = None override def schema: StructType = KafkaOffsetReader.kafkaSchema diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala index c82154cfbad7f..b5da415b3097e 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala @@ -20,22 +20,17 @@ package org.apache.spark.sql.kafka010 import org.apache.kafka.common.TopicPartition import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset} -import org.apache.spark.sql.sources.v2.streaming.reader.{Offset => OffsetV2, PartitionOffset} /** * An [[Offset]] for the [[KafkaSource]]. This one tracks all partitions of subscribed topics and * their offsets. 
*/ private[kafka010] -case class KafkaSourceOffset(partitionToOffsets: Map[TopicPartition, Long]) extends OffsetV2 { +case class KafkaSourceOffset(partitionToOffsets: Map[TopicPartition, Long]) extends Offset { override val json = JsonUtils.partitionOffsets(partitionToOffsets) } -private[kafka010] -case class KafkaSourcePartitionOffset(topicPartition: TopicPartition, partitionOffset: Long) - extends PartitionOffset - /** Companion object of the [[KafkaSourceOffset]] */ private[kafka010] object KafkaSourceOffset { diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala index 3914370a96595..3cb4d8cad12cc 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.kafka010 import java.{util => ju} -import java.util.{Locale, Optional, UUID} +import java.util.{Locale, UUID} import scala.collection.JavaConverters._ @@ -27,12 +27,9 @@ import org.apache.kafka.clients.producer.ProducerConfig import org.apache.kafka.common.serialization.{ByteArrayDeserializer, ByteArraySerializer} import org.apache.spark.internal.Logging -import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SparkSession, SQLContext} -import org.apache.spark.sql.execution.streaming.{Offset, Sink, Source} +import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} +import org.apache.spark.sql.execution.streaming.{Sink, Source} import org.apache.spark.sql.sources._ -import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} -import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, ContinuousWriteSupport} -import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType @@ -46,8 +43,6 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister with StreamSinkProvider with RelationProvider with CreatableRelationProvider - with ContinuousWriteSupport - with ContinuousReadSupport with Logging { import KafkaSourceProvider._ @@ -106,43 +101,6 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister failOnDataLoss(caseInsensitiveParams)) } - override def createContinuousReader( - schema: Optional[StructType], - metadataPath: String, - options: DataSourceV2Options): KafkaContinuousReader = { - val parameters = options.asMap().asScala.toMap - validateStreamOptions(parameters) - // Each running query should use its own group id. Otherwise, the query may be only assigned - // partial data since Kafka will assign partitions to multiple consumers having the same group - // id. Hence, we should generate a unique id for each query. 
- val uniqueGroupId = s"spark-kafka-source-${UUID.randomUUID}-${metadataPath.hashCode}" - - val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } - val specifiedKafkaParams = - parameters - .keySet - .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) - .map { k => k.drop(6).toString -> parameters(k) } - .toMap - - val startingStreamOffsets = KafkaSourceProvider.getKafkaOffsetRangeLimit(caseInsensitiveParams, - STARTING_OFFSETS_OPTION_KEY, LatestOffsetRangeLimit) - - val kafkaOffsetReader = new KafkaOffsetReader( - strategy(caseInsensitiveParams), - kafkaParamsForDriver(specifiedKafkaParams), - parameters, - driverGroupIdPrefix = s"$uniqueGroupId-driver") - - new KafkaContinuousReader( - kafkaOffsetReader, - kafkaParamsForExecutors(specifiedKafkaParams, uniqueGroupId), - parameters, - metadataPath, - startingStreamOffsets, - failOnDataLoss(caseInsensitiveParams)) - } - /** * Returns a new base relation with the given parameters. * @@ -223,22 +181,26 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister } } - override def createContinuousWriter( - queryId: String, - schema: StructType, - mode: OutputMode, - options: DataSourceV2Options): Optional[ContinuousWriter] = { - import scala.collection.JavaConverters._ - - val spark = SparkSession.getActiveSession.get - val topic = Option(options.get(TOPIC_OPTION_KEY).orElse(null)).map(_.trim) - // We convert the options argument from V2 -> Java map -> scala mutable -> scala immutable. - val producerParams = kafkaParamsForProducer(options.asMap.asScala.toMap) - - KafkaWriter.validateQuery( - schema.toAttributes, new java.util.HashMap[String, Object](producerParams.asJava), topic) + private def kafkaParamsForProducer(parameters: Map[String, String]): Map[String, String] = { + val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } + if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}")) { + throw new IllegalArgumentException( + s"Kafka option '${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}' is not supported as keys " + + "are serialized with ByteArraySerializer.") + } - Optional.of(new KafkaContinuousWriter(topic, producerParams, schema)) + if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}")) + { + throw new IllegalArgumentException( + s"Kafka option '${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}' is not supported as " + + "value are serialized with ByteArraySerializer.") + } + parameters + .keySet + .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) + .map { k => k.drop(6).toString -> parameters(k) } + .toMap + (ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, + ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName) } private def strategy(caseInsensitiveParams: Map[String, String]) = @@ -488,27 +450,4 @@ private[kafka010] object KafkaSourceProvider extends Logging { def build(): ju.Map[String, Object] = map } - - private[kafka010] def kafkaParamsForProducer( - parameters: Map[String, String]): Map[String, String] = { - val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } - if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}")) { - throw new IllegalArgumentException( - s"Kafka option '${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}' is not supported as keys " - + "are serialized with ByteArraySerializer.") - } - - if 
(caseInsensitiveParams.contains(s"kafka.${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}")) - { - throw new IllegalArgumentException( - s"Kafka option '${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}' is not supported as " - + "value are serialized with ByteArraySerializer.") - } - parameters - .keySet - .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) - .map { k => k.drop(6).toString -> parameters(k) } - .toMap + (ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, - ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName) - } } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala index baa60febf661d..6fd333e2f43ba 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala @@ -33,8 +33,10 @@ import org.apache.spark.sql.types.{BinaryType, StringType} private[kafka010] class KafkaWriteTask( producerConfiguration: ju.Map[String, Object], inputSchema: Seq[Attribute], - topic: Option[String]) extends KafkaRowWriter(inputSchema, topic) { + topic: Option[String]) { // used to synchronize with Kafka callbacks + @volatile private var failedWrite: Exception = null + private val projection = createProjection private var producer: KafkaProducer[Array[Byte], Array[Byte]] = _ /** @@ -44,7 +46,23 @@ private[kafka010] class KafkaWriteTask( producer = CachedKafkaProducer.getOrCreate(producerConfiguration) while (iterator.hasNext && failedWrite == null) { val currentRow = iterator.next() - sendRow(currentRow, producer) + val projectedRow = projection(currentRow) + val topic = projectedRow.getUTF8String(0) + val key = projectedRow.getBinary(1) + val value = projectedRow.getBinary(2) + if (topic == null) { + throw new NullPointerException(s"null topic present in the data. Use the " + + s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a default topic.") + } + val record = new ProducerRecord[Array[Byte], Array[Byte]](topic.toString, key, value) + val callback = new Callback() { + override def onCompletion(recordMetadata: RecordMetadata, e: Exception): Unit = { + if (failedWrite == null && e != null) { + failedWrite = e + } + } + } + producer.send(record, callback) } } @@ -56,49 +74,8 @@ private[kafka010] class KafkaWriteTask( producer = null } } -} - -private[kafka010] abstract class KafkaRowWriter( - inputSchema: Seq[Attribute], topic: Option[String]) { - - // used to synchronize with Kafka callbacks - @volatile protected var failedWrite: Exception = _ - protected val projection = createProjection - - private val callback = new Callback() { - override def onCompletion(recordMetadata: RecordMetadata, e: Exception): Unit = { - if (failedWrite == null && e != null) { - failedWrite = e - } - } - } - /** - * Send the specified row to the producer, with a callback that will save any exception - * to failedWrite. Note that send is asynchronous; subclasses must flush() their producer before - * assuming the row is in Kafka. 
- */ - protected def sendRow( - row: InternalRow, producer: KafkaProducer[Array[Byte], Array[Byte]]): Unit = { - val projectedRow = projection(row) - val topic = projectedRow.getUTF8String(0) - val key = projectedRow.getBinary(1) - val value = projectedRow.getBinary(2) - if (topic == null) { - throw new NullPointerException(s"null topic present in the data. Use the " + - s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a default topic.") - } - val record = new ProducerRecord[Array[Byte], Array[Byte]](topic.toString, key, value) - producer.send(record, callback) - } - - protected def checkForErrors(): Unit = { - if (failedWrite != null) { - throw failedWrite - } - } - - private def createProjection = { + private def createProjection: UnsafeProjection = { val topicExpression = topic.map(Literal(_)).orElse { inputSchema.find(_.name == KafkaWriter.TOPIC_ATTRIBUTE_NAME) }.getOrElse { @@ -135,5 +112,11 @@ private[kafka010] abstract class KafkaRowWriter( Seq(topicExpression, Cast(keyExpression, BinaryType), Cast(valueExpression, BinaryType)), inputSchema) } + + private def checkForErrors(): Unit = { + if (failedWrite != null) { + throw failedWrite + } + } } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala index 15cd44812cb0c..5e9ae35b3f008 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala @@ -43,9 +43,10 @@ private[kafka010] object KafkaWriter extends Logging { override def toString: String = "KafkaWriter" def validateQuery( - schema: Seq[Attribute], + queryExecution: QueryExecution, kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { + val schema = queryExecution.analyzed.output schema.find(_.name == TOPIC_ATTRIBUTE_NAME).getOrElse( if (topic.isEmpty) { throw new AnalysisException(s"topic option required when no " + @@ -83,7 +84,7 @@ private[kafka010] object KafkaWriter extends Logging { kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { val schema = queryExecution.analyzed.output - validateQuery(schema, kafkaParameters, topic) + validateQuery(queryExecution, kafkaParameters, topic) queryExecution.toRdd.foreachPartition { iter => val writeTask = new KafkaWriteTask(kafkaParameters, schema, topic) Utils.tryWithSafeFinally(block = writeTask.execute(iter))( diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala deleted file mode 100644 index dfc97b1c38bb5..0000000000000 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala +++ /dev/null @@ -1,474 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.kafka010 - -import java.util.Locale -import java.util.concurrent.atomic.AtomicInteger - -import org.apache.kafka.clients.producer.ProducerConfig -import org.apache.kafka.common.serialization.ByteArraySerializer -import org.scalatest.time.SpanSugar._ -import scala.collection.JavaConverters._ - -import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SaveMode} -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, SpecificInternalRow, UnsafeProjection} -import org.apache.spark.sql.execution.streaming.MemoryStream -import org.apache.spark.sql.streaming._ -import org.apache.spark.sql.types.{BinaryType, DataType} -import org.apache.spark.util.Utils - -/** - * This is a temporary port of KafkaSinkSuite, since we do not yet have a V2 memory stream. - * Once we have one, this will be changed to a specialization of KafkaSinkSuite and we won't have - * to duplicate all the code. - */ -class KafkaContinuousSinkSuite extends KafkaContinuousTest { - import testImplicits._ - - override val streamingTimeout = 30.seconds - - override def beforeAll(): Unit = { - super.beforeAll() - testUtils = new KafkaTestUtils( - withBrokerProps = Map("auto.create.topics.enable" -> "false")) - testUtils.setup() - } - - override def afterAll(): Unit = { - if (testUtils != null) { - testUtils.teardown() - testUtils = null - } - super.afterAll() - } - - test("streaming - write to kafka with topic field") { - val inputTopic = newTopic() - testUtils.createTopic(inputTopic, partitions = 1) - - val input = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", inputTopic) - .option("startingOffsets", "earliest") - .load() - - val topic = newTopic() - testUtils.createTopic(topic) - - val writer = createKafkaWriter( - input.toDF(), - withTopic = None, - withOutputMode = Some(OutputMode.Append))( - withSelectExpr = s"'$topic' as topic", "value") - - val reader = createKafkaReader(topic) - .selectExpr("CAST(key as STRING) key", "CAST(value as STRING) value") - .selectExpr("CAST(key as INT) key", "CAST(value as INT) value") - .as[(Int, Int)] - .map(_._2) - - try { - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - failAfter(streamingTimeout) { - writer.processAllAvailable() - } - checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) - testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) - failAfter(streamingTimeout) { - writer.processAllAvailable() - } - checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) - } finally { - writer.stop() - } - } - - test("streaming - write w/o topic field, with topic option") { - val inputTopic = newTopic() - testUtils.createTopic(inputTopic, partitions = 1) - - val input = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", inputTopic) - .option("startingOffsets", "earliest") - .load() - - val topic = newTopic() - testUtils.createTopic(topic) - - val writer = createKafkaWriter( - input.toDF(), - withTopic = Some(topic), - withOutputMode = 
Some(OutputMode.Append()))() - - val reader = createKafkaReader(topic) - .selectExpr("CAST(key as STRING) key", "CAST(value as STRING) value") - .selectExpr("CAST(key as INT) key", "CAST(value as INT) value") - .as[(Int, Int)] - .map(_._2) - - try { - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - failAfter(streamingTimeout) { - writer.processAllAvailable() - } - checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) - testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) - failAfter(streamingTimeout) { - writer.processAllAvailable() - } - checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) - } finally { - writer.stop() - } - } - - test("streaming - topic field and topic option") { - /* The purpose of this test is to ensure that the topic option - * overrides the topic field. We begin by writing some data that - * includes a topic field and value (e.g., 'foo') along with a topic - * option. Then when we read from the topic specified in the option - * we should see the data i.e., the data was written to the topic - * option, and not to the topic in the data e.g., foo - */ - val inputTopic = newTopic() - testUtils.createTopic(inputTopic, partitions = 1) - - val input = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", inputTopic) - .option("startingOffsets", "earliest") - .load() - - val topic = newTopic() - testUtils.createTopic(topic) - - val writer = createKafkaWriter( - input.toDF(), - withTopic = Some(topic), - withOutputMode = Some(OutputMode.Append()))( - withSelectExpr = "'foo' as topic", "CAST(value as STRING) value") - - val reader = createKafkaReader(topic) - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .selectExpr("CAST(key AS INT)", "CAST(value AS INT)") - .as[(Int, Int)] - .map(_._2) - - try { - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - failAfter(streamingTimeout) { - writer.processAllAvailable() - } - checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) - testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) - failAfter(streamingTimeout) { - writer.processAllAvailable() - } - checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) - } finally { - writer.stop() - } - } - - test("null topic attribute") { - val inputTopic = newTopic() - testUtils.createTopic(inputTopic, partitions = 1) - - val input = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", inputTopic) - .option("startingOffsets", "earliest") - .load() - val topic = newTopic() - testUtils.createTopic(topic) - - /* No topic field or topic option */ - var writer: StreamingQuery = null - var ex: Exception = null - try { - ex = intercept[StreamingQueryException] { - writer = createKafkaWriter(input.toDF())( - withSelectExpr = "CAST(null as STRING) as topic", "value" - ) - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - writer.processAllAvailable() - } - } finally { - writer.stop() - } - assert(ex.getCause.getCause.getMessage - .toLowerCase(Locale.ROOT) - .contains("null topic present in the data.")) - } - - test("streaming - write data with bad schema") { - val inputTopic = newTopic() - testUtils.createTopic(inputTopic, partitions = 1) - - val input = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", inputTopic) - .option("startingOffsets", "earliest") - .load() - val topic = newTopic() - 
testUtils.createTopic(topic) - - /* No topic field or topic option */ - var writer: StreamingQuery = null - var ex: Exception = null - try { - ex = intercept[StreamingQueryException] { - writer = createKafkaWriter(input.toDF())( - withSelectExpr = "value as key", "value" - ) - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - writer.processAllAvailable() - } - } finally { - writer.stop() - } - assert(ex.getMessage - .toLowerCase(Locale.ROOT) - .contains("topic option required when no 'topic' attribute is present")) - - try { - /* No value field */ - ex = intercept[StreamingQueryException] { - writer = createKafkaWriter(input.toDF())( - withSelectExpr = s"'$topic' as topic", "value as key" - ) - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - writer.processAllAvailable() - } - } finally { - writer.stop() - } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "required attribute 'value' not found")) - } - - test("streaming - write data with valid schema but wrong types") { - val inputTopic = newTopic() - testUtils.createTopic(inputTopic, partitions = 1) - - val input = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", inputTopic) - .option("startingOffsets", "earliest") - .load() - .selectExpr("CAST(value as STRING) value") - val topic = newTopic() - testUtils.createTopic(topic) - - var writer: StreamingQuery = null - var ex: Exception = null - try { - /* topic field wrong type */ - ex = intercept[StreamingQueryException] { - writer = createKafkaWriter(input.toDF())( - withSelectExpr = s"CAST('1' as INT) as topic", "value" - ) - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - writer.processAllAvailable() - } - } finally { - writer.stop() - } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains("topic type must be a string")) - - try { - /* value field wrong type */ - ex = intercept[StreamingQueryException] { - writer = createKafkaWriter(input.toDF())( - withSelectExpr = s"'$topic' as topic", "CAST(value as INT) as value" - ) - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - writer.processAllAvailable() - } - } finally { - writer.stop() - } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "value attribute type must be a string or binarytype")) - - try { - ex = intercept[StreamingQueryException] { - /* key field wrong type */ - writer = createKafkaWriter(input.toDF())( - withSelectExpr = s"'$topic' as topic", "CAST(value as INT) as key", "value" - ) - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - writer.processAllAvailable() - } - } finally { - writer.stop() - } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "key attribute type must be a string or binarytype")) - } - - test("streaming - write to non-existing topic") { - val inputTopic = newTopic() - testUtils.createTopic(inputTopic, partitions = 1) - - val input = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", inputTopic) - .option("startingOffsets", "earliest") - .load() - val topic = newTopic() - - var writer: StreamingQuery = null - var ex: Exception = null - try { - ex = intercept[StreamingQueryException] { - writer = createKafkaWriter(input.toDF(), withTopic = Some(topic))() - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - eventually(timeout(streamingTimeout)) { - assert(writer.exception.isDefined) - } - throw 
writer.exception.get - } - } finally { - writer.stop() - } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains("job aborted")) - } - - test("streaming - exception on config serializer") { - val inputTopic = newTopic() - testUtils.createTopic(inputTopic, partitions = 1) - testUtils.sendMessages(inputTopic, Array("0")) - - val input = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", inputTopic) - .load() - var writer: StreamingQuery = null - var ex: Exception = null - try { - ex = intercept[StreamingQueryException] { - writer = createKafkaWriter( - input.toDF(), - withOptions = Map("kafka.key.serializer" -> "foo"))() - writer.processAllAvailable() - } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "kafka option 'key.serializer' is not supported")) - } finally { - writer.stop() - } - - try { - ex = intercept[StreamingQueryException] { - writer = createKafkaWriter( - input.toDF(), - withOptions = Map("kafka.value.serializer" -> "foo"))() - writer.processAllAvailable() - } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "kafka option 'value.serializer' is not supported")) - } finally { - writer.stop() - } - } - - test("generic - write big data with small producer buffer") { - /* This test ensures that we understand the semantics of Kafka when - * is comes to blocking on a call to send when the send buffer is full. - * This test will configure the smallest possible producer buffer and - * indicate that we should block when it is full. Thus, no exception should - * be thrown in the case of a full buffer. - */ - val topic = newTopic() - testUtils.createTopic(topic, 1) - val options = new java.util.HashMap[String, String] - options.put("bootstrap.servers", testUtils.brokerAddress) - options.put("buffer.memory", "16384") // min buffer size - options.put("block.on.buffer.full", "true") - options.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[ByteArraySerializer].getName) - options.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[ByteArraySerializer].getName) - val inputSchema = Seq(AttributeReference("value", BinaryType)()) - val data = new Array[Byte](15000) // large value - val writeTask = new KafkaContinuousDataWriter(Some(topic), options.asScala.toMap, inputSchema) - try { - val fieldTypes: Array[DataType] = Array(BinaryType) - val converter = UnsafeProjection.create(fieldTypes) - val row = new SpecificInternalRow(fieldTypes) - row.update(0, data) - val iter = Seq.fill(1000)(converter.apply(row)).iterator - iter.foreach(writeTask.write(_)) - writeTask.commit() - } finally { - writeTask.close() - } - } - - private def createKafkaReader(topic: String): DataFrame = { - spark.read - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("startingOffsets", "earliest") - .option("endingOffsets", "latest") - .option("subscribe", topic) - .load() - } - - private def createKafkaWriter( - input: DataFrame, - withTopic: Option[String] = None, - withOutputMode: Option[OutputMode] = None, - withOptions: Map[String, String] = Map[String, String]()) - (withSelectExpr: String*): StreamingQuery = { - var stream: DataStreamWriter[Row] = null - val checkpointDir = Utils.createTempDir() - var df = input.toDF() - if (withSelectExpr.length > 0) { - df = df.selectExpr(withSelectExpr: _*) - } - stream = df.writeStream - .format("kafka") - .option("checkpointLocation", checkpointDir.getCanonicalPath) - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - // 
We need to reduce blocking time to efficiently test non-existent partition behavior. - .option("kafka.max.block.ms", "1000") - .trigger(Trigger.Continuous(1000)) - .queryName("kafkaStream") - withTopic.foreach(stream.option("topic", _)) - withOutputMode.foreach(stream.outputMode(_)) - withOptions.foreach(opt => stream.option(opt._1, opt._2)) - stream.start() - } -} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala deleted file mode 100644 index b3dade414f625..0000000000000 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.kafka010 - -import java.util.Properties -import java.util.concurrent.atomic.AtomicInteger - -import org.scalatest.time.SpanSugar._ -import scala.collection.mutable -import scala.util.Random - -import org.apache.spark.SparkContext -import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row} -import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation -import org.apache.spark.sql.execution.streaming.StreamExecution -import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution -import org.apache.spark.sql.streaming.{StreamTest, Trigger} -import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession} - -// Run tests in KafkaSourceSuiteBase in continuous execution mode. 
-class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest - -class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest { - import testImplicits._ - - override val brokerProps = Map("auto.create.topics.enable" -> "false") - - test("subscribing topic by pattern with topic deletions") { - val topicPrefix = newTopic() - val topic = topicPrefix + "-seems" - val topic2 = topicPrefix + "-bad" - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, Array("-1")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", s"$topicPrefix-.*") - .option("failOnDataLoss", "false") - - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - val mapped = kafka.map(kv => kv._2.toInt + 1) - - testStream(mapped)( - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(2, 3, 4), - Execute { query => - testUtils.deleteTopic(topic) - testUtils.createTopic(topic2, partitions = 5) - eventually(timeout(streamingTimeout)) { - assert( - query.lastExecution.logical.collectFirst { - case DataSourceV2Relation(_, r: KafkaContinuousReader) => r - }.exists { r => - // Ensure the new topic is present and the old topic is gone. - r.knownPartitions.exists(_.topic == topic2) - }, - s"query never reconfigured to new topic $topic2") - } - }, - AddKafkaData(Set(topic2), 4, 5, 6), - CheckAnswer(2, 3, 4, 5, 6, 7) - ) - } -} - -class KafkaContinuousSourceStressForDontFailOnDataLossSuite - extends KafkaSourceStressForDontFailOnDataLossSuite { - override protected def startStream(ds: Dataset[Int]) = { - ds.writeStream - .format("memory") - .queryName("memory") - .start() - } -} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala deleted file mode 100644 index e713e6695d2bd..0000000000000 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.kafka010 - -import org.apache.spark.SparkContext -import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation -import org.apache.spark.sql.execution.streaming.StreamExecution -import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution -import org.apache.spark.sql.streaming.Trigger -import org.apache.spark.sql.test.TestSparkSession - -// Trait to configure StreamTest for kafka continuous execution tests. -trait KafkaContinuousTest extends KafkaSourceTest { - override val defaultTrigger = Trigger.Continuous(1000) - override val defaultUseV2Sink = true - - // We need more than the default local[2] to be able to schedule all partitions simultaneously. - override protected def createSparkSession = new TestSparkSession( - new SparkContext( - "local[10]", - "continuous-stream-test-sql-context", - sparkConf.set("spark.sql.testkey", "true"))) - - // In addition to setting the partitions in Kafka, we have to wait until the query has - // reconfigured to the new count so the test framework can hook in properly. - override protected def setTopicPartitions( - topic: String, newCount: Int, query: StreamExecution) = { - testUtils.addPartitions(topic, newCount) - eventually(timeout(streamingTimeout)) { - assert( - query.lastExecution.logical.collectFirst { - case DataSourceV2Relation(_, r: KafkaContinuousReader) => r - }.exists(_.knownPartitions.size == newCount), - s"query never reconfigured to $newCount partitions") - } - } - - test("ensure continuous stream is being used") { - val query = spark.readStream - .format("rate") - .option("numPartitions", "1") - .option("rowsPerSecond", "1") - .load() - - testStream(query)( - Execute(q => assert(q.isInstanceOf[ContinuousExecution])) - ) - } -} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index d66908f86ccc7..2034b9be07f24 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -34,14 +34,11 @@ import org.scalatest.concurrent.PatienceConfiguration.Timeout import org.scalatest.time.SpanSugar._ import org.apache.spark.SparkContext -import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row} -import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, WriteToDataSourceV2Exec} +import org.apache.spark.sql.ForeachWriter import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution -import org.apache.spark.sql.execution.streaming.sources.ContinuousMemoryWriter import org.apache.spark.sql.functions.{count, window} import org.apache.spark.sql.kafka010.KafkaSourceProvider._ -import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest, Trigger} +import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest} import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession} import org.apache.spark.util.Utils @@ -52,11 +49,9 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { override val streamingTimeout = 30.seconds - protected val brokerProps = Map[String, Object]() - override def beforeAll(): Unit = { super.beforeAll() - testUtils = new KafkaTestUtils(brokerProps) + testUtils = new KafkaTestUtils 
testUtils.setup() } @@ -64,25 +59,18 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { if (testUtils != null) { testUtils.teardown() testUtils = null + super.afterAll() } - super.afterAll() } protected def makeSureGetOffsetCalled = AssertOnQuery { q => // Because KafkaSource's initialPartitionOffsets is set lazily, we need to make sure - // its "getOffset" is called before pushing any data. Otherwise, because of the race condition, + // its "getOffset" is called before pushing any data. Otherwise, because of the race contion, // we don't know which data should be fetched when `startingOffsets` is latest. - q match { - case c: ContinuousExecution => c.awaitEpoch(0) - case m: MicroBatchExecution => m.processAllAvailable() - } + q.processAllAvailable() true } - protected def setTopicPartitions(topic: String, newCount: Int, query: StreamExecution) : Unit = { - testUtils.addPartitions(topic, newCount) - } - /** * Add data to Kafka. * @@ -94,7 +82,7 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { message: String = "", topicAction: (String, Option[Int]) => Unit = (_, _) => {}) extends AddData { - override def addData(query: Option[StreamExecution]): (BaseStreamingSource, Offset) = { + override def addData(query: Option[StreamExecution]): (Source, Offset) = { if (query.get.isActive) { // Make sure no Spark job is running when deleting a topic query.get.processAllAvailable() @@ -109,18 +97,16 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { topicAction(existingTopicPartitions._1, Some(existingTopicPartitions._2)) } + // Read all topics again in case some topics are delete. + val allTopics = testUtils.getAllTopicsAndPartitionSize().toMap.keys require( query.nonEmpty, "Cannot add data when there is no query for finding the active kafka source") val sources = query.get.logicalPlan.collect { - case StreamingExecutionRelation(source: KafkaSource, _) => source - } ++ (query.get.lastExecution match { - case null => Seq() - case e => e.logical.collect { - case DataSourceV2Relation(_, reader: KafkaContinuousReader) => reader - } - }) + case StreamingExecutionRelation(source, _) if source.isInstanceOf[KafkaSource] => + source.asInstanceOf[KafkaSource] + } if (sources.isEmpty) { throw new Exception( "Could not find Kafka source in the StreamExecution logical plan to add data to") @@ -151,158 +137,14 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { override def toString: String = s"AddKafkaData(topics = $topics, data = $data, message = $message)" } - - private val topicId = new AtomicInteger(0) - protected def newTopic(): String = s"topic-${topicId.getAndIncrement()}" } -class KafkaMicroBatchSourceSuite extends KafkaSourceSuiteBase { - - import testImplicits._ - - test("(de)serialization of initial offsets") { - val topic = newTopic() - testUtils.createTopic(topic, partitions = 5) - - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", topic) - - testStream(reader.load)( - makeSureGetOffsetCalled, - StopStream, - StartStream(), - StopStream) - } - - test("maxOffsetsPerTrigger") { - val topic = newTopic() - testUtils.createTopic(topic, partitions = 3) - testUtils.sendMessages(topic, (100 to 200).map(_.toString).toArray, Some(0)) - testUtils.sendMessages(topic, (10 to 20).map(_.toString).toArray, Some(1)) - testUtils.sendMessages(topic, Array("1"), Some(2)) - - val reader = spark - .readStream - .format("kafka") - 
.option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("maxOffsetsPerTrigger", 10) - .option("subscribe", topic) - .option("startingOffsets", "earliest") - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - val mapped: org.apache.spark.sql.Dataset[_] = kafka.map(kv => kv._2.toInt) - - val clock = new StreamManualClock - - val waitUntilBatchProcessed = AssertOnQuery { q => - eventually(Timeout(streamingTimeout)) { - if (!q.exception.isDefined) { - assert(clock.isStreamWaitingAt(clock.getTimeMillis())) - } - } - if (q.exception.isDefined) { - throw q.exception.get - } - true - } - - testStream(mapped)( - StartStream(ProcessingTime(100), clock), - waitUntilBatchProcessed, - // 1 from smallest, 1 from middle, 8 from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107), - AdvanceManualClock(100), - waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116 - ), - StopStream, - StartStream(ProcessingTime(100), clock), - waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, - 12, 117, 118, 119, 120, 121, 122, 123, 124, 125 - ), - AdvanceManualClock(100), - waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, - 12, 117, 118, 119, 120, 121, 122, 123, 124, 125, - 13, 126, 127, 128, 129, 130, 131, 132, 133, 134 - ) - ) - } - - test("input row metrics") { - val topic = newTopic() - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, Array("-1")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - - val kafka = spark - .readStream - .format("kafka") - .option("subscribe", topic) - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - - val mapped = kafka.map(kv => kv._2.toInt + 1) - testStream(mapped)( - StartStream(trigger = ProcessingTime(1)), - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(2, 3, 4), - AssertOnQuery { query => - val recordsRead = query.recentProgress.map(_.numInputRows).sum - recordsRead == 3 - } - ) - } - - test("subscribing topic by pattern with topic deletions") { - val topicPrefix = newTopic() - val topic = topicPrefix + "-seems" - val topic2 = topicPrefix + "-bad" - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, Array("-1")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", s"$topicPrefix-.*") - .option("failOnDataLoss", "false") +class KafkaSourceSuite extends KafkaSourceTest { - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - val mapped = kafka.map(kv => kv._2.toInt + 1) + import testImplicits._ - testStream(mapped)( - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(2, 3, 4), - Assert { - testUtils.deleteTopic(topic) - 
testUtils.createTopic(topic2, partitions = 5) - true - }, - AddKafkaData(Set(topic2), 4, 5, 6), - CheckAnswer(2, 3, 4, 5, 6, 7) - ) - } + private val topicId = new AtomicInteger(0) testWithUninterruptibleThread( "deserialization of initial offset with Spark 2.1.0") { @@ -395,51 +237,86 @@ class KafkaMicroBatchSourceSuite extends KafkaSourceSuiteBase { } } - test("KafkaSource with watermark") { - val now = System.currentTimeMillis() + test("(de)serialization of initial offsets") { val topic = newTopic() - testUtils.createTopic(newTopic(), partitions = 1) - testUtils.sendMessages(topic, Array(1).map(_.toString)) + testUtils.createTopic(topic, partitions = 64) - val kafka = spark + val reader = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("startingOffsets", s"earliest") .option("subscribe", topic) - .load() - - val windowedAggregation = kafka - .withWatermark("timestamp", "10 seconds") - .groupBy(window($"timestamp", "5 seconds") as 'window) - .agg(count("*") as 'count) - .select($"window".getField("start") as 'window, $"count") - val query = windowedAggregation - .writeStream - .format("memory") - .outputMode("complete") - .queryName("kafkaWatermark") - .start() - query.processAllAvailable() - val rows = spark.table("kafkaWatermark").collect() - assert(rows.length === 1, s"Unexpected results: ${rows.toList}") - val row = rows(0) - // We cannot check the exact window start time as it depands on the time that messages were - // inserted by the producer. So here we just use a low bound to make sure the internal - // conversion works. - assert( - row.getAs[java.sql.Timestamp]("window").getTime >= now - 5 * 1000, - s"Unexpected results: $row") - assert(row.getAs[Int]("count") === 1, s"Unexpected results: $row") - query.stop() + testStream(reader.load)( + makeSureGetOffsetCalled, + StopStream, + StartStream(), + StopStream) } -} -class KafkaSourceSuiteBase extends KafkaSourceTest { + test("maxOffsetsPerTrigger") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 3) + testUtils.sendMessages(topic, (100 to 200).map(_.toString).toArray, Some(0)) + testUtils.sendMessages(topic, (10 to 20).map(_.toString).toArray, Some(1)) + testUtils.sendMessages(topic, Array("1"), Some(2)) - import testImplicits._ + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("maxOffsetsPerTrigger", 10) + .option("subscribe", topic) + .option("startingOffsets", "earliest") + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped: org.apache.spark.sql.Dataset[_] = kafka.map(kv => kv._2.toInt) + + val clock = new StreamManualClock + + val waitUntilBatchProcessed = AssertOnQuery { q => + eventually(Timeout(streamingTimeout)) { + if (!q.exception.isDefined) { + assert(clock.isStreamWaitingAt(clock.getTimeMillis())) + } + } + if (q.exception.isDefined) { + throw q.exception.get + } + true + } + + testStream(mapped)( + StartStream(ProcessingTime(100), clock), + waitUntilBatchProcessed, + // 1 from smallest, 1 from middle, 8 from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116 + ), 
+ StopStream, + StartStream(ProcessingTime(100), clock), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 12, 117, 118, 119, 120, 121, 122, 123, 124, 125 + ), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 12, 117, 118, 119, 120, 121, 122, 123, 124, 125, + 13, 126, 127, 128, 129, 130, 131, 132, 133, 134 + ) + ) + } test("cannot stop Kafka stream") { val topic = newTopic() @@ -451,7 +328,7 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { .format("kafka") .option("kafka.bootstrap.servers", testUtils.brokerAddress) .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", s"$topic.*") + .option("subscribePattern", s"topic-.*") val kafka = reader.load() .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") @@ -545,6 +422,65 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { } } + test("subscribing topic by pattern with topic deletions") { + val topicPrefix = newTopic() + val topic = topicPrefix + "-seems" + val topic2 = topicPrefix + "-bad" + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("subscribePattern", s"$topicPrefix-.*") + .option("failOnDataLoss", "false") + + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped = kafka.map(kv => kv._2.toInt + 1) + + testStream(mapped)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + Assert { + testUtils.deleteTopic(topic) + testUtils.createTopic(topic2, partitions = 5) + true + }, + AddKafkaData(Set(topic2), 4, 5, 6), + CheckAnswer(2, 3, 4, 5, 6, 7) + ) + } + + test("starting offset is latest by default") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("0")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", topic) + + val kafka = reader.load() + .selectExpr("CAST(value AS STRING)") + .as[String] + val mapped = kafka.map(_.toInt) + + testStream(mapped)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(1, 2, 3) // should not have 0 + ) + } + test("bad source options") { def testBadOptions(options: (String, String)*)(expectedMsgs: String*): Unit = { val ex = intercept[IllegalArgumentException] { @@ -604,6 +540,34 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { testUnsupportedConfig("kafka.auto.offset.reset", "latest") } + test("input row metrics") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val kafka = spark + .readStream + .format("kafka") + .option("subscribe", topic) + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, 
String)] + + val mapped = kafka.map(kv => kv._2.toInt + 1) + testStream(mapped)( + StartStream(trigger = ProcessingTime(1)), + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + AssertOnQuery { query => + val recordsRead = query.recentProgress.map(_.numInputRows).sum + recordsRead == 3 + } + ) + } + test("delete a topic when a Spark job is running") { KafkaSourceSuite.collectedData.clear() @@ -665,6 +629,8 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { } } + private def newTopic(): String = s"topic-${topicId.getAndIncrement()}" + private def assignString(topic: String, partitions: Iterable[Int]): String = { JsonUtils.partitions(partitions.map(p => new TopicPartition(topic, p))) } @@ -710,10 +676,6 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { testStream(mapped)( makeSureGetOffsetCalled, - Execute { q => - // wait to reach the last offset in every partition - q.awaitOffset(0, KafkaSourceOffset(partitionOffsets.mapValues(_ => 3L))) - }, CheckAnswer(-20, -21, -22, 0, 1, 2, 11, 12, 22), StopStream, StartStream(), @@ -744,7 +706,6 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { .format("memory") .outputMode("append") .queryName("kafkaColumnTypes") - .trigger(defaultTrigger) .start() query.processAllAvailable() val rows = spark.table("kafkaColumnTypes").collect() @@ -762,6 +723,47 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { query.stop() } + test("KafkaSource with watermark") { + val now = System.currentTimeMillis() + val topic = newTopic() + testUtils.createTopic(newTopic(), partitions = 1) + testUtils.sendMessages(topic, Array(1).map(_.toString)) + + val kafka = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("startingOffsets", s"earliest") + .option("subscribe", topic) + .load() + + val windowedAggregation = kafka + .withWatermark("timestamp", "10 seconds") + .groupBy(window($"timestamp", "5 seconds") as 'window) + .agg(count("*") as 'count) + .select($"window".getField("start") as 'window, $"count") + + val query = windowedAggregation + .writeStream + .format("memory") + .outputMode("complete") + .queryName("kafkaWatermark") + .start() + query.processAllAvailable() + val rows = spark.table("kafkaWatermark").collect() + assert(rows.length === 1, s"Unexpected results: ${rows.toList}") + val row = rows(0) + // We cannot check the exact window start time as it depands on the time that messages were + // inserted by the producer. So here we just use a low bound to make sure the internal + // conversion works. 
+ assert( + row.getAs[java.sql.Timestamp]("window").getTime >= now - 5 * 1000, + s"Unexpected results: $row") + assert(row.getAs[Int]("count") === 1, s"Unexpected results: $row") + query.stop() + } + private def testFromLatestOffsets( topic: String, addPartitions: Boolean, @@ -798,7 +800,9 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { AddKafkaData(Set(topic), 7, 8), CheckAnswer(2, 3, 4, 5, 6, 7, 8, 9), AssertOnQuery("Add partitions") { query: StreamExecution => - if (addPartitions) setTopicPartitions(topic, 10, query) + if (addPartitions) { + testUtils.addPartitions(topic, 10) + } true }, AddKafkaData(Set(topic), 9, 10, 11, 12, 13, 14, 15, 16), @@ -839,7 +843,9 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { StartStream(), CheckAnswer(2, 3, 4, 5, 6, 7, 8, 9), AssertOnQuery("Add partitions") { query: StreamExecution => - if (addPartitions) setTopicPartitions(topic, 10, query) + if (addPartitions) { + testUtils.addPartitions(topic, 10) + } true }, AddKafkaData(Set(topic), 9, 10, 11, 12, 13, 14, 15, 16), @@ -971,23 +977,6 @@ class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with Shared } } - protected def startStream(ds: Dataset[Int]) = { - ds.writeStream.foreach(new ForeachWriter[Int] { - - override def open(partitionId: Long, version: Long): Boolean = { - true - } - - override def process(value: Int): Unit = { - // Slow down the processing speed so that messages may be aged out. - Thread.sleep(Random.nextInt(500)) - } - - override def close(errorOrNull: Throwable): Unit = { - } - }).start() - } - test("stress test for failOnDataLoss=false") { val reader = spark .readStream @@ -1001,7 +990,20 @@ class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with Shared val kafka = reader.load() .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") .as[(String, String)] - val query = startStream(kafka.map(kv => kv._2.toInt)) + val query = kafka.map(kv => kv._2.toInt).writeStream.foreach(new ForeachWriter[Int] { + + override def open(partitionId: Long, version: Long): Boolean = { + true + } + + override def process(value: Int): Unit = { + // Slow down the processing speed so that messages may be aged out. + Thread.sleep(Random.nextInt(500)) + } + + override def close(errorOrNull: Throwable): Unit = { + } + }).start() val testTime = 1.minutes val startTime = System.currentTimeMillis() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index b714a46b5f786..e8d683a578f35 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -191,9 +191,6 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { ds = ds.asInstanceOf[DataSourceV2], conf = sparkSession.sessionState.conf)).asJava) - // Streaming also uses the data source V2 API. So it may be that the data source implements - // v2, but has no v2 implementation for batch reads. In that case, we fall back to loading - // the dataframe as a v1 source. 
val reader = (ds, userSpecifiedSchema) match { case (ds: ReadSupportWithSchema, Some(schema)) => ds.createReader(schema, options) @@ -211,30 +208,23 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } reader - case _ => null // fall back to v1 + case _ => + throw new AnalysisException(s"$cls does not support data reading.") } - if (reader == null) { - loadV1Source(paths: _*) - } else { - Dataset.ofRows(sparkSession, DataSourceV2Relation(reader)) - } + Dataset.ofRows(sparkSession, DataSourceV2Relation(reader)) } else { - loadV1Source(paths: _*) + // Code path for data source v1. + sparkSession.baseRelationToDataFrame( + DataSource.apply( + sparkSession, + paths = paths, + userSpecifiedSchema = userSpecifiedSchema, + className = source, + options = extraOptions.toMap).resolveRelation()) } } - private def loadV1Source(paths: String*) = { - // Code path for data source v1. - sparkSession.baseRelationToDataFrame( - DataSource.apply( - sparkSession, - paths = paths, - userSpecifiedSchema = userSpecifiedSchema, - className = source, - options = extraOptions.toMap).resolveRelation()) - } - /** * Construct a `DataFrame` representing the database table accessible via JDBC URL * url named table and connection properties. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 97f12ff625c42..3304f368e1050 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -255,24 +255,17 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } } - // Streaming also uses the data source V2 API. So it may be that the data source implements - // v2, but has no v2 implementation for batch writes. In that case, we fall back to saving - // as though it's a V1 source. - case _ => saveToV1Source() + case _ => throw new AnalysisException(s"$cls does not support data writing.") } } else { - saveToV1Source() - } - } - - private def saveToV1Source(): Unit = { - // Code path for data source v1. - runCommand(df.sparkSession, "save") { - DataSource( - sparkSession = df.sparkSession, - className = source, - partitionColumns = partitioningColumns.getOrElse(Nil), - options = extraOptions.toMap).planForWriting(mode, AnalysisBarrier(df.logicalPlan)) + // Code path for data source v1. 
+ runCommand(df.sparkSession, "save") { + DataSource( + sparkSession = df.sparkSession, + className = source, + partitionColumns = partitioningColumns.getOrElse(Nil), + options = extraOptions.toMap).planForWriting(mode, AnalysisBarrier(df.logicalPlan)) + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala index a4a857f2d4d9b..f0bdf84bb7a84 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala @@ -81,11 +81,9 @@ case class WriteToDataSourceV2Exec(writer: DataSourceV2Writer, query: SparkPlan) (index, message: WriterCommitMessage) => messages(index) = message ) - if (!writer.isInstanceOf[ContinuousWriter]) { - logInfo(s"Data source writer $writer is committing.") - writer.commit(messages) - logInfo(s"Data source writer $writer committed.") - } + logInfo(s"Data source writer $writer is committing.") + writer.commit(messages) + logInfo(s"Data source writer $writer committed.") } catch { case _: InterruptedException if writer.isInstanceOf[ContinuousWriter] => // Interruption is how continuous queries are ended, so accept and ignore the exception. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index cf27e1a70650a..24a8b000df0c1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -142,8 +142,7 @@ abstract class StreamExecution( override val id: UUID = UUID.fromString(streamMetadata.id) - override def runId: UUID = currentRunId - protected var currentRunId = UUID.randomUUID + override val runId: UUID = UUID.randomUUID /** * Pretty identified string of printing in logs. Format is @@ -419,17 +418,11 @@ abstract class StreamExecution( * Blocks the current thread until processing for data from the given `source` has reached at * least the given `Offset`. This method is intended for use primarily when writing tests. */ - private[sql] def awaitOffset(sourceIndex: Int, newOffset: Offset): Unit = { + private[sql] def awaitOffset(source: BaseStreamingSource, newOffset: Offset): Unit = { assertAwaitThread() def notDone = { val localCommittedOffsets = committedOffsets - if (sources == null) { - // sources might not be initialized yet - false - } else { - val source = sources(sourceIndex) - !localCommittedOffsets.contains(source) || localCommittedOffsets(source) != newOffset - } + !localCommittedOffsets.contains(source) || localCommittedOffsets(source) != newOffset } while (notDone) { @@ -443,7 +436,7 @@ abstract class StreamExecution( awaitProgressLock.unlock() } } - logDebug(s"Unblocked at $newOffset for ${sources(sourceIndex)}") + logDebug(s"Unblocked at $newOffset for $source") } /** A flag to indicate that a batch has completed with no new data available. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala index e700aa4f9aea7..d79e4bd65f563 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala @@ -77,6 +77,7 @@ class ContinuousDataSourceRDD( dataReaderThread.start() context.addTaskCompletionListener(_ => { + reader.close() dataReaderThread.interrupt() epochPollExecutor.shutdown() }) @@ -200,8 +201,6 @@ class DataReaderThread( failedFlag.set(true) // Don't rethrow the exception in this thread. It's not needed, and the default Spark // exception handler will kill the executor. - } finally { - reader.close() } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index 667410ef9f1c6..9657b5e26d770 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -17,9 +17,7 @@ package org.apache.spark.sql.execution.streaming.continuous -import java.util.UUID import java.util.concurrent.TimeUnit -import java.util.function.UnaryOperator import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, Map => MutableMap} @@ -54,7 +52,7 @@ class ContinuousExecution( sparkSession, name, checkpointRoot, analyzedPlan, sink, trigger, triggerClock, outputMode, deleteCheckpointOnStop) { - @volatile protected var continuousSources: Seq[ContinuousReader] = _ + @volatile protected var continuousSources: Seq[ContinuousReader] = Seq.empty override protected def sources: Seq[BaseStreamingSource] = continuousSources override lazy val logicalPlan: LogicalPlan = { @@ -80,17 +78,15 @@ class ContinuousExecution( } override protected def runActivatedStream(sparkSessionForStream: SparkSession): Unit = { - val stateUpdate = new UnaryOperator[State] { - override def apply(s: State) = s match { - // If we ended the query to reconfigure, reset the state to active. - case RECONFIGURING => ACTIVE - case _ => s - } - } - do { - runContinuous(sparkSessionForStream) - } while (state.updateAndGet(stateUpdate) == ACTIVE) + try { + runContinuous(sparkSessionForStream) + } catch { + case _: InterruptedException if state.get().equals(RECONFIGURING) => + // swallow exception and run again + state.set(ACTIVE) + } + } while (state.get() == ACTIVE) } /** @@ -124,16 +120,12 @@ class ContinuousExecution( } committedOffsets = nextOffsets.toStreamProgress(sources) - // Get to an epoch ID that has definitely never been sent to a sink before. Since sink - // commit happens between offset log write and commit log write, this means an epoch ID - // which is not in the offset log. - val (latestOffsetEpoch, _) = offsetLog.getLatest().getOrElse { - throw new IllegalStateException( - s"Offset log had no latest element. This shouldn't be possible because nextOffsets is" + - s"an element.") - } - currentBatchId = latestOffsetEpoch + 1 + // Forcibly align commit and offset logs by slicing off any spurious offset logs from + // a previous run. 
We can't allow commits to an epoch that a previous run reached but + // this run has not. + offsetLog.purgeAfter(latestEpochId) + currentBatchId = latestEpochId + 1 logDebug(s"Resuming at epoch $currentBatchId with committed offsets $committedOffsets") nextOffsets case None => @@ -149,7 +141,6 @@ class ContinuousExecution( * @param sparkSessionForQuery Isolated [[SparkSession]] to run the continuous query with. */ private def runContinuous(sparkSessionForQuery: SparkSession): Unit = { - currentRunId = UUID.randomUUID // A list of attributes that will need to be updated. val replacements = new ArrayBuffer[(Attribute, Attribute)] // Translate from continuous relation to the underlying data source. @@ -234,11 +225,13 @@ class ContinuousExecution( triggerExecutor.execute(() => { startTrigger() - if (reader.needsReconfiguration() && state.compareAndSet(ACTIVE, RECONFIGURING)) { + if (reader.needsReconfiguration()) { + state.set(RECONFIGURING) stopSources() if (queryExecutionThread.isAlive) { sparkSession.sparkContext.cancelJobGroup(runId.toString) queryExecutionThread.interrupt() + // No need to join - this thread is about to end anyway. } false } else if (isActive) { @@ -266,7 +259,6 @@ class ContinuousExecution( sparkSessionForQuery, lastExecution)(lastExecution.toRdd) } } finally { - epochEndpoint.askSync[Unit](StopContinuousExecutionWrites) SparkEnv.get.rpcEnv.stop(epochEndpoint) epochUpdateThread.interrupt() @@ -281,22 +273,17 @@ class ContinuousExecution( epoch: Long, reader: ContinuousReader, partitionOffsets: Seq[PartitionOffset]): Unit = { assert(continuousSources.length == 1, "only one continuous source supported currently") - val globalOffset = reader.mergeOffsets(partitionOffsets.toArray) - val oldOffset = synchronized { - offsetLog.add(epoch, OffsetSeq.fill(globalOffset)) - offsetLog.get(epoch - 1) + if (partitionOffsets.contains(null)) { + // If any offset is null, that means the corresponding partition hasn't seen any data yet, so + // there's nothing meaningful to add to the offset log. } - - // If offset hasn't changed since last epoch, there's been no new data. - if (oldOffset.contains(OffsetSeq.fill(globalOffset))) { - noNewData = true - } - - awaitProgressLock.lock() - try { - awaitProgressLockCondition.signalAll() - } finally { - awaitProgressLock.unlock() + val globalOffset = reader.mergeOffsets(partitionOffsets.toArray) + synchronized { + if (queryExecutionThread.isAlive) { + offsetLog.add(epoch, OffsetSeq.fill(globalOffset)) + } else { + return + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala index 40dcbecade814..98017c3ac6a33 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala @@ -39,15 +39,6 @@ private[continuous] sealed trait EpochCoordinatorMessage extends Serializable */ private[sql] case object IncrementAndGetEpoch extends EpochCoordinatorMessage -/** - * The RpcEndpoint stop() will wait to clear out the message queue before terminating the - * object. This can lead to a race condition where the query restarts at epoch n, a new - * EpochCoordinator starts at epoch n, and then the old epoch coordinator commits epoch n + 1. 
- * The framework doesn't provide a handle to wait on the message queue, so we use a synchronous - * message to stop any writes to the ContinuousExecution object. - */ -private[sql] case object StopContinuousExecutionWrites extends EpochCoordinatorMessage - // Init messages /** * Set the reader and writer partition counts. Tasks may not be started until the coordinator @@ -125,8 +116,6 @@ private[continuous] class EpochCoordinator( override val rpcEnv: RpcEnv) extends ThreadSafeRpcEndpoint with Logging { - private var queryWritesStopped: Boolean = false - private var numReaderPartitions: Int = _ private var numWriterPartitions: Int = _ @@ -158,16 +147,12 @@ private[continuous] class EpochCoordinator( partitionCommits.remove(k) } for (k <- partitionOffsets.keys.filter { case (e, _) => e < epoch }) { - partitionOffsets.remove(k) + partitionCommits.remove(k) } } } override def receive: PartialFunction[Any, Unit] = { - // If we just drop these messages, we won't do any writes to the query. The lame duck tasks - // won't shed errors or anything. - case _ if queryWritesStopped => () - case CommitPartitionEpoch(partitionId, epoch, message) => logDebug(s"Got commit from partition $partitionId at epoch $epoch: $message") if (!partitionCommits.isDefinedAt((epoch, partitionId))) { @@ -203,9 +188,5 @@ private[continuous] class EpochCoordinator( case SetWriterPartitions(numPartitions) => numWriterPartitions = numPartitions context.reply(()) - - case StopContinuousExecutionWrites => - queryWritesStopped = true - context.reply(()) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index b5b4a05ab4973..db588ae282f38 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -29,7 +29,6 @@ import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger import org.apache.spark.sql.execution.streaming.sources.{MemoryPlanV2, MemorySinkV2} -import org.apache.spark.sql.sources.v2.streaming.ContinuousWriteSupport /** * Interface used to write a streaming `Dataset` to external storage systems (e.g. 
file systems, @@ -280,29 +279,18 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { useTempCheckpointLocation = true, trigger = trigger) } else { - val sink = trigger match { - case _: ContinuousTrigger => - val ds = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf) - ds.newInstance() match { - case w: ContinuousWriteSupport => w - case _ => throw new AnalysisException( - s"Data source $source does not support continuous writing") - } - case _ => - val ds = DataSource( - df.sparkSession, - className = source, - options = extraOptions.toMap, - partitionColumns = normalizedParCols.getOrElse(Nil)) - ds.createSink(outputMode) - } - + val dataSource = + DataSource( + df.sparkSession, + className = source, + options = extraOptions.toMap, + partitionColumns = normalizedParCols.getOrElse(Nil)) df.sparkSession.sessionState.streamingQueryManager.startQuery( extraOptions.get("queryName"), extraOptions.get("checkpointLocation"), df, extraOptions.toMap, - sink, + dataSource.createSink(outputMode), outputMode, useTempCheckpointLocation = source == "console", recoverFromCheckpointLocation = true, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index 0762895fdc620..d46461fa9bf6d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -38,9 +38,8 @@ import org.apache.spark.sql.{Dataset, Encoder, QueryTest, Row} import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ -import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, ContinuousTrigger, EpochCoordinatorRef, IncrementAndGetEpoch} +import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, EpochCoordinatorRef, IncrementAndGetEpoch} import org.apache.spark.sql.execution.streaming.sources.MemorySinkV2 import org.apache.spark.sql.execution.streaming.state.StateStore import org.apache.spark.sql.streaming.StreamingQueryListener._ @@ -81,9 +80,6 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be StateStore.stop() // stop the state store maintenance thread and unload store providers } - protected val defaultTrigger = Trigger.ProcessingTime(0) - protected val defaultUseV2Sink = false - /** How long to wait for an active stream to catch up when checking a result. */ val streamingTimeout = 10.seconds @@ -193,7 +189,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be /** Starts the stream, resuming if data has already been processed. It must not be running. 
*/ case class StartStream( - trigger: Trigger = defaultTrigger, + trigger: Trigger = Trigger.ProcessingTime(0), triggerClock: Clock = new SystemClock, additionalConfs: Map[String, String] = Map.empty, checkpointLocation: String = null) @@ -280,7 +276,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def testStream( _stream: Dataset[_], outputMode: OutputMode = OutputMode.Append, - useV2Sink: Boolean = defaultUseV2Sink)(actions: StreamAction*): Unit = synchronized { + useV2Sink: Boolean = false)(actions: StreamAction*): Unit = synchronized { import org.apache.spark.sql.streaming.util.StreamManualClock // `synchronized` is added to prevent the user from calling multiple `testStream`s concurrently @@ -407,11 +403,18 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def fetchStreamAnswer(currentStream: StreamExecution, lastOnly: Boolean) = { verify(currentStream != null, "stream not running") + // Get the map of source index to the current source objects + val indexToSource = currentStream + .logicalPlan + .collect { case StreamingExecutionRelation(s, _) => s } + .zipWithIndex + .map(_.swap) + .toMap // Block until all data added has been processed for all the source awaiting.foreach { case (sourceIndex, offset) => failAfter(streamingTimeout) { - currentStream.awaitOffset(sourceIndex, offset) + currentStream.awaitOffset(indexToSource(sourceIndex), offset) } } @@ -470,12 +473,6 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be // after starting the query. try { currentStream.awaitInitialization(streamingTimeout.toMillis) - currentStream match { - case s: ContinuousExecution => eventually("IncrementalExecution was not created") { - s.lastExecution.executedPlan // will fail if lastExecution is null - } - case _ => - } } catch { case _: StreamingQueryException => // Ignore the exception. `StopStream` or `ExpectFailure` will catch it as well. @@ -603,10 +600,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def findSourceIndex(plan: LogicalPlan): Option[Int] = { plan - .collect { - case StreamingExecutionRelation(s, _) => s - case DataSourceV2Relation(_, r) => r - } + .collect { case StreamingExecutionRelation(s, _) => s } .zipWithIndex .find(_._1 == source) .map(_._2) @@ -619,13 +613,9 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be findSourceIndex(query.logicalPlan) }.orElse { findSourceIndex(stream.logicalPlan) - }.orElse { - queryToUse.flatMap { q => - findSourceIndex(q.lastExecution.logical) - } }.getOrElse { throw new IllegalArgumentException( - "Could not find index of the source to which data was added") + "Could find index of the source to which data was added") } // Store the expected offset of added data to wait for it later From ca27d9cb5e30b6a50a4c8b7d10ac28f4f51d44ee Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 13 Jan 2018 16:13:44 +0900 Subject: [PATCH 080/566] [SPARK-22980][PYTHON][SQL] Clarify the length of each series is of each batch within scalar Pandas UDF ## What changes were proposed in this pull request? This PR proposes to add a note that saying the length of a scalar Pandas UDF's `Series` is not of the whole input column but of the batch. We are fine for a group map UDF because the usage is different from our typical UDF but scalar UDFs might cause confusion with the normal UDF. 
For example, please consider this example: ```python from pyspark.sql.functions import pandas_udf, col, lit df = spark.range(1) f = pandas_udf(lambda x, y: len(x) + y, LongType()) df.select(f(lit('text'), col('id'))).show() ``` ``` +------------------+ |(text, id)| +------------------+ | 1| +------------------+ ``` ```python from pyspark.sql.functions import udf, col, lit df = spark.range(1) f = udf(lambda x, y: len(x) + y, "long") df.select(f(lit('text'), col('id'))).show() ``` ``` +------------------+ |(text, id)| +------------------+ | 4| +------------------+ ``` ## How was this patch tested? Manually built the doc and checked the output. Author: hyukjinkwon Closes #20237 from HyukjinKwon/SPARK-22980. (cherry picked from commit cd9f49a2aed3799964976ead06080a0f7044a0c3) Signed-off-by: hyukjinkwon --- python/pyspark/sql/functions.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 733e32bd825b0..e1ad6590554cf 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2184,6 +2184,11 @@ def pandas_udf(f=None, returnType=None, functionType=None): | 8| JOHN DOE| 22| +----------+--------------+------------+ + .. note:: The length of `pandas.Series` within a scalar UDF is not that of the whole input + column, but is the length of an internal batch used for each call to the function. + Therefore, this can be used, for example, to ensure the length of each returned + `pandas.Series`, and can not be used as the column length. + 2. GROUP_MAP A group map UDF defines transformation: A `pandas.DataFrame` -> A `pandas.DataFrame` From 801ffd799922e1c2751d3331874b88a67da8cf01 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Sat, 13 Jan 2018 10:01:44 -0600 Subject: [PATCH 081/566] [SPARK-22870][CORE] Dynamic allocation should allow 0 idle time ## What changes were proposed in this pull request? This pr to make `0` as a valid value for `spark.dynamicAllocation.executorIdleTimeout`. For details, see the jira description: https://issues.apache.org/jira/browse/SPARK-22870. ## How was this patch tested? N/A Author: Yuming Wang Author: Yuming Wang Closes #20080 from wangyum/SPARK-22870. 
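As an illustration (not part of this patch), a configuration along these lines would now pass the relaxed validation; the surrounding settings are only an assumed minimal dynamic-allocation setup:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

// Hypothetical setup: with this change a zero idle timeout is accepted,
// so an executor is scheduled for removal as soon as it becomes idle.
val conf = new SparkConf()
  .setAppName("zero-idle-timeout-demo")
  .set("spark.dynamicAllocation.enabled", "true")
  .set("spark.shuffle.service.enabled", "true") // still required for dynamic allocation
  .set("spark.dynamicAllocation.executorIdleTimeout", "0") // threw SparkException before this change

val spark = SparkSession.builder().config(conf).getOrCreate()
```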
(cherry picked from commit fc6fe8a1d0f161c4788f3db94de49a8669ba3bcc) Signed-off-by: Sean Owen --- .../scala/org/apache/spark/ExecutorAllocationManager.scala | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 2e00dc8b49dd5..6c59038f2a6c1 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -195,8 +195,11 @@ private[spark] class ExecutorAllocationManager( throw new SparkException( "spark.dynamicAllocation.sustainedSchedulerBacklogTimeout must be > 0!") } - if (executorIdleTimeoutS <= 0) { - throw new SparkException("spark.dynamicAllocation.executorIdleTimeout must be > 0!") + if (executorIdleTimeoutS < 0) { + throw new SparkException("spark.dynamicAllocation.executorIdleTimeout must be >= 0!") + } + if (cachedExecutorIdleTimeoutS < 0) { + throw new SparkException("spark.dynamicAllocation.cachedExecutorIdleTimeout must be >= 0!") } // Require external shuffle service for dynamic allocation // Otherwise, we may lose shuffle files when killing executors From 8d32ed5f281317ba380aa6b8b3f3f041575022cb Mon Sep 17 00:00:00 2001 From: xubo245 <601450868@qq.com> Date: Sun, 14 Jan 2018 02:28:57 +0800 Subject: [PATCH 082/566] [SPARK-23036][SQL][TEST] Add withGlobalTempView for testing ## What changes were proposed in this pull request? Add withGlobalTempView when create global temp view, like withTempView and withView. And correct some improper usage. Please see jira. There are other similar place like that. I will fix it if community need. Please confirm it. ## How was this patch tested? no new test. Author: xubo245 <601450868@qq.com> Closes #20228 from xubo245/DropTempView. 
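For context, a sketch of how the new helper is meant to be used, assuming the usual `QueryTest`/`SQLTestUtils` test context; the view name and query are illustrative and mirror the suites touched below:

```scala
// Creates the global temp view inside the block and drops it afterwards,
// even if an assertion in the body fails.
withGlobalTempView("src") {
  sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1 AS id, 'a' AS name")
  checkAnswer(spark.table(s"$globalTempDB.src"), Row(1, "a"))
}
```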
(cherry picked from commit bd4a21b4820c4ebaf750131574a6b2eeea36907e) Signed-off-by: gatorsmile --- .../sql/execution/GlobalTempViewSuite.scala | 55 ++++++++----------- .../spark/sql/execution/SQLViewSuite.scala | 34 +++++++----- .../apache/spark/sql/test/SQLTestUtils.scala | 21 +++++-- 3 files changed, 59 insertions(+), 51 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala index cc943e0356f2a..dcc6fa6403f31 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala @@ -36,7 +36,7 @@ class GlobalTempViewSuite extends QueryTest with SharedSQLContext { test("basic semantic") { val expectedErrorMsg = "not found" - try { + withGlobalTempView("src") { sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1, 'a'") // If there is no database in table name, we should try local temp view first, if not found, @@ -79,19 +79,15 @@ class GlobalTempViewSuite extends QueryTest with SharedSQLContext { // We can also use Dataset API to replace global temp view Seq(2 -> "b").toDF("i", "j").createOrReplaceGlobalTempView("src") checkAnswer(spark.table(s"$globalTempDB.src"), Row(2, "b")) - } finally { - spark.catalog.dropGlobalTempView("src") } } test("global temp view is shared among all sessions") { - try { + withGlobalTempView("src") { sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1, 2") checkAnswer(spark.table(s"$globalTempDB.src"), Row(1, 2)) val newSession = spark.newSession() checkAnswer(newSession.table(s"$globalTempDB.src"), Row(1, 2)) - } finally { - spark.catalog.dropGlobalTempView("src") } } @@ -105,27 +101,25 @@ class GlobalTempViewSuite extends QueryTest with SharedSQLContext { test("CREATE GLOBAL TEMP VIEW USING") { withTempPath { path => - try { + withGlobalTempView("src") { Seq(1 -> "a").toDF("i", "j").write.parquet(path.getAbsolutePath) sql(s"CREATE GLOBAL TEMP VIEW src USING parquet OPTIONS (PATH '${path.toURI}')") checkAnswer(spark.table(s"$globalTempDB.src"), Row(1, "a")) sql(s"INSERT INTO $globalTempDB.src SELECT 2, 'b'") checkAnswer(spark.table(s"$globalTempDB.src"), Row(1, "a") :: Row(2, "b") :: Nil) - } finally { - spark.catalog.dropGlobalTempView("src") } } } test("CREATE TABLE LIKE should work for global temp view") { - try { - sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1 AS a, '2' AS b") - sql(s"CREATE TABLE cloned LIKE $globalTempDB.src") - val tableMeta = spark.sessionState.catalog.getTableMetadata(TableIdentifier("cloned")) - assert(tableMeta.schema == new StructType().add("a", "int", false).add("b", "string", false)) - } finally { - spark.catalog.dropGlobalTempView("src") - sql("DROP TABLE default.cloned") + withTable("cloned") { + withGlobalTempView("src") { + sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1 AS a, '2' AS b") + sql(s"CREATE TABLE cloned LIKE $globalTempDB.src") + val tableMeta = spark.sessionState.catalog.getTableMetadata(TableIdentifier("cloned")) + assert(tableMeta.schema == new StructType() + .add("a", "int", false).add("b", "string", false)) + } } } @@ -146,26 +140,25 @@ class GlobalTempViewSuite extends QueryTest with SharedSQLContext { } test("should lookup global temp view if and only if global temp db is specified") { - try { - sql("CREATE GLOBAL TEMP VIEW same_name AS SELECT 3, 4") - sql("CREATE TEMP VIEW same_name AS SELECT 1, 2") + withTempView("same_name") { + withGlobalTempView("same_name") { + sql("CREATE 
GLOBAL TEMP VIEW same_name AS SELECT 3, 4") + sql("CREATE TEMP VIEW same_name AS SELECT 1, 2") - checkAnswer(sql("SELECT * FROM same_name"), Row(1, 2)) + checkAnswer(sql("SELECT * FROM same_name"), Row(1, 2)) - // we never lookup global temp views if database is not specified in table name - spark.catalog.dropTempView("same_name") - intercept[AnalysisException](sql("SELECT * FROM same_name")) + // we never lookup global temp views if database is not specified in table name + spark.catalog.dropTempView("same_name") + intercept[AnalysisException](sql("SELECT * FROM same_name")) - // Use qualified name to lookup a global temp view. - checkAnswer(sql(s"SELECT * FROM $globalTempDB.same_name"), Row(3, 4)) - } finally { - spark.catalog.dropTempView("same_name") - spark.catalog.dropGlobalTempView("same_name") + // Use qualified name to lookup a global temp view. + checkAnswer(sql(s"SELECT * FROM $globalTempDB.same_name"), Row(3, 4)) + } } } test("public Catalog should recognize global temp view") { - try { + withGlobalTempView("src") { sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1, 2") assert(spark.catalog.tableExists(globalTempDB, "src")) @@ -175,8 +168,6 @@ class GlobalTempViewSuite extends QueryTest with SharedSQLContext { description = null, tableType = "TEMPORARY", isTemporary = true).toString) - } finally { - spark.catalog.dropGlobalTempView("src") } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala index 08a4a21b20f61..8c55758cfe38d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala @@ -69,21 +69,25 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { } test("create a permanent view on a temp view") { - withView("jtv1", "temp_jtv1", "global_temp_jtv1") { - sql("CREATE TEMPORARY VIEW temp_jtv1 AS SELECT * FROM jt WHERE id > 3") - var e = intercept[AnalysisException] { - sql("CREATE VIEW jtv1 AS SELECT * FROM temp_jtv1 WHERE id < 6") - }.getMessage - assert(e.contains("Not allowed to create a permanent view `jtv1` by " + - "referencing a temporary view `temp_jtv1`")) - - val globalTempDB = spark.sharedState.globalTempViewManager.database - sql("CREATE GLOBAL TEMP VIEW global_temp_jtv1 AS SELECT * FROM jt WHERE id > 0") - e = intercept[AnalysisException] { - sql(s"CREATE VIEW jtv1 AS SELECT * FROM $globalTempDB.global_temp_jtv1 WHERE id < 6") - }.getMessage - assert(e.contains(s"Not allowed to create a permanent view `jtv1` by referencing " + - s"a temporary view `global_temp`.`global_temp_jtv1`")) + withView("jtv1") { + withTempView("temp_jtv1") { + withGlobalTempView("global_temp_jtv1") { + sql("CREATE TEMPORARY VIEW temp_jtv1 AS SELECT * FROM jt WHERE id > 3") + var e = intercept[AnalysisException] { + sql("CREATE VIEW jtv1 AS SELECT * FROM temp_jtv1 WHERE id < 6") + }.getMessage + assert(e.contains("Not allowed to create a permanent view `jtv1` by " + + "referencing a temporary view `temp_jtv1`")) + + val globalTempDB = spark.sharedState.globalTempViewManager.database + sql("CREATE GLOBAL TEMP VIEW global_temp_jtv1 AS SELECT * FROM jt WHERE id > 0") + e = intercept[AnalysisException] { + sql(s"CREATE VIEW jtv1 AS SELECT * FROM $globalTempDB.global_temp_jtv1 WHERE id < 6") + }.getMessage + assert(e.contains(s"Not allowed to create a permanent view `jtv1` by referencing " + + s"a temporary view `global_temp`.`global_temp_jtv1`")) + } + } } } diff 
--git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index 904f9f2ad0b22..bc4a120f7042f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -254,13 +254,26 @@ private[sql] trait SQLTestUtilsBase } /** - * Drops temporary table `tableName` after calling `f`. + * Drops temporary view `viewNames` after calling `f`. */ - protected def withTempView(tableNames: String*)(f: => Unit): Unit = { + protected def withTempView(viewNames: String*)(f: => Unit): Unit = { try f finally { // If the test failed part way, we don't want to mask the failure by failing to remove - // temp tables that never got created. - try tableNames.foreach(spark.catalog.dropTempView) catch { + // temp views that never got created. + try viewNames.foreach(spark.catalog.dropTempView) catch { + case _: NoSuchTableException => + } + } + } + + /** + * Drops global temporary view `viewNames` after calling `f`. + */ + protected def withGlobalTempView(viewNames: String*)(f: => Unit): Unit = { + try f finally { + // If the test failed part way, we don't want to mask the failure by failing to remove + // global temp views that never got created. + try viewNames.foreach(spark.catalog.dropGlobalTempView) catch { case _: NoSuchTableException => } } From 0fc5533e53ad03eb67590ddd231f40c2713150c3 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sun, 14 Jan 2018 02:36:32 +0800 Subject: [PATCH 083/566] [SPARK-22790][SQL] add a configurable factor to describe HadoopFsRelation's size ## What changes were proposed in this pull request? as per discussion in https://github.com/apache/spark/pull/19864#discussion_r156847927 the current HadoopFsRelation is purely based on the underlying file size which is not accurate and makes the execution vulnerable to errors like OOM Users can enable CBO with the functionalities in https://github.com/apache/spark/pull/19864 to avoid this issue This JIRA proposes to add a configurable factor to sizeInBytes method in HadoopFsRelation class so that users can mitigate this problem without CBO ## How was this patch tested? Existing tests Author: CodingCat Author: Nan Zhu Closes #20072 from CodingCat/SPARK-22790. (cherry picked from commit ba891ec993c616dc4249fc786c56ea82ed04a827) Signed-off-by: gatorsmile --- .../apache/spark/sql/internal/SQLConf.scala | 13 +++++- .../datasources/HadoopFsRelation.scala | 6 ++- .../datasources/HadoopFsRelationSuite.scala | 41 +++++++++++++++++++ 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 36e802a9faa6f..6746fbcaf2483 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -249,7 +249,7 @@ object SQLConf { val CONSTRAINT_PROPAGATION_ENABLED = buildConf("spark.sql.constraintPropagation.enabled") .internal() .doc("When true, the query optimizer will infer and propagate data constraints in the query " + - "plan to optimize them. Constraint propagation can sometimes be computationally expensive" + + "plan to optimize them. 
Constraint propagation can sometimes be computationally expensive " + "for certain kinds of query plans (such as those with a large number of predicates and " + "aliases) which might negatively impact overall runtime.") .booleanConf @@ -263,6 +263,15 @@ object SQLConf { .booleanConf .createWithDefault(false) + val FILE_COMRESSION_FACTOR = buildConf("spark.sql.sources.fileCompressionFactor") + .internal() + .doc("When estimating the output data size of a table scan, multiply the file size with this " + + "factor as the estimated data size, in case the data is compressed in the file and lead to" + + " a heavily underestimated result.") + .doubleConf + .checkValue(_ > 0, "the value of fileDataSizeFactor must be larger than 0") + .createWithDefault(1.0) + val PARQUET_SCHEMA_MERGING_ENABLED = buildConf("spark.sql.parquet.mergeSchema") .doc("When true, the Parquet data source merges schemas collected from all data files, " + "otherwise the schema is picked from the summary file or a random data file " + @@ -1255,6 +1264,8 @@ class SQLConf extends Serializable with Logging { def escapedStringLiterals: Boolean = getConf(ESCAPED_STRING_LITERALS) + def fileCompressionFactor: Double = getConf(FILE_COMRESSION_FACTOR) + def stringRedationPattern: Option[Regex] = SQL_STRING_REDACTION_PATTERN.readFrom(reader) /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala index 89d8a85a9cbd2..6b34638529770 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala @@ -82,7 +82,11 @@ case class HadoopFsRelation( } } - override def sizeInBytes: Long = location.sizeInBytes + override def sizeInBytes: Long = { + val compressionFactor = sqlContext.conf.fileCompressionFactor + (location.sizeInBytes * compressionFactor).toLong + } + override def inputFiles: Array[String] = location.inputFiles } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala index caf03885e3873..c1f2c18d1417d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.datasources import java.io.{File, FilenameFilter} import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, SortMergeJoinExec} import org.apache.spark.sql.test.SharedSQLContext class HadoopFsRelationSuite extends QueryTest with SharedSQLContext { @@ -39,4 +40,44 @@ class HadoopFsRelationSuite extends QueryTest with SharedSQLContext { assert(df.queryExecution.logical.stats.sizeInBytes === BigInt(totalSize)) } } + + test("SPARK-22790: spark.sql.sources.compressionFactor takes effect") { + import testImplicits._ + Seq(1.0, 0.5).foreach { compressionFactor => + withSQLConf("spark.sql.sources.fileCompressionFactor" -> compressionFactor.toString, + "spark.sql.autoBroadcastJoinThreshold" -> "400") { + withTempPath { workDir => + // the file size is 740 bytes + val workDirPath = workDir.getAbsolutePath + val data1 = Seq(100, 200, 300, 400).toDF("count") + data1.write.parquet(workDirPath + "/data1") + val 
df1FromFile = spark.read.parquet(workDirPath + "/data1") + val data2 = Seq(100, 200, 300, 400).toDF("count") + data2.write.parquet(workDirPath + "/data2") + val df2FromFile = spark.read.parquet(workDirPath + "/data2") + val joinedDF = df1FromFile.join(df2FromFile, Seq("count")) + if (compressionFactor == 0.5) { + val bJoinExec = joinedDF.queryExecution.executedPlan.collect { + case bJoin: BroadcastHashJoinExec => bJoin + } + assert(bJoinExec.nonEmpty) + val smJoinExec = joinedDF.queryExecution.executedPlan.collect { + case smJoin: SortMergeJoinExec => smJoin + } + assert(smJoinExec.isEmpty) + } else { + // compressionFactor is 1.0 + val bJoinExec = joinedDF.queryExecution.executedPlan.collect { + case bJoin: BroadcastHashJoinExec => bJoin + } + assert(bJoinExec.isEmpty) + val smJoinExec = joinedDF.queryExecution.executedPlan.collect { + case smJoin: SortMergeJoinExec => smJoin + } + assert(smJoinExec.nonEmpty) + } + } + } + } + } } From bcd87ae0775d16b7c3b9de0c4f2db36eb3679476 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Sun, 14 Jan 2018 05:39:38 +0800 Subject: [PATCH 084/566] [SPARK-21213][SQL][FOLLOWUP] Use compatible types for comparisons in compareAndGetNewStats ## What changes were proposed in this pull request? This pr fixed code to compare values in `compareAndGetNewStats`. The test below fails in the current master; ``` val oldStats2 = CatalogStatistics(sizeInBytes = BigInt(Long.MaxValue) * 2) val newStats5 = CommandUtils.compareAndGetNewStats( Some(oldStats2), newTotalSize = BigInt(Long.MaxValue) * 2, None) assert(newStats5.isEmpty) ``` ## How was this patch tested? Added some tests in `CommandUtilsSuite`. Author: Takeshi Yamamuro Closes #20245 from maropu/SPARK-21213-FOLLOWUP. (cherry picked from commit 0066d6f6fa604817468471832968d4339f71c5cb) Signed-off-by: gatorsmile --- .../sql/execution/command/CommandUtils.scala | 4 +- .../execution/command/CommandUtilsSuite.scala | 56 +++++++++++++++++++ 2 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/command/CommandUtilsSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala index 1a0d67fc71fbc..c27048626c8eb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala @@ -116,8 +116,8 @@ object CommandUtils extends Logging { oldStats: Option[CatalogStatistics], newTotalSize: BigInt, newRowCount: Option[BigInt]): Option[CatalogStatistics] = { - val oldTotalSize = oldStats.map(_.sizeInBytes.toLong).getOrElse(-1L) - val oldRowCount = oldStats.flatMap(_.rowCount.map(_.toLong)).getOrElse(-1L) + val oldTotalSize = oldStats.map(_.sizeInBytes).getOrElse(BigInt(-1)) + val oldRowCount = oldStats.flatMap(_.rowCount).getOrElse(BigInt(-1)) var newStats: Option[CatalogStatistics] = None if (newTotalSize >= 0 && newTotalSize != oldTotalSize) { newStats = Some(CatalogStatistics(sizeInBytes = newTotalSize)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/CommandUtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/CommandUtilsSuite.scala new file mode 100644 index 0000000000000..f3e15189a6418 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/CommandUtilsSuite.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.command + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.catalog.CatalogStatistics + +class CommandUtilsSuite extends SparkFunSuite { + + test("Check if compareAndGetNewStats returns correct results") { + val oldStats1 = CatalogStatistics(sizeInBytes = 10, rowCount = Some(100)) + val newStats1 = CommandUtils.compareAndGetNewStats( + Some(oldStats1), newTotalSize = 10, newRowCount = Some(100)) + assert(newStats1.isEmpty) + val newStats2 = CommandUtils.compareAndGetNewStats( + Some(oldStats1), newTotalSize = -1, newRowCount = None) + assert(newStats2.isEmpty) + val newStats3 = CommandUtils.compareAndGetNewStats( + Some(oldStats1), newTotalSize = 20, newRowCount = Some(-1)) + assert(newStats3.isDefined) + newStats3.foreach { stat => + assert(stat.sizeInBytes === 20) + assert(stat.rowCount.isEmpty) + } + val newStats4 = CommandUtils.compareAndGetNewStats( + Some(oldStats1), newTotalSize = -1, newRowCount = Some(200)) + assert(newStats4.isDefined) + newStats4.foreach { stat => + assert(stat.sizeInBytes === 10) + assert(stat.rowCount.isDefined && stat.rowCount.get === 200) + } + } + + test("Check if compareAndGetNewStats can handle large values") { + // Tests for large values + val oldStats2 = CatalogStatistics(sizeInBytes = BigInt(Long.MaxValue) * 2) + val newStats5 = CommandUtils.compareAndGetNewStats( + Some(oldStats2), newTotalSize = BigInt(Long.MaxValue) * 2, None) + assert(newStats5.isEmpty) + } +} From 1f4a08b15ab47cf6c3bb08c783497422f30d0709 Mon Sep 17 00:00:00 2001 From: foxish Date: Sat, 13 Jan 2018 21:34:28 -0800 Subject: [PATCH 085/566] [SPARK-23063][K8S] K8s changes for publishing scripts (and a couple of other misses) ## What changes were proposed in this pull request? Including the `-Pkubernetes` flag in a few places it was missed. ## How was this patch tested? checkstyle, mima through manual tests. Author: foxish Closes #20256 from foxish/SPARK-23063. 
(cherry picked from commit c3548d11c3c57e8f2c6ebd9d2d6a3924ddcd3cba) Signed-off-by: Felix Cheung --- dev/create-release/release-build.sh | 4 ++-- dev/create-release/releaseutils.py | 2 ++ dev/deps/spark-deps-hadoop-2.6 | 11 +++++++++++ dev/deps/spark-deps-hadoop-2.7 | 11 +++++++++++ dev/lint-java | 2 +- dev/mima | 2 +- dev/scalastyle | 1 + dev/test-dependencies.sh | 2 +- 8 files changed, 30 insertions(+), 5 deletions(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index c71137468054f..a3579f21fc539 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -92,9 +92,9 @@ MVN="build/mvn --force" # Hive-specific profiles for some builds HIVE_PROFILES="-Phive -Phive-thriftserver" # Profiles for publishing snapshots and release to Maven Central -PUBLISH_PROFILES="-Pmesos -Pyarn -Pflume $HIVE_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" +PUBLISH_PROFILES="-Pmesos -Pyarn -Pkubernetes -Pflume $HIVE_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" # Profiles for building binary releases -BASE_RELEASE_PROFILES="-Pmesos -Pyarn -Pflume -Psparkr" +BASE_RELEASE_PROFILES="-Pmesos -Pyarn -Pkubernetes -Pflume -Psparkr" # Scala 2.11 only profiles for some builds SCALA_2_11_PROFILES="-Pkafka-0-8" # Scala 2.12 only profiles for some builds diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index 730138195e5fe..32f6cbb29f0be 100755 --- a/dev/create-release/releaseutils.py +++ b/dev/create-release/releaseutils.py @@ -185,6 +185,8 @@ def get_commits(tag): "graphx": "GraphX", "input/output": CORE_COMPONENT, "java api": "Java API", + "k8s": "Kubernetes", + "kubernetes": "Kubernetes", "mesos": "Mesos", "ml": "MLlib", "mllib": "MLlib", diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index a7fce2ede0ea5..3b2a88537ca50 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -17,6 +17,7 @@ arpack_combined_all-0.1.jar arrow-format-0.8.0.jar arrow-memory-0.8.0.jar arrow-vector-0.8.0.jar +automaton-1.11-8.jar avro-1.7.7.jar avro-ipc-1.7.7.jar avro-mapred-1.7.7-hadoop2.jar @@ -60,6 +61,7 @@ datanucleus-rdbms-3.2.9.jar derby-10.12.1.1.jar eigenbase-properties-1.1.5.jar flatbuffers-1.2.0-3f79e055.jar +generex-1.0.1.jar gson-2.2.4.jar guava-14.0.1.jar guice-3.0.jar @@ -91,8 +93,10 @@ jackson-annotations-2.6.7.jar jackson-core-2.6.7.jar jackson-core-asl-1.9.13.jar jackson-databind-2.6.7.1.jar +jackson-dataformat-yaml-2.6.7.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar +jackson-module-jaxb-annotations-2.6.7.jar jackson-module-paranamer-2.7.9.jar jackson-module-scala_2.11-2.6.7.1.jar jackson-xc-1.9.13.jar @@ -130,10 +134,13 @@ jta-1.1.jar jtransforms-2.4.0.jar jul-to-slf4j-1.7.16.jar kryo-shaded-3.0.3.jar +kubernetes-client-3.0.0.jar +kubernetes-model-2.0.0.jar leveldbjni-all-1.8.jar libfb303-0.9.3.jar libthrift-0.9.3.jar log4j-1.2.17.jar +logging-interceptor-3.8.1.jar lz4-java-1.4.0.jar machinist_2.11-0.6.1.jar macro-compat_2.11-1.1.1.jar @@ -146,6 +153,8 @@ minlog-1.3.0.jar netty-3.9.9.Final.jar netty-all-4.1.17.Final.jar objenesis-2.1.jar +okhttp-3.8.1.jar +okio-1.13.0.jar opencsv-2.3.jar orc-core-1.4.1-nohive.jar orc-mapreduce-1.4.1-nohive.jar @@ -171,6 +180,7 @@ scalap-2.11.8.jar shapeless_2.11-2.3.2.jar slf4j-api-1.7.16.jar slf4j-log4j12-1.7.16.jar +snakeyaml-1.15.jar snappy-0.2.jar snappy-java-1.1.2.6.jar spire-macros_2.11-0.13.0.jar @@ -186,5 +196,6 @@ xbean-asm5-shaded-4.4.jar xercesImpl-2.9.1.jar xmlenc-0.52.jar xz-1.0.jar +zjsonpatch-0.3.0.jar 
zookeeper-3.4.6.jar zstd-jni-1.3.2-2.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 94b2e98d85e74..757da00788e54 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -17,6 +17,7 @@ arpack_combined_all-0.1.jar arrow-format-0.8.0.jar arrow-memory-0.8.0.jar arrow-vector-0.8.0.jar +automaton-1.11-8.jar avro-1.7.7.jar avro-ipc-1.7.7.jar avro-mapred-1.7.7-hadoop2.jar @@ -60,6 +61,7 @@ datanucleus-rdbms-3.2.9.jar derby-10.12.1.1.jar eigenbase-properties-1.1.5.jar flatbuffers-1.2.0-3f79e055.jar +generex-1.0.1.jar gson-2.2.4.jar guava-14.0.1.jar guice-3.0.jar @@ -91,8 +93,10 @@ jackson-annotations-2.6.7.jar jackson-core-2.6.7.jar jackson-core-asl-1.9.13.jar jackson-databind-2.6.7.1.jar +jackson-dataformat-yaml-2.6.7.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar +jackson-module-jaxb-annotations-2.6.7.jar jackson-module-paranamer-2.7.9.jar jackson-module-scala_2.11-2.6.7.1.jar jackson-xc-1.9.13.jar @@ -131,10 +135,13 @@ jta-1.1.jar jtransforms-2.4.0.jar jul-to-slf4j-1.7.16.jar kryo-shaded-3.0.3.jar +kubernetes-client-3.0.0.jar +kubernetes-model-2.0.0.jar leveldbjni-all-1.8.jar libfb303-0.9.3.jar libthrift-0.9.3.jar log4j-1.2.17.jar +logging-interceptor-3.8.1.jar lz4-java-1.4.0.jar machinist_2.11-0.6.1.jar macro-compat_2.11-1.1.1.jar @@ -147,6 +154,8 @@ minlog-1.3.0.jar netty-3.9.9.Final.jar netty-all-4.1.17.Final.jar objenesis-2.1.jar +okhttp-3.8.1.jar +okio-1.13.0.jar opencsv-2.3.jar orc-core-1.4.1-nohive.jar orc-mapreduce-1.4.1-nohive.jar @@ -172,6 +181,7 @@ scalap-2.11.8.jar shapeless_2.11-2.3.2.jar slf4j-api-1.7.16.jar slf4j-log4j12-1.7.16.jar +snakeyaml-1.15.jar snappy-0.2.jar snappy-java-1.1.2.6.jar spire-macros_2.11-0.13.0.jar @@ -187,5 +197,6 @@ xbean-asm5-shaded-4.4.jar xercesImpl-2.9.1.jar xmlenc-0.52.jar xz-1.0.jar +zjsonpatch-0.3.0.jar zookeeper-3.4.6.jar zstd-jni-1.3.2-2.jar diff --git a/dev/lint-java b/dev/lint-java index c2e80538ef2a5..1f0b0c8379ed0 100755 --- a/dev/lint-java +++ b/dev/lint-java @@ -20,7 +20,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" -ERRORS=$($SCRIPT_DIR/../build/mvn -Pkinesis-asl -Pmesos -Pyarn -Phive -Phive-thriftserver checkstyle:check | grep ERROR) +ERRORS=$($SCRIPT_DIR/../build/mvn -Pkinesis-asl -Pmesos -Pkubernetes -Pyarn -Phive -Phive-thriftserver checkstyle:check | grep ERROR) if test ! 
-z "$ERRORS"; then echo -e "Checkstyle checks failed at following occurrences:\n$ERRORS" diff --git a/dev/mima b/dev/mima index 1e3ca9700bc07..cd2694ff4d3de 100755 --- a/dev/mima +++ b/dev/mima @@ -24,7 +24,7 @@ set -e FWDIR="$(cd "`dirname "$0"`"/..; pwd)" cd "$FWDIR" -SPARK_PROFILES="-Pmesos -Pkafka-0-8 -Pyarn -Pflume -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive" +SPARK_PROFILES="-Pmesos -Pkafka-0-8 -Pkubernetes -Pyarn -Pflume -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive" TOOLS_CLASSPATH="$(build/sbt -DcopyDependencies=false "export tools/fullClasspath" | tail -n1)" OLD_DEPS_CLASSPATH="$(build/sbt -DcopyDependencies=false $SPARK_PROFILES "export oldDeps/fullClasspath" | tail -n1)" diff --git a/dev/scalastyle b/dev/scalastyle index 89ecc8abd6f8c..b8053df05fa2b 100755 --- a/dev/scalastyle +++ b/dev/scalastyle @@ -24,6 +24,7 @@ ERRORS=$(echo -e "q\n" \ -Pkinesis-asl \ -Pmesos \ -Pkafka-0-8 \ + -Pkubernetes \ -Pyarn \ -Pflume \ -Phive \ diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh index 58b295d4f6e00..3bf7618e1ea96 100755 --- a/dev/test-dependencies.sh +++ b/dev/test-dependencies.sh @@ -29,7 +29,7 @@ export LC_ALL=C # TODO: This would be much nicer to do in SBT, once SBT supports Maven-style resolution. # NOTE: These should match those in the release publishing script -HADOOP2_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkafka-0-8 -Pyarn -Pflume -Phive" +HADOOP2_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkafka-0-8 -Pkubernetes -Pyarn -Pflume -Phive" MVN="build/mvn" HADOOP_PROFILES=( hadoop-2.6 From a335a49ce4672b44e5f818145214040a67c722ba Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sat, 13 Jan 2018 23:26:12 -0800 Subject: [PATCH 086/566] [SPARK-23038][TEST] Update docker/spark-test (JDK/OS) ## What changes were proposed in this pull request? This PR aims to update the followings in `docker/spark-test`. - JDK7 -> JDK8 Spark 2.2+ supports JDK8 only. - Ubuntu 12.04.5 LTS(precise) -> Ubuntu 16.04.3 LTS(xeniel) The end of life of `precise` was April 28, 2017. ## How was this patch tested? Manual. * Master ``` $ cd external/docker $ ./build $ export SPARK_HOME=... $ docker run -v $SPARK_HOME:/opt/spark spark-test-master CONTAINER_IP=172.17.0.3 ... 18/01/11 06:50:25 INFO MasterWebUI: Bound MasterWebUI to 172.17.0.3, and started at http://172.17.0.3:8080 18/01/11 06:50:25 INFO Utils: Successfully started service on port 6066. 18/01/11 06:50:25 INFO StandaloneRestServer: Started REST server for submitting applications on port 6066 18/01/11 06:50:25 INFO Master: I have been elected leader! New state: ALIVE ``` * Slave ``` $ docker run -v $SPARK_HOME:/opt/spark spark-test-worker spark://172.17.0.3:7077 CONTAINER_IP=172.17.0.4 ... 18/01/11 06:51:54 INFO Worker: Successfully registered with master spark://172.17.0.3:7077 ``` After slave starts, master will show ``` 18/01/11 06:51:54 INFO Master: Registering worker 172.17.0.4:8888 with 4 cores, 1024.0 MB RAM ``` Author: Dongjoon Hyun Closes #20230 from dongjoon-hyun/SPARK-23038. (cherry picked from commit 7a3d0aad2b89aef54f7dd580397302e9ff984d9d) Signed-off-by: Felix Cheung --- external/docker/spark-test/base/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/external/docker/spark-test/base/Dockerfile b/external/docker/spark-test/base/Dockerfile index 5a95a9387c310..c70cd71367679 100644 --- a/external/docker/spark-test/base/Dockerfile +++ b/external/docker/spark-test/base/Dockerfile @@ -15,14 +15,14 @@ # limitations under the License. 
# -FROM ubuntu:precise +FROM ubuntu:xenial # Upgrade package index -# install a few other useful packages plus Open Jdk 7 +# install a few other useful packages plus Open Jdk 8 # Remove unneeded /var/lib/apt/lists/* after install to reduce the # docker image size (by ~30MB) RUN apt-get update && \ - apt-get install -y less openjdk-7-jre-headless net-tools vim-tiny sudo openssh-server && \ + apt-get install -y less openjdk-8-jre-headless iproute2 vim-tiny sudo openssh-server && \ rm -rf /var/lib/apt/lists/* ENV SCALA_VERSION 2.11.8 From 0d425c3362dc648d5c85b2b07af1a9df23b6d422 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sun, 14 Jan 2018 19:43:10 +0900 Subject: [PATCH 087/566] [SPARK-23069][DOCS][SPARKR] fix R doc for describe missing text ## What changes were proposed in this pull request? fix doc truncated ## How was this patch tested? manually Author: Felix Cheung Closes #20263 from felixcheung/r23docfix. (cherry picked from commit 66738d29c59871b29d26fc3756772b95ef536248) Signed-off-by: hyukjinkwon --- R/pkg/R/DataFrame.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 9956f7eda91e6..6caa125e1e14a 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -3054,10 +3054,10 @@ setMethod("describe", #' \item stddev #' \item min #' \item max -#' \item arbitrary approximate percentiles specified as a percentage (eg, "75%") +#' \item arbitrary approximate percentiles specified as a percentage (eg, "75\%") #' } #' If no statistics are given, this function computes count, mean, stddev, min, -#' approximate quartiles (percentiles at 25%, 50%, and 75%), and max. +#' approximate quartiles (percentiles at 25\%, 50\%, and 75\%), and max. #' This function is meant for exploratory data analysis, as we make no guarantee about the #' backward compatibility of the schema of the resulting Dataset. If you want to #' programmatically compute summary statistics, use the \code{agg} function instead. @@ -4019,9 +4019,9 @@ setMethod("broadcast", #' #' Spark will use this watermark for several purposes: #' \itemize{ -#' \item{-} To know when a given time window aggregation can be finalized and thus can be emitted +#' \item To know when a given time window aggregation can be finalized and thus can be emitted #' when using output modes that do not allow updates. -#' \item{-} To minimize the amount of state that we need to keep for on-going aggregations. +#' \item To minimize the amount of state that we need to keep for on-going aggregations. #' } #' The current watermark is computed by looking at the \code{MAX(eventTime)} seen across #' all of the partitions in the query minus a user specified \code{delayThreshold}. Due to the cost From 5fbbd94d509dbbcfa1fe940569049f72ff4a6e89 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Sun, 14 Jan 2018 22:26:21 +0800 Subject: [PATCH 088/566] [SPARK-23021][SQL] AnalysisBarrier should override innerChildren to print correct explain output ## What changes were proposed in this pull request? 
`AnalysisBarrier` in the current master cuts off explain results for parsed logical plans; ``` scala> Seq((1, 1)).toDF("a", "b").groupBy("a").count().sample(0.1).explain(true) == Parsed Logical Plan == Sample 0.0, 0.1, false, -7661439431999668039 +- AnalysisBarrier Aggregate [a#5], [a#5, count(1) AS count#14L] ``` To fix this, `AnalysisBarrier` needs to override `innerChildren` and this pr changed the output to; ``` == Parsed Logical Plan == Sample 0.0, 0.1, false, -5086223488015741426 +- AnalysisBarrier +- Aggregate [a#5], [a#5, count(1) AS count#14L] +- Project [_1#2 AS a#5, _2#3 AS b#6] +- LocalRelation [_1#2, _2#3] ``` ## How was this patch tested? Added tests in `DataFrameSuite`. Author: Takeshi Yamamuro Closes #20247 from maropu/SPARK-23021-2. (cherry picked from commit 990f05c80347c6eec2ee06823cff587c9ea90b49) Signed-off-by: gatorsmile --- .../plans/logical/basicLogicalOperators.scala | 1 + .../sql/hive/execution/HiveExplainSuite.scala | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 95e099c340af1..a4fca790dd086 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -903,6 +903,7 @@ case class Deduplicate( * This analysis barrier will be removed at the end of analysis stage. */ case class AnalysisBarrier(child: LogicalPlan) extends LeafNode { + override protected def innerChildren: Seq[LogicalPlan] = Seq(child) override def output: Seq[Attribute] = child.output override def isStreaming: Boolean = child.isStreaming override def doCanonicalize(): LogicalPlan = child.canonicalized diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala index dfabf1ec2a22a..a4273de5fe260 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala @@ -171,4 +171,21 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto sql("EXPLAIN EXTENDED CODEGEN SELECT 1") } } + + test("SPARK-23021 AnalysisBarrier should not cut off explain output for parsed logical plans") { + val df = Seq((1, 1)).toDF("a", "b").groupBy("a").count().limit(1) + val outputStream = new java.io.ByteArrayOutputStream() + Console.withOut(outputStream) { + df.explain(true) + } + assert(outputStream.toString.replaceAll("""#\d+""", "#0").contains( + s"""== Parsed Logical Plan == + |GlobalLimit 1 + |+- LocalLimit 1 + | +- AnalysisBarrier + | +- Aggregate [a#0], [a#0, count(1) AS count#0L] + | +- Project [_1#0 AS a#0, _2#0 AS b#0] + | +- LocalRelation [_1#0, _2#0] + |""".stripMargin)) + } } From 9051e1a265dc0f1dc19fd27a0127ffa47f3ac245 Mon Sep 17 00:00:00 2001 From: Sandor Murakozi Date: Sun, 14 Jan 2018 08:32:35 -0600 Subject: [PATCH 089/566] [SPARK-23051][CORE] Fix for broken job description in Spark UI ## What changes were proposed in this pull request? In 2.2, Spark UI displayed the stage description if the job description was not set. This functionality was broken, the GUI has shown no description in this case. 
In addition, the code uses jobName and jobDescription instead of stageName and stageDescription when JobTableRowData is created. In this PR the logic producing values for the job rows was modified to find the latest stage attempt for the job and use that as a fallback if job description was missing. StageName and stageDescription are also set using values from stage and jobName/description is used only as a fallback. ## How was this patch tested? Manual testing of the UI, using the code in the bug report. Author: Sandor Murakozi Closes #20251 from smurakozi/SPARK-23051. (cherry picked from commit 60eeecd7760aee6ce2fd207c83ae40054eadaf83) Signed-off-by: Sean Owen --- .../apache/spark/ui/jobs/AllJobsPage.scala | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala index 37e3b3b304a63..ff916bb6a5759 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala @@ -65,12 +65,10 @@ private[ui] class AllJobsPage(parent: JobsTab, store: AppStatusStore) extends We }.map { job => val jobId = job.jobId val status = job.status - val displayJobDescription = - if (job.description.isEmpty) { - job.name - } else { - UIUtils.makeDescription(job.description.get, "", plainText = true).text - } + val jobDescription = store.lastStageAttempt(job.stageIds.max).description + val displayJobDescription = jobDescription + .map(UIUtils.makeDescription(_, "", plainText = true).text) + .getOrElse("") val submissionTime = job.submissionTime.get.getTime() val completionTime = job.completionTime.map(_.getTime()).getOrElse(System.currentTimeMillis()) val classNameByStatus = status match { @@ -429,20 +427,23 @@ private[ui] class JobDataSource( val formattedDuration = duration.map(d => UIUtils.formatDuration(d)).getOrElse("Unknown") val submissionTime = jobData.submissionTime val formattedSubmissionTime = submissionTime.map(UIUtils.formatDate).getOrElse("Unknown") - val jobDescription = UIUtils.makeDescription(jobData.description.getOrElse(""), - basePath, plainText = false) + val lastStageAttempt = store.lastStageAttempt(jobData.stageIds.max) + val lastStageDescription = lastStageAttempt.description.getOrElse("") + + val formattedJobDescription = + UIUtils.makeDescription(lastStageDescription, basePath, plainText = false) val detailUrl = "%s/jobs/job?id=%s".format(basePath, jobData.jobId) new JobTableRowData( jobData, - jobData.name, - jobData.description.getOrElse(jobData.name), + lastStageAttempt.name, + lastStageDescription, duration.getOrElse(-1), formattedDuration, submissionTime.map(_.getTime()).getOrElse(-1L), formattedSubmissionTime, - jobDescription, + formattedJobDescription, detailUrl ) } From 2879236b92b5712b7438b972404375bbf1993df8 Mon Sep 17 00:00:00 2001 From: guoxiaolong Date: Mon, 15 Jan 2018 02:02:49 +0800 Subject: [PATCH 090/566] [SPARK-22999][SQL] show databases like command' can remove the like keyword ## What changes were proposed in this pull request? SHOW DATABASES (LIKE pattern = STRING)? Can be like the back increase? When using this command, LIKE keyword can be removed. You can refer to the SHOW TABLES command, SHOW TABLES 'test *' and SHOW TABELS like 'test *' can be used. Similarly SHOW DATABASES 'test *' and SHOW DATABASES like 'test *' can be used. ## How was this patch tested? 
unit tests manual tests Please review http://spark.apache.org/contributing.html before opening a pull request. Author: guoxiaolong Closes #20194 from guoxiaolongzte/SPARK-22999. (cherry picked from commit 42a1a15d739890bdfbb367ef94198b19e98ffcb7) Signed-off-by: gatorsmile --- .../antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 | 2 +- .../org/apache/spark/sql/execution/command/DDLSuite.scala | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 6daf01d98426c..39d5e4ed56628 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -141,7 +141,7 @@ statement (LIKE? pattern=STRING)? #showTables | SHOW TABLE EXTENDED ((FROM | IN) db=identifier)? LIKE pattern=STRING partitionSpec? #showTable - | SHOW DATABASES (LIKE pattern=STRING)? #showDatabases + | SHOW DATABASES (LIKE? pattern=STRING)? #showDatabases | SHOW TBLPROPERTIES table=tableIdentifier ('(' key=tablePropertyKey ')')? #showTblProperties | SHOW COLUMNS (FROM | IN) tableIdentifier diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 591510c1d8283..2b4b7c137428a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -991,6 +991,10 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { sql("SHOW DATABASES LIKE '*db1A'"), Row("showdb1a") :: Nil) + checkAnswer( + sql("SHOW DATABASES '*db1A'"), + Row("showdb1a") :: Nil) + checkAnswer( sql("SHOW DATABASES LIKE 'showdb1A'"), Row("showdb1a") :: Nil) From 30574fd3716dbdf553cfd0f4d33164ab8fbccb77 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Mon, 15 Jan 2018 10:55:21 +0800 Subject: [PATCH 091/566] [SPARK-23054][SQL] Fix incorrect results of casting UserDefinedType to String ## What changes were proposed in this pull request? This pr fixed the issue when casting `UserDefinedType`s into strings; ``` >>> from pyspark.ml.classification import MultilayerPerceptronClassifier >>> from pyspark.ml.linalg import Vectors >>> df = spark.createDataFrame([(0.0, Vectors.dense([0.0, 0.0])), (1.0, Vectors.dense([0.0, 1.0]))], ["label", "features"]) >>> df.selectExpr("CAST(features AS STRING)").show(truncate = False) +-------------------------------------------+ |features | +-------------------------------------------+ |[6,1,0,0,2800000020,2,0,0,0] | |[6,1,0,0,2800000020,2,0,0,3ff0000000000000]| +-------------------------------------------+ ``` The root cause is that `Cast` handles input data as `UserDefinedType.sqlType`(this is underlying storage type), so we should pass data into `UserDefinedType.deserialize` then `toString`. This pr modified the result into; ``` +---------+ |features | +---------+ |[0.0,0.0]| |[0.0,1.0]| +---------+ ``` ## How was this patch tested? Added tests in `UserDefinedTypeSuite `. Author: Takeshi Yamamuro Closes #20246 from maropu/SPARK-23054. 
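For reference, a rough Scala equivalent of the Python repro above (the DataFrame and vector values are illustrative; the printed table is the post-fix behavior described in the commit message):

```scala
import org.apache.spark.ml.linalg.Vectors
import spark.implicits._ // e.g. in spark-shell, where `spark` is the active SparkSession

val df = Seq(
  (0.0, Vectors.dense(0.0, 0.0)),
  (1.0, Vectors.dense(0.0, 1.0))
).toDF("label", "features")

// Before the fix the cast rendered the UDT's underlying storage struct;
// with the fix the value is deserialized first, so the vector's own toString is used.
df.selectExpr("CAST(features AS STRING)").show(truncate = false)
// +---------+
// |features |
// +---------+
// |[0.0,0.0]|
// |[0.0,1.0]|
// +---------+
```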
(cherry picked from commit b98ffa4d6dabaf787177d3f14b200fc4b118c7ce) Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/expressions/Cast.scala | 7 +++++++ .../apache/spark/sql/UserDefinedTypeSuite.scala | 15 +++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index f21aa1e9e3135..a95ebe301b9d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -282,6 +282,8 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String builder.append("]") builder.build() }) + case udt: UserDefinedType[_] => + buildCast[Any](_, o => UTF8String.fromString(udt.deserialize(o).toString)) case _ => buildCast[Any](_, o => UTF8String.fromString(o.toString)) } @@ -836,6 +838,11 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String |$evPrim = $buffer.build(); """.stripMargin } + case udt: UserDefinedType[_] => + val udtRef = ctx.addReferenceObj("udt", udt) + (c, evPrim, evNull) => { + s"$evPrim = UTF8String.fromString($udtRef.deserialize($c).toString());" + } case _ => (c, evPrim, evNull) => s"$evPrim = UTF8String.fromString(String.valueOf($c));" } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index a08433ba794d9..cc8b600efa46a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -21,7 +21,7 @@ import scala.beans.{BeanInfo, BeanProperty} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow +import org.apache.spark.sql.catalyst.expressions.{Cast, ExpressionEvalHelper, GenericInternalRow, Literal} import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData} import org.apache.spark.sql.execution.datasources.parquet.ParquetTest import org.apache.spark.sql.functions._ @@ -44,6 +44,8 @@ object UDT { case v: MyDenseVector => java.util.Arrays.equals(this.data, v.data) case _ => false } + + override def toString: String = data.mkString("(", ", ", ")") } private[sql] class MyDenseVectorUDT extends UserDefinedType[MyDenseVector] { @@ -143,7 +145,8 @@ private[spark] class ExampleSubTypeUDT extends UserDefinedType[IExampleSubType] override def userClass: Class[IExampleSubType] = classOf[IExampleSubType] } -class UserDefinedTypeSuite extends QueryTest with SharedSQLContext with ParquetTest { +class UserDefinedTypeSuite extends QueryTest with SharedSQLContext with ParquetTest + with ExpressionEvalHelper { import testImplicits._ private lazy val pointsRDD = Seq( @@ -304,4 +307,12 @@ class UserDefinedTypeSuite extends QueryTest with SharedSQLContext with ParquetT pointsRDD.except(pointsRDD2), Seq(Row(0.0, new UDT.MyDenseVector(Array(0.2, 2.0))))) } + + test("SPARK-23054 Cast UserDefinedType to string") { + val udt = new UDT.MyDenseVectorUDT() + val vector = new UDT.MyDenseVector(Array(1.0, 3.0, 5.0, 7.0, 9.0)) + val data = udt.serialize(vector) + val ret = Cast(Literal(data, udt), StringType, None) + checkEvaluation(ret, "(1.0, 3.0, 5.0, 7.0, 9.0)") + } } From 81b989903af0cdcb6c19e6e8e7bdbac455a2c281 Mon Sep 
17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 15 Jan 2018 12:06:56 +0800 Subject: [PATCH 092/566] [SPARK-23049][SQL] `spark.sql.files.ignoreCorruptFiles` should work for ORC files ## What changes were proposed in this pull request? When `spark.sql.files.ignoreCorruptFiles=true`, we should ignore corrupted ORC files. ## How was this patch tested? Pass the Jenkins with a newly added test case. Author: Dongjoon Hyun Closes #20240 from dongjoon-hyun/SPARK-23049. (cherry picked from commit 9a96bfc8bf021cb4b6c62fac6ce1bcf87affcd43) Signed-off-by: Wenchen Fan --- .../execution/datasources/orc/OrcUtils.scala | 29 ++++++++---- .../datasources/orc/OrcQuerySuite.scala | 47 +++++++++++++++++++ .../parquet/ParquetQuerySuite.scala | 23 +++++++-- .../spark/sql/hive/orc/OrcFileFormat.scala | 8 +++- .../spark/sql/hive/orc/OrcFileOperator.scala | 28 +++++++++-- 5 files changed, 117 insertions(+), 18 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala index 13a23996f4ade..460194ba61c8b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala @@ -23,6 +23,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.orc.{OrcFile, Reader, TypeDescription} +import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession @@ -50,23 +51,35 @@ object OrcUtils extends Logging { paths } - def readSchema(file: Path, conf: Configuration): Option[TypeDescription] = { + def readSchema(file: Path, conf: Configuration, ignoreCorruptFiles: Boolean) + : Option[TypeDescription] = { val fs = file.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) - val reader = OrcFile.createReader(file, readerOptions) - val schema = reader.getSchema - if (schema.getFieldNames.size == 0) { - None - } else { - Some(schema) + try { + val reader = OrcFile.createReader(file, readerOptions) + val schema = reader.getSchema + if (schema.getFieldNames.size == 0) { + None + } else { + Some(schema) + } + } catch { + case e: org.apache.orc.FileFormatException => + if (ignoreCorruptFiles) { + logWarning(s"Skipped the footer in the corrupted file: $file", e) + None + } else { + throw new SparkException(s"Could not read footer for file: $file", e) + } } } def readSchema(sparkSession: SparkSession, files: Seq[FileStatus]) : Option[StructType] = { + val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles val conf = sparkSession.sessionState.newHadoopConf() // TODO: We need to support merge schema. Please see SPARK-11412. 
- files.map(_.getPath).flatMap(readSchema(_, conf)).headOption.map { schema => + files.map(_.getPath).flatMap(readSchema(_, conf, ignoreCorruptFiles)).headOption.map { schema => logDebug(s"Reading schema from file $files, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema.toString).asInstanceOf[StructType] } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala index e00e057a18cc6..f58c331f33ca8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala @@ -31,6 +31,7 @@ import org.apache.orc.OrcConf.COMPRESS import org.apache.orc.mapred.OrcStruct import org.apache.orc.mapreduce.OrcInputFormat +import org.apache.spark.SparkException import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, RecordReaderIterator} @@ -531,6 +532,52 @@ abstract class OrcQueryTest extends OrcTest { val df = spark.read.orc(path1.getCanonicalPath, path2.getCanonicalPath) assert(df.count() == 20) } + + test("Enabling/disabling ignoreCorruptFiles") { + def testIgnoreCorruptFiles(): Unit = { + withTempDir { dir => + val basePath = dir.getCanonicalPath + spark.range(1).toDF("a").write.orc(new Path(basePath, "first").toString) + spark.range(1, 2).toDF("a").write.orc(new Path(basePath, "second").toString) + spark.range(2, 3).toDF("a").write.json(new Path(basePath, "third").toString) + val df = spark.read.orc( + new Path(basePath, "first").toString, + new Path(basePath, "second").toString, + new Path(basePath, "third").toString) + checkAnswer(df, Seq(Row(0), Row(1))) + } + } + + def testIgnoreCorruptFilesWithoutSchemaInfer(): Unit = { + withTempDir { dir => + val basePath = dir.getCanonicalPath + spark.range(1).toDF("a").write.orc(new Path(basePath, "first").toString) + spark.range(1, 2).toDF("a").write.orc(new Path(basePath, "second").toString) + spark.range(2, 3).toDF("a").write.json(new Path(basePath, "third").toString) + val df = spark.read.schema("a long").orc( + new Path(basePath, "first").toString, + new Path(basePath, "second").toString, + new Path(basePath, "third").toString) + checkAnswer(df, Seq(Row(0), Row(1))) + } + } + + withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "true") { + testIgnoreCorruptFiles() + testIgnoreCorruptFilesWithoutSchemaInfer() + } + + withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "false") { + val m1 = intercept[SparkException] { + testIgnoreCorruptFiles() + }.getMessage + assert(m1.contains("Could not read footer for file")) + val m2 = intercept[SparkException] { + testIgnoreCorruptFilesWithoutSchemaInfer() + }.getMessage + assert(m2.contains("Malformed ORC file")) + } + } } class OrcQuerySuite extends OrcQueryTest with SharedSQLContext { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index 4c8c9ef6e0432..6ad88ed997ce7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -320,14 +320,27 @@ class ParquetQuerySuite extends QueryTest with 
ParquetTest with SharedSQLContext new Path(basePath, "first").toString, new Path(basePath, "second").toString, new Path(basePath, "third").toString) - checkAnswer( - df, - Seq(Row(0), Row(1))) + checkAnswer(df, Seq(Row(0), Row(1))) + } + } + + def testIgnoreCorruptFilesWithoutSchemaInfer(): Unit = { + withTempDir { dir => + val basePath = dir.getCanonicalPath + spark.range(1).toDF("a").write.parquet(new Path(basePath, "first").toString) + spark.range(1, 2).toDF("a").write.parquet(new Path(basePath, "second").toString) + spark.range(2, 3).toDF("a").write.json(new Path(basePath, "third").toString) + val df = spark.read.schema("a long").parquet( + new Path(basePath, "first").toString, + new Path(basePath, "second").toString, + new Path(basePath, "third").toString) + checkAnswer(df, Seq(Row(0), Row(1))) } } withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "true") { testIgnoreCorruptFiles() + testIgnoreCorruptFilesWithoutSchemaInfer() } withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "false") { @@ -335,6 +348,10 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext testIgnoreCorruptFiles() } assert(exception.getMessage().contains("is not a Parquet file")) + val exception2 = intercept[SparkException] { + testIgnoreCorruptFilesWithoutSchemaInfer() + } + assert(exception2.getMessage().contains("is not a Parquet file")) } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala index 95741c7b30289..237ed9bc05988 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala @@ -59,9 +59,11 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = { + val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles OrcFileOperator.readSchema( files.map(_.getPath.toString), - Some(sparkSession.sessionState.newHadoopConf()) + Some(sparkSession.sessionState.newHadoopConf()), + ignoreCorruptFiles ) } @@ -129,6 +131,7 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable val broadcastedHadoopConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) + val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles (file: PartitionedFile) => { val conf = broadcastedHadoopConf.value.value @@ -138,7 +141,8 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable // SPARK-8501: Empty ORC files always have an empty schema stored in their footer. In this // case, `OrcFileOperator.readSchema` returns `None`, and we can't read the underlying file // using the given physical schema. Instead, we simply return an empty iterator. 
- val isEmptyFile = OrcFileOperator.readSchema(Seq(filePath.toString), Some(conf)).isEmpty + val isEmptyFile = + OrcFileOperator.readSchema(Seq(filePath.toString), Some(conf), ignoreCorruptFiles).isEmpty if (isEmptyFile) { Iterator.empty } else { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala index 5a3fcd7a759c0..80e44ca504356 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala @@ -17,11 +17,14 @@ package org.apache.spark.sql.hive.orc +import java.io.IOException + import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector +import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser @@ -46,7 +49,10 @@ private[hive] object OrcFileOperator extends Logging { * create the result reader from that file. If no such file is found, it returns `None`. * @todo Needs to consider all files when schema evolution is taken into account. */ - def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { + def getFileReader(basePath: String, + config: Option[Configuration] = None, + ignoreCorruptFiles: Boolean = false) + : Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => @@ -65,16 +71,28 @@ private[hive] object OrcFileOperator extends Logging { } listOrcFiles(basePath, conf).iterator.map { path => - path -> OrcFile.createReader(fs, path) + val reader = try { + Some(OrcFile.createReader(fs, path)) + } catch { + case e: IOException => + if (ignoreCorruptFiles) { + logWarning(s"Skipped the footer in the corrupted file: $path", e) + None + } else { + throw new SparkException(s"Could not read footer for file: $path", e) + } + } + path -> reader }.collectFirst { - case (path, reader) if isWithNonEmptySchema(path, reader) => reader + case (path, Some(reader)) if isWithNonEmptySchema(path, reader) => reader } } - def readSchema(paths: Seq[String], conf: Option[Configuration]): Option[StructType] = { + def readSchema(paths: Seq[String], conf: Option[Configuration], ignoreCorruptFiles: Boolean) + : Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. - paths.flatMap(getFileReader(_, conf)).headOption.map { reader => + paths.flatMap(getFileReader(_, conf, ignoreCorruptFiles)).headOption.map { reader => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") From 188999a3401357399d8d2b30f440d8b0b0795fc5 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Mon, 15 Jan 2018 16:26:52 +0800 Subject: [PATCH 093/566] [SPARK-23023][SQL] Cast field data to strings in showString ## What changes were proposed in this pull request? 
The current `Dataset.showString` prints rows through `RowEncoder` deserializers like: ``` scala> Seq(Seq(Seq(1, 2), Seq(3), Seq(4, 5, 6))).toDF("a").show(false) +------------------------------------------------------------+ |a | +------------------------------------------------------------+ |[WrappedArray(1, 2), WrappedArray(3), WrappedArray(4, 5, 6)]| +------------------------------------------------------------+ ``` This result is incorrect because the correct one is: ``` scala> Seq(Seq(Seq(1, 2), Seq(3), Seq(4, 5, 6))).toDF("a").show(false) +------------------------+ |a | +------------------------+ |[[1, 2], [3], [4, 5, 6]]| +------------------------+ ``` So, this PR fixes the code in `showString` to cast field data to strings before printing. ## How was this patch tested? Added tests in `DataFrameSuite`. Author: Takeshi Yamamuro Closes #20214 from maropu/SPARK-23023. (cherry picked from commit b59808385cfe24ce768e5b3098b9034e64b99a5a) Signed-off-by: Wenchen Fan --- python/pyspark/sql/functions.py | 32 +++++++++---------- .../scala/org/apache/spark/sql/Dataset.scala | 21 ++++++------ .../org/apache/spark/sql/DataFrameSuite.scala | 28 ++++++++++++++++ .../org/apache/spark/sql/DatasetSuite.scala | 12 +++---- 4 files changed, 61 insertions(+), 32 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index e1ad6590554cf..f7b3f29764040 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1849,14 +1849,14 @@ def explode_outer(col): +---+----------+----+-----+ >>> df.select("id", "a_map", explode_outer("an_array")).show() - +---+-------------+----+ - | id| a_map| col| - +---+-------------+----+ - | 1|Map(x -> 1.0)| foo| - | 1|Map(x -> 1.0)| bar| - | 2| Map()|null| - | 3| null|null| - +---+-------------+----+ + +---+----------+----+ + | id| a_map| col| + +---+----------+----+ + | 1|[x -> 1.0]| foo| + | 1|[x -> 1.0]| bar| + | 2| []|null| + | 3| null|null| + +---+----------+----+ """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.explode_outer(_to_java_column(col)) @@ -1881,14 +1881,14 @@ def posexplode_outer(col): | 3| null|null|null| null| +---+----------+----+----+-----+ >>> df.select("id", "a_map", posexplode_outer("an_array")).show() - +---+-------------+----+----+ - | id| a_map| pos| col| - +---+-------------+----+----+ - | 1|Map(x -> 1.0)| 0| foo| - | 1|Map(x -> 1.0)| 1| bar| - | 2| Map()|null|null| - | 3| null|null|null| - +---+-------------+----+----+ + +---+----------+----+----+ + | id| a_map| pos| col| + +---+----------+----+----+ + | 1|[x -> 1.0]| 0| foo| + | 1|[x -> 1.0]| 1| bar| + | 2| []|null|null| + | 3| null|null|null| + +---+----------+----+----+ """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.posexplode_outer(_to_java_column(col)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 77e571272920a..34f0ab5aa6699 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -237,13 +237,20 @@ class Dataset[T] private[sql]( private[sql] def showString( _numRows: Int, truncate: Int = 20, vertical: Boolean = false): String = { val numRows = _numRows.max(0).min(Int.MaxValue - 1) - val takeResult = toDF().take(numRows + 1) + val newDf = toDF() + val castCols = newDf.logicalPlan.output.map { col => + // Since binary types in top-level schema fields have a specific format to print, + // so we do not cast them to strings here.
+ if (col.dataType == BinaryType) { + Column(col) + } else { + Column(col).cast(StringType) + } + } + val takeResult = newDf.select(castCols: _*).take(numRows + 1) val hasMoreData = takeResult.length > numRows val data = takeResult.take(numRows) - lazy val timeZone = - DateTimeUtils.getTimeZone(sparkSession.sessionState.conf.sessionLocalTimeZone) - // For array values, replace Seq and Array with square brackets // For cells that are beyond `truncate` characters, replace it with the // first `truncate-3` and "..." @@ -252,12 +259,6 @@ class Dataset[T] private[sql]( val str = cell match { case null => "null" case binary: Array[Byte] => binary.map("%02X".format(_)).mkString("[", " ", "]") - case array: Array[_] => array.mkString("[", ", ", "]") - case seq: Seq[_] => seq.mkString("[", ", ", "]") - case d: Date => - DateTimeUtils.dateToString(DateTimeUtils.fromJavaDate(d)) - case ts: Timestamp => - DateTimeUtils.timestampToString(DateTimeUtils.fromJavaTimestamp(ts), timeZone) case _ => cell.toString } if (truncate > 0 && str.length > truncate) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 5e4c1a6a484fb..33707080c1301 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1255,6 +1255,34 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { assert(testData.select($"*").showString(1, vertical = true) === expectedAnswer) } + test("SPARK-23023 Cast rows to strings in showString") { + val df1 = Seq(Seq(1, 2, 3, 4)).toDF("a") + assert(df1.showString(10) === + s"""+------------+ + || a| + |+------------+ + ||[1, 2, 3, 4]| + |+------------+ + |""".stripMargin) + val df2 = Seq(Map(1 -> "a", 2 -> "b")).toDF("a") + assert(df2.showString(10) === + s"""+----------------+ + || a| + |+----------------+ + ||[1 -> a, 2 -> b]| + |+----------------+ + |""".stripMargin) + val df3 = Seq(((1, "a"), 0), ((2, "b"), 0)).toDF("a", "b") + assert(df3.showString(10) === + s"""+------+---+ + || a| b| + |+------+---+ + ||[1, a]| 0| + ||[2, b]| 0| + |+------+---+ + |""".stripMargin) + } + test("SPARK-7327 show with empty dataFrame") { val expectedAnswer = """+---+-----+ ||key|value| diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 54893c184642b..49c59cf695dc1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -958,12 +958,12 @@ class DatasetSuite extends QueryTest with SharedSQLContext { ).toDS() val expected = - """+-------+ - || f| - |+-------+ - ||[foo,1]| - ||[bar,2]| - |+-------+ + """+--------+ + || f| + |+--------+ + ||[foo, 1]| + ||[bar, 2]| + |+--------+ |""".stripMargin checkShowString(ds, expected) From 3491ca4fb5c2e3fecd727f7a31b8efbe74032bcc Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Mon, 15 Jan 2018 07:49:34 -0600 Subject: [PATCH 094/566] [SPARK-19550][BUILD][FOLLOW-UP] Remove MaxPermSize for sql module ## What changes were proposed in this pull request? Remove `MaxPermSize` for `sql` module ## How was this patch tested? Manually tested. Author: Yuming Wang Closes #20268 from wangyum/SPARK-19550-MaxPermSize. 
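For context on the MaxPermSize removal above: PermGen was removed in Java 8, so the JVM simply ignores `-XX:MaxPermSize` and prints a startup warning along these lines (wording and the example value vary by JVM build), which is why the flag can be dropped from the scalatest-maven-plugin `argLine` without changing test behavior:

```
Java HotSpot(TM) 64-Bit Server VM warning: ignoring option MaxPermSize=512m; support was removed in 8.0
```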
(cherry picked from commit a38c887ac093d7cf343d807515147d87ca931ce7) Signed-off-by: Sean Owen --- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index d053007755dfd..865d3b3f6ad4e 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -134,7 +134,7 @@ org.scalatest scalatest-maven-plugin - -ea -Xmx4g -Xss4m -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m + -ea -Xmx4g -Xss4m -XX:ReservedCodeCacheSize=${CodeCacheSize} diff --git a/sql/core/pom.xml b/sql/core/pom.xml index ac5efeae37f33..f39bffe417666 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -195,7 +195,7 @@ org.scalatest scalatest-maven-plugin - -ea -Xmx4g -Xss4m -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m + -ea -Xmx4g -Xss4m -XX:ReservedCodeCacheSize=${CodeCacheSize} From c6a3b9297f0246cfc02a57ec099ca23db90f343f Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Mon, 15 Jan 2018 22:32:38 +0800 Subject: [PATCH 095/566] [SPARK-23070] Bump previousSparkVersion in MimaBuild.scala to be 2.2.0 ## What changes were proposed in this pull request? Bump previousSparkVersion in MimaBuild.scala to be 2.2.0 and add the missing exclusions to `v23excludes` in `MimaExcludes`. No item can be un-excluded in `v23excludes`. ## How was this patch tested? The existing tests. Author: gatorsmile Closes #20264 from gatorsmile/bump22. (cherry picked from commit bd08a9e7af4137bddca638e627ad2ae531bce20f) Signed-off-by: gatorsmile --- project/MimaBuild.scala | 2 +- project/MimaExcludes.scala | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/project/MimaBuild.scala b/project/MimaBuild.scala index 2ef0e7b40d940..adde213e361f0 100644 --- a/project/MimaBuild.scala +++ b/project/MimaBuild.scala @@ -88,7 +88,7 @@ object MimaBuild { def mimaSettings(sparkHome: File, projectRef: ProjectRef) = { val organization = "org.apache.spark" - val previousSparkVersion = "2.0.0" + val previousSparkVersion = "2.2.0" val project = projectRef.project val fullId = "spark-" + project + "_2.11" mimaDefaultSettings ++ diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 3b452f35c5ec1..eec2e2b1757ae 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -98,7 +98,40 @@ object MimaExcludes { // [SPARK-21087] CrossValidator, TrainValidationSplit expose sub models after fitting: Scala ProblemFilters.exclude[FinalClassProblem]("org.apache.spark.ml.tuning.CrossValidatorModel$CrossValidatorModelWriter"), - ProblemFilters.exclude[FinalClassProblem]("org.apache.spark.ml.tuning.TrainValidationSplitModel$TrainValidationSplitModelWriter") + ProblemFilters.exclude[FinalClassProblem]("org.apache.spark.ml.tuning.TrainValidationSplitModel$TrainValidationSplitModelWriter"), + + // [SPARK-21728][CORE] Allow SparkSubmit to use Logging + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.SparkSubmit.downloadFileList"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.SparkSubmit.downloadFile"), + + // [SPARK-21714][CORE][YARN] Avoiding re-uploading remote resources in yarn client mode + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment"), + + // [SPARK-22324][SQL][PYTHON] Upgrade Arrow to 0.8.0 + ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.network.util.AbstractFileRegion.transfered"), + + // [SPARK-20643][CORE] Add listener 
implementation to collect app state + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.status.api.v1.TaskData.$default$5"), + + // [SPARK-20648][CORE] Port JobsTab and StageTab to the new UI backend + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.status.api.v1.TaskData.$default$12"), + + // [SPARK-21462][SS] Added batchId to StreamingQueryProgress.json + // [SPARK-21409][SS] Expose state store memory usage in SQL metrics and progress updates + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StateOperatorProgress.this"), + + // [SPARK-22278][SS] Expose current event time watermark and current processing time in GroupState + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.streaming.GroupState.getCurrentWatermarkMs"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.streaming.GroupState.getCurrentProcessingTimeMs"), + + // [SPARK-20542][ML][SQL] Add an API to Bucketizer that can bin multiple columns + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.param.shared.HasOutputCols.org$apache$spark$ml$param$shared$HasOutputCols$_setter_$outputCols_="), + + // [SPARK-18619][ML] Make QuantileDiscretizer/Bucketizer/StringIndexer/RFormula inherit from HasHandleInvalid + ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.feature.Bucketizer.getHandleInvalid"), + ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.feature.StringIndexer.getHandleInvalid"), + ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.feature.QuantileDiscretizer.getHandleInvalid"), + ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.feature.StringIndexerModel.getHandleInvalid") ) // Exclude rules for 2.2.x From 706a308bdf4a2492d7e01bd3cf2710072704de0e Mon Sep 17 00:00:00 2001 From: xubo245 <601450868@qq.com> Date: Mon, 15 Jan 2018 23:13:15 +0800 Subject: [PATCH 096/566] [SPARK-23035][SQL] Fix improper information of TempTableAlreadyExistsException ## What changes were proposed in this pull request? Problem: creating a temp view through org.apache.spark.sql.catalyst.catalog.GlobalTempViewManager#create throws TempTableAlreadyExistsException with the message "Temporary table '$table' already exists", which is misleading for a view. This patch fixes the message of TempTableAlreadyExistsException when creating a temp view: "Temporary table" becomes "Temporary view". ## How was this patch tested? test("rename temporary view - destination table already exists, with: CREATE TEMPORARY view") test("rename temporary view - destination table with database name,with:CREATE TEMPORARY view") Author: xubo245 <601450868@qq.com> Closes #20227 from xubo245/fixDeprecated.
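For reference, a minimal sketch of the corrected behavior (illustrative only, not part of the patch; assumes a session with a `spark` SparkSession, and the view name `v` is arbitrary):

```
// Creating the same temporary view twice: after this patch the second call fails with
// "Temporary view 'v' already exists" rather than the old "Temporary table ..." wording.
spark.range(1).createTempView("v")
spark.range(1).createTempView("v")  // throws TempTableAlreadyExistsException
```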
(cherry picked from commit 6c81fe227a6233f5d9665d2efadf8a1cf09f700d) Signed-off-by: gatorsmile --- .../analysis/AlreadyExistException.scala | 2 +- .../catalog/SessionCatalogSuite.scala | 6 +- .../spark/sql/execution/SQLViewSuite.scala | 2 +- .../sql/execution/command/DDLSuite.scala | 75 ++++++++++++++++++- 4 files changed, 78 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala index 57f7a80bedc6c..6d587abd8fd4d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala @@ -31,7 +31,7 @@ class TableAlreadyExistsException(db: String, table: String) extends AnalysisException(s"Table or view '$table' already exists in database '$db'") class TempTableAlreadyExistsException(table: String) - extends AnalysisException(s"Temporary table '$table' already exists") + extends AnalysisException(s"Temporary view '$table' already exists") class PartitionAlreadyExistsException(db: String, table: String, spec: TablePartitionSpec) extends AnalysisException( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala index 95c87ffa20cb7..6abab0073cca3 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala @@ -279,7 +279,7 @@ abstract class SessionCatalogSuite extends AnalysisTest { } } - test("create temp table") { + test("create temp view") { withBasicCatalog { catalog => val tempTable1 = Range(1, 10, 1, 10) val tempTable2 = Range(1, 20, 2, 10) @@ -288,11 +288,11 @@ abstract class SessionCatalogSuite extends AnalysisTest { assert(catalog.getTempView("tbl1") == Option(tempTable1)) assert(catalog.getTempView("tbl2") == Option(tempTable2)) assert(catalog.getTempView("tbl3").isEmpty) - // Temporary table already exists + // Temporary view already exists intercept[TempTableAlreadyExistsException] { catalog.createTempView("tbl1", tempTable1, overrideIfExists = false) } - // Temporary table already exists but we override it + // Temporary view already exists but we override it catalog.createTempView("tbl1", tempTable2, overrideIfExists = true) assert(catalog.getTempView("tbl1") == Option(tempTable2)) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala index 8c55758cfe38d..14082197ba0bd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala @@ -293,7 +293,7 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { sql("CREATE TEMPORARY VIEW testView AS SELECT id FROM jt") } - assert(e.message.contains("Temporary table") && e.message.contains("already exists")) + assert(e.message.contains("Temporary view") && e.message.contains("already exists")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 2b4b7c137428a..6ca21b5aa1595 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -835,6 +835,31 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } + test("rename temporary view - destination table with database name,with:CREATE TEMPORARY view") { + withTempView("view1") { + sql( + """ + |CREATE TEMPORARY VIEW view1 + |USING org.apache.spark.sql.sources.DDLScanSource + |OPTIONS ( + | From '1', + | To '10', + | Table 'test1' + |) + """.stripMargin) + + val e = intercept[AnalysisException] { + sql("ALTER TABLE view1 RENAME TO default.tab2") + } + assert(e.getMessage.contains( + "RENAME TEMPORARY VIEW from '`view1`' to '`default`.`tab2`': " + + "cannot specify database name 'default' in the destination table")) + + val catalog = spark.sessionState.catalog + assert(catalog.listTables("default") == Seq(TableIdentifier("view1"))) + } + } + test("rename temporary view") { withTempView("tab1", "tab2") { spark.range(10).createOrReplaceTempView("tab1") @@ -883,6 +908,42 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } + test("rename temporary view - destination table already exists, with: CREATE TEMPORARY view") { + withTempView("view1", "view2") { + sql( + """ + |CREATE TEMPORARY VIEW view1 + |USING org.apache.spark.sql.sources.DDLScanSource + |OPTIONS ( + | From '1', + | To '10', + | Table 'test1' + |) + """.stripMargin) + + sql( + """ + |CREATE TEMPORARY VIEW view2 + |USING org.apache.spark.sql.sources.DDLScanSource + |OPTIONS ( + | From '1', + | To '10', + | Table 'test1' + |) + """.stripMargin) + + val e = intercept[AnalysisException] { + sql("ALTER TABLE view1 RENAME TO view2") + } + assert(e.getMessage.contains( + "RENAME TEMPORARY VIEW from '`view1`' to '`view2`': destination table already exists")) + + val catalog = spark.sessionState.catalog + assert(catalog.listTables("default") == + Seq(TableIdentifier("view1"), TableIdentifier("view2"))) + } + } + test("alter table: bucketing is not supported") { val catalog = spark.sessionState.catalog val tableIdent = TableIdentifier("tab1", Some("dbx")) @@ -1728,12 +1789,22 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } test("block creating duplicate temp table") { - withView("t_temp") { + withTempView("t_temp") { sql("CREATE TEMPORARY VIEW t_temp AS SELECT 1, 2") val e = intercept[TempTableAlreadyExistsException] { sql("CREATE TEMPORARY TABLE t_temp (c3 int, c4 string) USING JSON") }.getMessage - assert(e.contains("Temporary table 't_temp' already exists")) + assert(e.contains("Temporary view 't_temp' already exists")) + } + } + + test("block creating duplicate temp view") { + withTempView("t_temp") { + sql("CREATE TEMPORARY VIEW t_temp AS SELECT 1, 2") + val e = intercept[TempTableAlreadyExistsException] { + sql("CREATE TEMPORARY VIEW t_temp (c3 int, c4 string) USING JSON") + }.getMessage + assert(e.contains("Temporary view 't_temp' already exists")) } } From bb8e5addc79652308169532c33baa8117c2464ca Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 16 Jan 2018 11:47:42 +0900 Subject: [PATCH 097/566] [SPARK-23080][SQL] Improve error message for built-in functions ## What changes were proposed in this pull request? When a user puts the wrong number of parameters in a function, an AnalysisException is thrown. If the function is a UDF, he user is told how many parameters the function expected and how many he/she put. 
If the function, instead, is a built-in one, no information about the number of parameters expected and the actual one is provided. This can help in some cases, to debug the errors (eg. bad quotes escaping may lead to a different number of parameters than expected, etc. etc.) The PR adds the information about the number of parameters passed and the expected one, analogously to what happens for UDF. ## How was this patch tested? modified existing UT + manual test Author: Marco Gaido Closes #20271 from mgaido91/SPARK-23080. (cherry picked from commit 8ab2d7ea99b2cff8b54b2cb3a1dbf7580845986a) Signed-off-by: hyukjinkwon --- .../spark/sql/catalyst/analysis/FunctionRegistry.scala | 10 +++++++++- .../resources/sql-tests/results/json-functions.sql.out | 4 ++-- .../src/test/scala/org/apache/spark/sql/UDFSuite.scala | 4 ++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 5ddb39822617d..747016beb06e7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -526,7 +526,15 @@ object FunctionRegistry { // Otherwise, find a constructor method that matches the number of arguments, and use that. val params = Seq.fill(expressions.size)(classOf[Expression]) val f = constructors.find(_.getParameterTypes.toSeq == params).getOrElse { - throw new AnalysisException(s"Invalid number of arguments for function $name") + val validParametersCount = constructors.map(_.getParameterCount).distinct.sorted + val expectedNumberOfParameters = if (validParametersCount.length == 1) { + validParametersCount.head.toString + } else { + validParametersCount.init.mkString("one of ", ", ", " and ") + + validParametersCount.last + } + throw new AnalysisException(s"Invalid number of arguments for function $name. " + + s"Expected: $expectedNumberOfParameters; Found: ${params.length}") } Try(f.newInstance(expressions : _*).asInstanceOf[Expression]) match { case Success(e) => e diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out index d9dc728a18e8d..581dddc89d0bb 100644 --- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out @@ -129,7 +129,7 @@ select to_json() struct<> -- !query 12 output org.apache.spark.sql.AnalysisException -Invalid number of arguments for function to_json; line 1 pos 7 +Invalid number of arguments for function to_json. Expected: one of 1, 2 and 3; Found: 0; line 1 pos 7 -- !query 13 @@ -225,7 +225,7 @@ select from_json() struct<> -- !query 21 output org.apache.spark.sql.AnalysisException -Invalid number of arguments for function from_json; line 1 pos 7 +Invalid number of arguments for function from_json. 
Expected: one of 2, 3 and 4; Found: 0; line 1 pos 7 -- !query 22 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index db37be68e42e6..af6a10b425b9f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -80,7 +80,7 @@ class UDFSuite extends QueryTest with SharedSQLContext { val e = intercept[AnalysisException] { df.selectExpr("substr('abcd', 2, 3, 4)") } - assert(e.getMessage.contains("Invalid number of arguments for function substr")) + assert(e.getMessage.contains("Invalid number of arguments for function substr. Expected:")) } test("error reporting for incorrect number of arguments - udf") { @@ -89,7 +89,7 @@ class UDFSuite extends QueryTest with SharedSQLContext { spark.udf.register("foo", (_: String).length) df.selectExpr("foo(2, 3, 4)") } - assert(e.getMessage.contains("Invalid number of arguments for function foo")) + assert(e.getMessage.contains("Invalid number of arguments for function foo. Expected:")) } test("error reporting for undefined functions") { From e2ffb97819612c062bdfcde12e27e9d04c1a846d Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Tue, 16 Jan 2018 11:20:18 +0800 Subject: [PATCH 098/566] [SPARK-23000] Use fully qualified table names in HiveMetastoreCatalogSuite ## What changes were proposed in this pull request? In another attempt to fix DataSourceWithHiveMetastoreCatalogSuite, this patch uses qualified table names (`default.t`) in the individual tests. ## How was this patch tested? N/A (Test Only Change) Author: Sameer Agarwal Closes #20273 from sameeragarwal/flaky-test. (cherry picked from commit c7572b79da0a29e502890d7618eaf805a1c9f474) Signed-off-by: gatorsmile --- .../sql/hive/HiveMetastoreCatalogSuite.scala | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala index ba9b944e4a055..83b4c862e2546 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala @@ -166,13 +166,13 @@ class DataSourceWithHiveMetastoreCatalogSuite )) ).foreach { case (provider, (inputFormat, outputFormat, serde)) => test(s"Persist non-partitioned $provider relation into metastore as managed table") { - withTable("t") { + withTable("default.t") { withSQLConf(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> "true") { testDF .write .mode(SaveMode.Overwrite) .format(provider) - .saveAsTable("t") + .saveAsTable("default.t") } val hiveTable = sessionState.catalog.getTableMetadata(TableIdentifier("t", Some("default"))) @@ -187,14 +187,15 @@ class DataSourceWithHiveMetastoreCatalogSuite assert(columns.map(_.name) === Seq("d1", "d2")) assert(columns.map(_.dataType) === Seq(DecimalType(10, 3), StringType)) - checkAnswer(table("t"), testDF) - assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM t") === Seq("1.1\t1", "2.1\t2")) + checkAnswer(table("default.t"), testDF) + assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM default.t") === + Seq("1.1\t1", "2.1\t2")) } } test(s"Persist non-partitioned $provider relation into metastore as external table") { withTempPath { dir => - withTable("t") { + withTable("default.t") { val path = dir.getCanonicalFile 
withSQLConf(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> "true") { @@ -203,7 +204,7 @@ class DataSourceWithHiveMetastoreCatalogSuite .mode(SaveMode.Overwrite) .format(provider) .option("path", path.toString) - .saveAsTable("t") + .saveAsTable("default.t") } val hiveTable = @@ -219,8 +220,8 @@ class DataSourceWithHiveMetastoreCatalogSuite assert(columns.map(_.name) === Seq("d1", "d2")) assert(columns.map(_.dataType) === Seq(DecimalType(10, 3), StringType)) - checkAnswer(table("t"), testDF) - assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM t") === + checkAnswer(table("default.t"), testDF) + assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM default.t") === Seq("1.1\t1", "2.1\t2")) } } @@ -228,9 +229,9 @@ class DataSourceWithHiveMetastoreCatalogSuite test(s"Persist non-partitioned $provider relation into metastore as managed table using CTAS") { withTempPath { dir => - withTable("t") { + withTable("default.t") { sql( - s"""CREATE TABLE t USING $provider + s"""CREATE TABLE default.t USING $provider |OPTIONS (path '${dir.toURI}') |AS SELECT 1 AS d1, "val_1" AS d2 """.stripMargin) @@ -248,8 +249,9 @@ class DataSourceWithHiveMetastoreCatalogSuite assert(columns.map(_.name) === Seq("d1", "d2")) assert(columns.map(_.dataType) === Seq(IntegerType, StringType)) - checkAnswer(table("t"), Row(1, "val_1")) - assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM t") === Seq("1\tval_1")) + checkAnswer(table("default.t"), Row(1, "val_1")) + assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM default.t") === + Seq("1\tval_1")) } } } From e58c4a929a5cbd2d611b3e07a29fcc93a827d980 Mon Sep 17 00:00:00 2001 From: Yuanjian Li Date: Mon, 15 Jan 2018 22:01:14 -0800 Subject: [PATCH 099/566] [SPARK-22956][SS] Bug fix for 2 streams union failover scenario ## What changes were proposed in this pull request? This problem was reported by yanlin-Lynn, ivoson and LiangchangZ. Thanks! When we union 2 streams from Kafka or other sources, and one of them has no continuous data coming in while a task restarts, this will cause an `IllegalStateException`. This is mainly caused by the code in [MicroBatchExecution](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala#L190): when one stream has no continuous data, its committedOffset is the same as its availableOffset during `populateStartOffsets`, and `currentPartitionOffsets` is not properly handled in KafkaSource. We should probably also consider this scenario in other sources. ## How was this patch tested? Add a UT in KafkaSourceSuite.scala Author: Yuanjian Li Closes #20150 from xuanyuanking/SPARK-22956.
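For reference, a minimal sketch of the failover scenario described above (illustrative only, not from the patch; the broker address, topic names and checkpoint path are made up):

```
// Two Kafka-backed streams unioned into one query. If "topicA" goes quiet while "topicB"
// keeps producing, and the query is then restarted from its checkpoint, the idle source's
// committed offset equals its available offset on recovery; before this patch that path
// left KafkaSource's currentPartitionOffsets unset and could raise the IllegalStateException.
val a = spark.readStream.format("kafka")
  .option("kafka.bootstrap.servers", "host:9092")
  .option("subscribe", "topicA")
  .load()
val b = spark.readStream.format("kafka")
  .option("kafka.bootstrap.servers", "host:9092")
  .option("subscribe", "topicB")
  .load()
val query = a.union(b).writeStream
  .format("console")
  .option("checkpointLocation", "/tmp/checkpoint")
  .start()
```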
(cherry picked from commit 07ae39d0ec1f03b1c73259373a8bb599694c7860) Signed-off-by: Shixiong Zhu --- .../spark/sql/kafka010/KafkaSource.scala | 13 ++-- .../spark/sql/kafka010/KafkaSourceSuite.scala | 65 +++++++++++++++++++ .../streaming/MicroBatchExecution.scala | 6 +- .../sql/execution/streaming/memory.scala | 6 ++ 4 files changed, 81 insertions(+), 9 deletions(-) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index e9cff04ba5f2e..864a92b8f813f 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -223,6 +223,14 @@ private[kafka010] class KafkaSource( logInfo(s"GetBatch called with start = $start, end = $end") val untilPartitionOffsets = KafkaSourceOffset.getPartitionOffsets(end) + // On recovery, getBatch will get called before getOffset + if (currentPartitionOffsets.isEmpty) { + currentPartitionOffsets = Some(untilPartitionOffsets) + } + if (start.isDefined && start.get == end) { + return sqlContext.internalCreateDataFrame( + sqlContext.sparkContext.emptyRDD, schema, isStreaming = true) + } val fromPartitionOffsets = start match { case Some(prevBatchEndOffset) => KafkaSourceOffset.getPartitionOffsets(prevBatchEndOffset) @@ -305,11 +313,6 @@ private[kafka010] class KafkaSource( logInfo("GetBatch generating RDD of offset range: " + offsetRanges.sortBy(_.topicPartition.toString).mkString(", ")) - // On recovery, getBatch will get called before getOffset - if (currentPartitionOffsets.isEmpty) { - currentPartitionOffsets = Some(untilPartitionOffsets) - } - sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true) } diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index 2034b9be07f24..a0f5695fc485c 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -318,6 +318,71 @@ class KafkaSourceSuite extends KafkaSourceTest { ) } + test("SPARK-22956: currentPartitionOffsets should be set when no new data comes in") { + def getSpecificDF(range: Range.Inclusive): org.apache.spark.sql.Dataset[Int] = { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 1) + testUtils.sendMessages(topic, range.map(_.toString).toArray, Some(0)) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("maxOffsetsPerTrigger", 5) + .option("subscribe", topic) + .option("startingOffsets", "earliest") + + reader.load() + .selectExpr("CAST(value AS STRING)") + .as[String] + .map(k => k.toInt) + } + + val df1 = getSpecificDF(0 to 9) + val df2 = getSpecificDF(100 to 199) + + val kafka = df1.union(df2) + + val clock = new StreamManualClock + + val waitUntilBatchProcessed = AssertOnQuery { q => + eventually(Timeout(streamingTimeout)) { + if (!q.exception.isDefined) { + assert(clock.isStreamWaitingAt(clock.getTimeMillis())) + } + } + if (q.exception.isDefined) { + throw q.exception.get + } + true + } + + testStream(kafka)( + StartStream(ProcessingTime(100), clock), + waitUntilBatchProcessed, + // 5 from smaller topic, 5 
from bigger one + CheckLastBatch((0 to 4) ++ (100 to 104): _*), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // 5 from smaller topic, 5 from bigger one + CheckLastBatch((5 to 9) ++ (105 to 109): _*), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smaller topic empty, 5 from bigger one + CheckLastBatch(110 to 114: _*), + StopStream, + StartStream(ProcessingTime(100), clock), + waitUntilBatchProcessed, + // smallest now empty, 5 from bigger one + CheckLastBatch(115 to 119: _*), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smallest now empty, 5 from bigger one + CheckLastBatch(120 to 124: _*) + ) + } + test("cannot stop Kafka stream") { val topic = newTopic() testUtils.createTopic(topic, partitions = 5) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index 42240eeb58d4b..70407f0580f97 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -208,10 +208,8 @@ class MicroBatchExecution( * batch will be executed before getOffset is called again. */ availableOffsets.foreach { case (source: Source, end: Offset) => - if (committedOffsets.get(source).map(_ != end).getOrElse(true)) { - val start = committedOffsets.get(source) - source.getBatch(start, end) - } + val start = committedOffsets.get(source) + source.getBatch(start, end) case nonV1Tuple => // The V2 API does not have the same edge case requiring getBatch to be called // here, so we do nothing here. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala index 3041d4d703cb4..509a69dd922fb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala @@ -119,9 +119,15 @@ case class MemoryStream[A : Encoder](id: Int, sqlContext: SQLContext) val newBlocks = synchronized { val sliceStart = startOrdinal - lastOffsetCommitted.offset.toInt - 1 val sliceEnd = endOrdinal - lastOffsetCommitted.offset.toInt - 1 + assert(sliceStart <= sliceEnd, s"sliceStart: $sliceStart sliceEnd: $sliceEnd") batches.slice(sliceStart, sliceEnd) } + if (newBlocks.isEmpty) { + return sqlContext.internalCreateDataFrame( + sqlContext.sparkContext.emptyRDD, schema, isStreaming = true) + } + logDebug(generateDebugString(newBlocks, startOrdinal, endOrdinal)) newBlocks From 20c69816a63071b82b1035d4b48798c358206421 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 15 Jan 2018 22:40:44 -0800 Subject: [PATCH 100/566] [SPARK-23020][CORE] Fix races in launcher code, test. The race in the code is because the handle might update its state to the wrong state if the connection handling thread is still processing incoming data; so the handle needs to wait for the connection to finish up before checking the final state. The race in the test is because when waiting for a handle to reach a final state, the waitFor() method needs to wait until all handle state is updated (which also includes waiting for the connection thread above to finish). 
Otherwise, waitFor() may return too early, which would cause a bunch of different races (like the listener not being yet notified of the state change, or being in the middle of being notified, or the handle not being properly disposed and causing postChecks() to assert). On top of that I found, by code inspection, a couple of potential races that could make a handle end up in the wrong state when being killed. Tested by running the existing unit tests a lot (and not seeing the errors I was seeing before). Author: Marcelo Vanzin Closes #20223 from vanzin/SPARK-23020. (cherry picked from commit 66217dac4f8952a9923625908ad3dcb030763c81) Signed-off-by: Sameer Agarwal --- .../spark/launcher/SparkLauncherSuite.java | 49 ++++++++++++------- .../spark/launcher/AbstractAppHandle.java | 22 +++++++-- .../spark/launcher/ChildProcAppHandle.java | 18 ++++--- .../spark/launcher/InProcessAppHandle.java | 17 ++++--- .../spark/launcher/LauncherConnection.java | 14 +++--- .../apache/spark/launcher/LauncherServer.java | 46 ++++++++++++++--- .../org/apache/spark/launcher/BaseSuite.java | 42 +++++++++++++--- .../spark/launcher/LauncherServerSuite.java | 20 +++----- 8 files changed, 156 insertions(+), 72 deletions(-) diff --git a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java index 9d2f563b2e367..a042375c6ae91 100644 --- a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java +++ b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java @@ -17,6 +17,7 @@ package org.apache.spark.launcher; +import java.time.Duration; import java.util.Arrays; import java.util.ArrayList; import java.util.HashMap; @@ -31,6 +32,7 @@ import static org.mockito.Mockito.*; import org.apache.spark.SparkContext; +import org.apache.spark.SparkContext$; import org.apache.spark.internal.config.package$; import org.apache.spark.util.Utils; @@ -137,7 +139,9 @@ public void testInProcessLauncher() throws Exception { // Here DAGScheduler is stopped, while SparkContext.clearActiveContext may not be called yet. // Wait for a reasonable amount of time to avoid creating two active SparkContext in JVM. // See SPARK-23019 and SparkContext.stop() for details. - TimeUnit.MILLISECONDS.sleep(500); + eventually(Duration.ofSeconds(5), Duration.ofMillis(10), () -> { + assertTrue("SparkContext is still alive.", SparkContext$.MODULE$.getActive().isEmpty()); + }); } } @@ -146,26 +150,35 @@ private void inProcessLauncherTestImpl() throws Exception { SparkAppHandle.Listener listener = mock(SparkAppHandle.Listener.class); doAnswer(invocation -> { SparkAppHandle h = (SparkAppHandle) invocation.getArguments()[0]; - transitions.add(h.getState()); + synchronized (transitions) { + transitions.add(h.getState()); + } return null; }).when(listener).stateChanged(any(SparkAppHandle.class)); - SparkAppHandle handle = new InProcessLauncher() - .setMaster("local") - .setAppResource(SparkLauncher.NO_RESOURCE) - .setMainClass(InProcessTestApp.class.getName()) - .addAppArgs("hello") - .startApplication(listener); - - waitFor(handle); - assertEquals(SparkAppHandle.State.FINISHED, handle.getState()); - - // Matches the behavior of LocalSchedulerBackend. 
- List expected = Arrays.asList( - SparkAppHandle.State.CONNECTED, - SparkAppHandle.State.RUNNING, - SparkAppHandle.State.FINISHED); - assertEquals(expected, transitions); + SparkAppHandle handle = null; + try { + handle = new InProcessLauncher() + .setMaster("local") + .setAppResource(SparkLauncher.NO_RESOURCE) + .setMainClass(InProcessTestApp.class.getName()) + .addAppArgs("hello") + .startApplication(listener); + + waitFor(handle); + assertEquals(SparkAppHandle.State.FINISHED, handle.getState()); + + // Matches the behavior of LocalSchedulerBackend. + List expected = Arrays.asList( + SparkAppHandle.State.CONNECTED, + SparkAppHandle.State.RUNNING, + SparkAppHandle.State.FINISHED); + assertEquals(expected, transitions); + } finally { + if (handle != null) { + handle.kill(); + } + } } public static class SparkLauncherTestApp { diff --git a/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java index df1e7316861d4..daf0972f824dd 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java @@ -33,7 +33,7 @@ abstract class AbstractAppHandle implements SparkAppHandle { private List listeners; private State state; private String appId; - private boolean disposed; + private volatile boolean disposed; protected AbstractAppHandle(LauncherServer server) { this.server = server; @@ -70,8 +70,7 @@ public void stop() { @Override public synchronized void disconnect() { - if (!disposed) { - disposed = true; + if (!isDisposed()) { if (connection != null) { try { connection.close(); @@ -79,7 +78,7 @@ public synchronized void disconnect() { // no-op. } } - server.unregister(this); + dispose(); } } @@ -95,6 +94,21 @@ boolean isDisposed() { return disposed; } + /** + * Mark the handle as disposed, and set it as LOST in case the current state is not final. + */ + synchronized void dispose() { + if (!isDisposed()) { + // Unregister first to make sure that the connection with the app has been really + // terminated. 
+ server.unregister(this); + if (!getState().isFinal()) { + setState(State.LOST); + } + this.disposed = true; + } + } + void setState(State s) { setState(s, false); } diff --git a/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java index 8b3f427b7750e..2b99461652e1f 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java @@ -48,14 +48,16 @@ public synchronized void disconnect() { @Override public synchronized void kill() { - disconnect(); - if (childProc != null) { - if (childProc.isAlive()) { - childProc.destroyForcibly(); + if (!isDisposed()) { + setState(State.KILLED); + disconnect(); + if (childProc != null) { + if (childProc.isAlive()) { + childProc.destroyForcibly(); + } + childProc = null; } - childProc = null; } - setState(State.KILLED); } void setChildProc(Process childProc, String loggerName, InputStream logStream) { @@ -94,8 +96,6 @@ void monitorChild() { return; } - disconnect(); - int ec; try { ec = proc.exitValue(); @@ -118,6 +118,8 @@ void monitorChild() { if (newState != null) { setState(newState, true); } + + disconnect(); } } diff --git a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java index acd64c962604f..f04263cb74a58 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java @@ -39,15 +39,16 @@ class InProcessAppHandle extends AbstractAppHandle { @Override public synchronized void kill() { - LOG.warning("kill() may leave the underlying app running in in-process mode."); - disconnect(); - - // Interrupt the thread. This is not guaranteed to kill the app, though. - if (app != null) { - app.interrupt(); + if (!isDisposed()) { + LOG.warning("kill() may leave the underlying app running in in-process mode."); + setState(State.KILLED); + disconnect(); + + // Interrupt the thread. This is not guaranteed to kill the app, though. 
+ if (app != null) { + app.interrupt(); + } } - - setState(State.KILLED); } synchronized void start(String appName, Method main, String[] args) { diff --git a/launcher/src/main/java/org/apache/spark/launcher/LauncherConnection.java b/launcher/src/main/java/org/apache/spark/launcher/LauncherConnection.java index b4a8719e26053..fd6f229b2349c 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/LauncherConnection.java +++ b/launcher/src/main/java/org/apache/spark/launcher/LauncherConnection.java @@ -95,15 +95,15 @@ protected synchronized void send(Message msg) throws IOException { } @Override - public void close() throws IOException { + public synchronized void close() throws IOException { if (!closed) { - synchronized (this) { - if (!closed) { - closed = true; - socket.close(); - } - } + closed = true; + socket.close(); } } + boolean isOpen() { + return !closed; + } + } diff --git a/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java index b8999a1d7a4f4..660c4443b20b9 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java +++ b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java @@ -217,6 +217,33 @@ void unregister(AbstractAppHandle handle) { break; } } + + // If there is a live connection for this handle, we need to wait for it to finish before + // returning, otherwise there might be a race between the connection thread processing + // buffered data and the handle cleaning up after itself, leading to potentially the wrong + // state being reported for the handle. + ServerConnection conn = null; + synchronized (clients) { + for (ServerConnection c : clients) { + if (c.handle == handle) { + conn = c; + break; + } + } + } + + if (conn != null) { + synchronized (conn) { + if (conn.isOpen()) { + try { + conn.wait(); + } catch (InterruptedException ie) { + // Ignore. 
+ } + } + } + } + unref(); } @@ -288,7 +315,7 @@ private String createSecret() { private class ServerConnection extends LauncherConnection { private TimerTask timeout; - private AbstractAppHandle handle; + volatile AbstractAppHandle handle; ServerConnection(Socket socket, TimerTask timeout) throws IOException { super(socket); @@ -338,16 +365,21 @@ protected void handle(Message msg) throws IOException { @Override public void close() throws IOException { + if (!isOpen()) { + return; + } + synchronized (clients) { clients.remove(this); } - super.close(); + + synchronized (this) { + super.close(); + notifyAll(); + } + if (handle != null) { - if (!handle.getState().isFinal()) { - LOG.log(Level.WARNING, "Lost connection to spark application."); - handle.setState(SparkAppHandle.State.LOST); - } - handle.disconnect(); + handle.dispose(); } } diff --git a/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java b/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java index 3e1a90eae98d4..3722a59d9438e 100644 --- a/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java +++ b/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java @@ -17,6 +17,7 @@ package org.apache.spark.launcher; +import java.time.Duration; import java.util.concurrent.TimeUnit; import org.junit.After; @@ -47,19 +48,46 @@ public void postChecks() { assertNull(server); } - protected void waitFor(SparkAppHandle handle) throws Exception { - long deadline = System.nanoTime() + TimeUnit.SECONDS.toNanos(10); + protected void waitFor(final SparkAppHandle handle) throws Exception { try { - while (!handle.getState().isFinal()) { - assertTrue("Timed out waiting for handle to transition to final state.", - System.nanoTime() < deadline); - TimeUnit.MILLISECONDS.sleep(10); - } + eventually(Duration.ofSeconds(10), Duration.ofMillis(10), () -> { + assertTrue("Handle is not in final state.", handle.getState().isFinal()); + }); } finally { if (!handle.getState().isFinal()) { handle.kill(); } } + + // Wait until the handle has been marked as disposed, to make sure all cleanup tasks + // have been performed. + AbstractAppHandle ahandle = (AbstractAppHandle) handle; + eventually(Duration.ofSeconds(10), Duration.ofMillis(10), () -> { + assertTrue("Handle is still not marked as disposed.", ahandle.isDisposed()); + }); + } + + /** + * Call a closure that performs a check every "period" until it succeeds, or the timeout + * elapses. 
+ */ + protected void eventually(Duration timeout, Duration period, Runnable check) throws Exception { + assertTrue("Timeout needs to be larger than period.", timeout.compareTo(period) > 0); + long deadline = System.nanoTime() + timeout.toNanos(); + int count = 0; + while (true) { + try { + count++; + check.run(); + return; + } catch (Throwable t) { + if (System.nanoTime() >= deadline) { + String msg = String.format("Failed check after %d tries: %s.", count, t.getMessage()); + throw new IllegalStateException(msg, t); + } + Thread.sleep(period.toMillis()); + } + } } } diff --git a/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java b/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java index 7e2b09ce25c9b..75c1af0c71e2a 100644 --- a/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java +++ b/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java @@ -23,12 +23,14 @@ import java.net.InetAddress; import java.net.Socket; import java.net.SocketException; +import java.time.Duration; import java.util.Arrays; import java.util.List; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import org.junit.Test; import static org.junit.Assert.*; @@ -197,28 +199,20 @@ private void close(Closeable c) { * server-side close immediately. */ private void waitForError(TestClient client, String secret) throws Exception { - boolean helloSent = false; - int maxTries = 10; - for (int i = 0; i < maxTries; i++) { + final AtomicBoolean helloSent = new AtomicBoolean(); + eventually(Duration.ofSeconds(1), Duration.ofMillis(10), () -> { try { - if (!helloSent) { + if (!helloSent.get()) { client.send(new Hello(secret, "1.4.0")); - helloSent = true; + helloSent.set(true); } else { client.send(new SetAppId("appId")); } fail("Expected error but message went through."); } catch (IllegalStateException | IOException e) { // Expected. - break; - } catch (AssertionError e) { - if (i < maxTries - 1) { - Thread.sleep(100); - } else { - throw new AssertionError("Test failed after " + maxTries + " attempts.", e); - } } - } + }); } private static class TestClient extends LauncherConnection { From 5c06ee2d49987c297e93de87f99c701e178ba294 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 16 Jan 2018 20:20:33 +0900 Subject: [PATCH 101/566] [SPARK-22978][PYSPARK] Register Vectorized UDFs for SQL Statement ## What changes were proposed in this pull request? Register Vectorized UDFs for SQL Statement. For example, ```Python >>> from pyspark.sql.functions import pandas_udf, PandasUDFType >>> pandas_udf("integer", PandasUDFType.SCALAR) ... def add_one(x): ... return x + 1 ... >>> _ = spark.udf.register("add_one", add_one) >>> spark.sql("SELECT add_one(id) FROM range(3)").collect() [Row(add_one(id)=1), Row(add_one(id)=2), Row(add_one(id)=3)] ``` ## How was this patch tested? Added test cases Author: gatorsmile Closes #20171 from gatorsmile/supportVectorizedUDF. 
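A minimal end-to-end sketch of the registration contract this change introduces (assumes a live `spark` session plus pandas/pyarrow, as in the doctests above; the UDF names below are illustrative only, not part of the patch):

```Python
from pyspark.sql.functions import pandas_udf, udf, PandasUDFType
from pyspark.sql.types import IntegerType

# Plain Python function: returnType may be passed (defaults to string when omitted).
spark.udf.register("plus_one", lambda x: x + 1, IntegerType())

# Wrapped UDF (row-at-a-time or scalar pandas): leave returnType as None;
# the registered UDF reuses the wrapped function's own return type.
slen = udf(lambda s: len(s), IntegerType())
spark.udf.register("slen", slen)                    # OK
# spark.udf.register("slen", slen, IntegerType())   # TypeError: Invalid returnType ...

# Scalar pandas UDFs are accepted; a GROUP_MAP pandas UDF would raise ValueError here.
@pandas_udf("integer", PandasUDFType.SCALAR)
def pandas_plus_one(v):
    return v + 1

spark.udf.register("pandas_plus_one", pandas_plus_one)

spark.sql("SELECT slen('abc'), plus_one(id), pandas_plus_one(id) FROM range(3)").show()
```

The two error paths sketched in the comments correspond to the new tests below (`test_udf_registration_return_type_not_none` and `test_register_group_map_udf`).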
(cherry picked from commit b85eb946ac298e711dad25db0d04eee41d7fd236) Signed-off-by: hyukjinkwon --- python/pyspark/sql/catalog.py | 75 ++++++++++++++++++++++++---------- python/pyspark/sql/context.py | 51 ++++++++++++++++------- python/pyspark/sql/tests.py | 76 ++++++++++++++++++++++++++++++----- 3 files changed, 155 insertions(+), 47 deletions(-) diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py index 156603128d063..35fbe9e669adb 100644 --- a/python/pyspark/sql/catalog.py +++ b/python/pyspark/sql/catalog.py @@ -226,18 +226,23 @@ def dropGlobalTempView(self, viewName): @ignore_unicode_prefix @since(2.0) - def registerFunction(self, name, f, returnType=StringType()): + def registerFunction(self, name, f, returnType=None): """Registers a Python function (including lambda function) or a :class:`UserDefinedFunction` - as a UDF. The registered UDF can be used in SQL statement. + as a UDF. The registered UDF can be used in SQL statements. - In addition to a name and the function itself, the return type can be optionally specified. - When the return type is not given it default to a string and conversion will automatically - be done. For any other return type, the produced object must match the specified type. + :func:`spark.udf.register` is an alias for :func:`spark.catalog.registerFunction`. - :param name: name of the UDF - :param f: a Python function, or a wrapped/native UserDefinedFunction - :param returnType: a :class:`pyspark.sql.types.DataType` object - :return: a wrapped :class:`UserDefinedFunction` + In addition to a name and the function itself, `returnType` can be optionally specified. + 1) When f is a Python function, `returnType` defaults to a string. The produced object must + match the specified type. 2) When f is a :class:`UserDefinedFunction`, Spark uses the return + type of the given UDF as the return type of the registered UDF. The input parameter + `returnType` is None by default. If given by users, the value must be None. + + :param name: name of the UDF in SQL statements. + :param f: a Python function, or a wrapped/native UserDefinedFunction. The UDF can be either + row-at-a-time or vectorized. + :param returnType: the return type of the registered UDF. 
+ :return: a wrapped/native :class:`UserDefinedFunction` >>> strlen = spark.catalog.registerFunction("stringLengthString", len) >>> spark.sql("SELECT stringLengthString('test')").collect() @@ -256,27 +261,55 @@ def registerFunction(self, name, f, returnType=StringType()): >>> spark.sql("SELECT stringLengthInt('test')").collect() [Row(stringLengthInt(test)=4)] + >>> from pyspark.sql.types import IntegerType + >>> from pyspark.sql.functions import udf + >>> slen = udf(lambda s: len(s), IntegerType()) + >>> _ = spark.udf.register("slen", slen) + >>> spark.sql("SELECT slen('test')").collect() + [Row(slen(test)=4)] + >>> import random >>> from pyspark.sql.functions import udf - >>> from pyspark.sql.types import IntegerType, StringType + >>> from pyspark.sql.types import IntegerType >>> random_udf = udf(lambda: random.randint(0, 100), IntegerType()).asNondeterministic() - >>> newRandom_udf = spark.catalog.registerFunction("random_udf", random_udf, StringType()) + >>> new_random_udf = spark.catalog.registerFunction("random_udf", random_udf) >>> spark.sql("SELECT random_udf()").collect() # doctest: +SKIP - [Row(random_udf()=u'82')] - >>> spark.range(1).select(newRandom_udf()).collect() # doctest: +SKIP - [Row(random_udf()=u'62')] + [Row(random_udf()=82)] + >>> spark.range(1).select(new_random_udf()).collect() # doctest: +SKIP + [Row(()=26)] + + >>> from pyspark.sql.functions import pandas_udf, PandasUDFType + >>> @pandas_udf("integer", PandasUDFType.SCALAR) # doctest: +SKIP + ... def add_one(x): + ... return x + 1 + ... + >>> _ = spark.udf.register("add_one", add_one) # doctest: +SKIP + >>> spark.sql("SELECT add_one(id) FROM range(3)").collect() # doctest: +SKIP + [Row(add_one(id)=1), Row(add_one(id)=2), Row(add_one(id)=3)] """ # This is to check whether the input function is a wrapped/native UserDefinedFunction if hasattr(f, 'asNondeterministic'): - udf = UserDefinedFunction(f.func, returnType=returnType, name=name, - evalType=PythonEvalType.SQL_BATCHED_UDF, - deterministic=f.deterministic) + if returnType is not None: + raise TypeError( + "Invalid returnType: None is expected when f is a UserDefinedFunction, " + "but got %s." 
% returnType) + if f.evalType not in [PythonEvalType.SQL_BATCHED_UDF, + PythonEvalType.SQL_PANDAS_SCALAR_UDF]: + raise ValueError( + "Invalid f: f must be either SQL_BATCHED_UDF or SQL_PANDAS_SCALAR_UDF") + register_udf = UserDefinedFunction(f.func, returnType=f.returnType, name=name, + evalType=f.evalType, + deterministic=f.deterministic) + return_udf = f else: - udf = UserDefinedFunction(f, returnType=returnType, name=name, - evalType=PythonEvalType.SQL_BATCHED_UDF) - self._jsparkSession.udf().registerPython(name, udf._judf) - return udf._wrapped() + if returnType is None: + returnType = StringType() + register_udf = UserDefinedFunction(f, returnType=returnType, name=name, + evalType=PythonEvalType.SQL_BATCHED_UDF) + return_udf = register_udf._wrapped() + self._jsparkSession.udf().registerPython(name, register_udf._judf) + return return_udf @since(2.0) def isCached(self, tableName): diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index b8d86cc098e94..85479095af594 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -174,18 +174,23 @@ def range(self, start, end=None, step=1, numPartitions=None): @ignore_unicode_prefix @since(1.2) - def registerFunction(self, name, f, returnType=StringType()): + def registerFunction(self, name, f, returnType=None): """Registers a Python function (including lambda function) or a :class:`UserDefinedFunction` - as a UDF. The registered UDF can be used in SQL statement. + as a UDF. The registered UDF can be used in SQL statements. - In addition to a name and the function itself, the return type can be optionally specified. - When the return type is not given it default to a string and conversion will automatically - be done. For any other return type, the produced object must match the specified type. + :func:`spark.udf.register` is an alias for :func:`sqlContext.registerFunction`. - :param name: name of the UDF - :param f: a Python function, or a wrapped/native UserDefinedFunction - :param returnType: a :class:`pyspark.sql.types.DataType` object - :return: a wrapped :class:`UserDefinedFunction` + In addition to a name and the function itself, `returnType` can be optionally specified. + 1) When f is a Python function, `returnType` defaults to a string. The produced object must + match the specified type. 2) When f is a :class:`UserDefinedFunction`, Spark uses the return + type of the given UDF as the return type of the registered UDF. The input parameter + `returnType` is None by default. If given by users, the value must be None. + + :param name: name of the UDF in SQL statements. + :param f: a Python function, or a wrapped/native UserDefinedFunction. The UDF can be either + row-at-a-time or vectorized. + :param returnType: the return type of the registered UDF. 
+ :return: a wrapped/native :class:`UserDefinedFunction` >>> strlen = sqlContext.registerFunction("stringLengthString", lambda x: len(x)) >>> sqlContext.sql("SELECT stringLengthString('test')").collect() @@ -204,15 +209,31 @@ def registerFunction(self, name, f, returnType=StringType()): >>> sqlContext.sql("SELECT stringLengthInt('test')").collect() [Row(stringLengthInt(test)=4)] + >>> from pyspark.sql.types import IntegerType + >>> from pyspark.sql.functions import udf + >>> slen = udf(lambda s: len(s), IntegerType()) + >>> _ = sqlContext.udf.register("slen", slen) + >>> sqlContext.sql("SELECT slen('test')").collect() + [Row(slen(test)=4)] + >>> import random >>> from pyspark.sql.functions import udf - >>> from pyspark.sql.types import IntegerType, StringType + >>> from pyspark.sql.types import IntegerType >>> random_udf = udf(lambda: random.randint(0, 100), IntegerType()).asNondeterministic() - >>> newRandom_udf = sqlContext.registerFunction("random_udf", random_udf, StringType()) + >>> new_random_udf = sqlContext.registerFunction("random_udf", random_udf) >>> sqlContext.sql("SELECT random_udf()").collect() # doctest: +SKIP - [Row(random_udf()=u'82')] - >>> sqlContext.range(1).select(newRandom_udf()).collect() # doctest: +SKIP - [Row(random_udf()=u'62')] + [Row(random_udf()=82)] + >>> sqlContext.range(1).select(new_random_udf()).collect() # doctest: +SKIP + [Row(()=26)] + + >>> from pyspark.sql.functions import pandas_udf, PandasUDFType + >>> @pandas_udf("integer", PandasUDFType.SCALAR) # doctest: +SKIP + ... def add_one(x): + ... return x + 1 + ... + >>> _ = sqlContext.udf.register("add_one", add_one) # doctest: +SKIP + >>> sqlContext.sql("SELECT add_one(id) FROM range(3)").collect() # doctest: +SKIP + [Row(add_one(id)=1), Row(add_one(id)=2), Row(add_one(id)=3)] """ return self.sparkSession.catalog.registerFunction(name, f, returnType) @@ -575,7 +596,7 @@ class UDFRegistration(object): def __init__(self, sqlContext): self.sqlContext = sqlContext - def register(self, name, f, returnType=StringType()): + def register(self, name, f, returnType=None): return self.sqlContext.registerFunction(name, f, returnType) def registerJavaFunction(self, name, javaClassName, returnType=None): diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 80a94a91a87b3..8906618666b14 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -380,12 +380,25 @@ def test_udf2(self): self.assertEqual(4, res[0]) def test_udf3(self): - twoargs = self.spark.catalog.registerFunction( - "twoArgs", UserDefinedFunction(lambda x, y: len(x) + y), IntegerType()) - self.assertEqual(twoargs.deterministic, True) + two_args = self.spark.catalog.registerFunction( + "twoArgs", UserDefinedFunction(lambda x, y: len(x) + y)) + self.assertEqual(two_args.deterministic, True) + [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect() + self.assertEqual(row[0], u'5') + + def test_udf_registration_return_type_none(self): + two_args = self.spark.catalog.registerFunction( + "twoArgs", UserDefinedFunction(lambda x, y: len(x) + y, "integer"), None) + self.assertEqual(two_args.deterministic, True) [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect() self.assertEqual(row[0], 5) + def test_udf_registration_return_type_not_none(self): + with QuietTest(self.sc): + with self.assertRaisesRegexp(TypeError, "Invalid returnType"): + self.spark.catalog.registerFunction( + "f", UserDefinedFunction(lambda x, y: len(x) + y, StringType()), StringType()) + def test_nondeterministic_udf(self): # Test that 
nondeterministic UDFs are evaluated only once in chained UDF evaluations from pyspark.sql.functions import udf @@ -402,12 +415,12 @@ def test_nondeterministic_udf2(self): from pyspark.sql.functions import udf random_udf = udf(lambda: random.randint(6, 6), IntegerType()).asNondeterministic() self.assertEqual(random_udf.deterministic, False) - random_udf1 = self.spark.catalog.registerFunction("randInt", random_udf, StringType()) + random_udf1 = self.spark.catalog.registerFunction("randInt", random_udf) self.assertEqual(random_udf1.deterministic, False) [row] = self.spark.sql("SELECT randInt()").collect() - self.assertEqual(row[0], "6") + self.assertEqual(row[0], 6) [row] = self.spark.range(1).select(random_udf1()).collect() - self.assertEqual(row[0], "6") + self.assertEqual(row[0], 6) [row] = self.spark.range(1).select(random_udf()).collect() self.assertEqual(row[0], 6) # render_doc() reproduces the help() exception without printing output @@ -3691,7 +3704,7 @@ def tearDownClass(cls): ReusedSQLTestCase.tearDownClass() @property - def random_udf(self): + def nondeterministic_vectorized_udf(self): from pyspark.sql.functions import pandas_udf @pandas_udf('double') @@ -3726,6 +3739,21 @@ def test_vectorized_udf_basic(self): bool_f(col('bool'))) self.assertEquals(df.collect(), res.collect()) + def test_register_nondeterministic_vectorized_udf_basic(self): + from pyspark.sql.functions import pandas_udf + from pyspark.rdd import PythonEvalType + import random + random_pandas_udf = pandas_udf( + lambda x: random.randint(6, 6) + x, IntegerType()).asNondeterministic() + self.assertEqual(random_pandas_udf.deterministic, False) + self.assertEqual(random_pandas_udf.evalType, PythonEvalType.SQL_PANDAS_SCALAR_UDF) + nondeterministic_pandas_udf = self.spark.catalog.registerFunction( + "randomPandasUDF", random_pandas_udf) + self.assertEqual(nondeterministic_pandas_udf.deterministic, False) + self.assertEqual(nondeterministic_pandas_udf.evalType, PythonEvalType.SQL_PANDAS_SCALAR_UDF) + [row] = self.spark.sql("SELECT randomPandasUDF(1)").collect() + self.assertEqual(row[0], 7) + def test_vectorized_udf_null_boolean(self): from pyspark.sql.functions import pandas_udf, col data = [(True,), (True,), (None,), (False,)] @@ -4085,14 +4113,14 @@ def test_vectorized_udf_timestamps_respect_session_timezone(self): finally: self.spark.conf.set("spark.sql.session.timeZone", orig_tz) - def test_nondeterministic_udf(self): + def test_nondeterministic_vectorized_udf(self): # Test that nondeterministic UDFs are evaluated only once in chained UDF evaluations from pyspark.sql.functions import udf, pandas_udf, col @pandas_udf('double') def plus_ten(v): return v + 10 - random_udf = self.random_udf + random_udf = self.nondeterministic_vectorized_udf df = self.spark.range(10).withColumn('rand', random_udf(col('id'))) result1 = df.withColumn('plus_ten(rand)', plus_ten(df['rand'])).toPandas() @@ -4100,11 +4128,11 @@ def plus_ten(v): self.assertEqual(random_udf.deterministic, False) self.assertTrue(result1['plus_ten(rand)'].equals(result1['rand'] + 10)) - def test_nondeterministic_udf_in_aggregate(self): + def test_nondeterministic_vectorized_udf_in_aggregate(self): from pyspark.sql.functions import pandas_udf, sum df = self.spark.range(10) - random_udf = self.random_udf + random_udf = self.nondeterministic_vectorized_udf with QuietTest(self.sc): with self.assertRaisesRegexp(AnalysisException, 'nondeterministic'): @@ -4112,6 +4140,23 @@ def test_nondeterministic_udf_in_aggregate(self): with self.assertRaisesRegexp(AnalysisException, 
'nondeterministic'): df.agg(sum(random_udf(df.id))).collect() + def test_register_vectorized_udf_basic(self): + from pyspark.rdd import PythonEvalType + from pyspark.sql.functions import pandas_udf, col, expr + df = self.spark.range(10).select( + col('id').cast('int').alias('a'), + col('id').cast('int').alias('b')) + original_add = pandas_udf(lambda x, y: x + y, IntegerType()) + self.assertEqual(original_add.deterministic, True) + self.assertEqual(original_add.evalType, PythonEvalType.SQL_PANDAS_SCALAR_UDF) + new_add = self.spark.catalog.registerFunction("add1", original_add) + res1 = df.select(new_add(col('a'), col('b'))) + res2 = self.spark.sql( + "SELECT add1(t.a, t.b) FROM (SELECT id as a, id as b FROM range(10)) t") + expected = df.select(expr('a + b')) + self.assertEquals(expected.collect(), res1.collect()) + self.assertEquals(expected.collect(), res2.collect()) + @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") class GroupbyApplyTests(ReusedSQLTestCase): @@ -4147,6 +4192,15 @@ def test_simple(self): expected = df.toPandas().groupby('id').apply(foo_udf.func).reset_index(drop=True) self.assertFramesEqual(expected, result) + def test_register_group_map_udf(self): + from pyspark.sql.functions import pandas_udf, PandasUDFType + + foo_udf = pandas_udf(lambda x: x, "id long", PandasUDFType.GROUP_MAP) + with QuietTest(self.sc): + with self.assertRaisesRegexp(ValueError, 'f must be either SQL_BATCHED_UDF or ' + 'SQL_PANDAS_SCALAR_UDF'): + self.spark.catalog.registerFunction("foo_udf", foo_udf) + def test_decorator(self): from pyspark.sql.functions import pandas_udf, PandasUDFType df = self.data From 863ffdc8aefdfab107d406cfddec1d3d123b752e Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 16 Jan 2018 22:41:30 +0800 Subject: [PATCH 102/566] [SPARK-22392][SQL] data source v2 columnar batch reader ## What changes were proposed in this pull request? a new Data Source V2 interface to allow the data source to return `ColumnarBatch` during the scan. ## How was this patch tested? new tests Author: Wenchen Fan Closes #20153 from cloud-fan/columnar-reader. 
(cherry picked from commit 75db14864d2bd9b8e13154226e94d466e3a7e0a0) Signed-off-by: Wenchen Fan --- .../sources/v2/reader/DataSourceV2Reader.java | 5 +- .../v2/reader/SupportsScanColumnarBatch.java | 52 ++++++++ .../v2/reader/SupportsScanUnsafeRow.java | 2 +- .../sql/execution/ColumnarBatchScan.scala | 37 +++++- .../sql/execution/DataSourceScanExec.scala | 39 ++---- .../columnar/InMemoryTableScanExec.scala | 101 +++++++++------- .../datasources/v2/DataSourceRDD.scala | 20 ++-- .../datasources/v2/DataSourceV2ScanExec.scala | 72 ++++++----- .../ContinuousDataSourceRDDIter.scala | 4 +- .../sql/sources/v2/JavaBatchDataSourceV2.java | 112 ++++++++++++++++++ .../execution/WholeStageCodegenSuite.scala | 28 ++--- .../sql/sources/v2/DataSourceV2Suite.scala | 72 ++++++++++- 12 files changed, 400 insertions(+), 144 deletions(-) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanColumnarBatch.java create mode 100644 sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java index 95ee4a8278322..f23c3842bf1b1 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java @@ -38,7 +38,10 @@ * 2. Information Reporting. E.g., statistics reporting, ordering reporting, etc. * Names of these interfaces start with `SupportsReporting`. * 3. Special scans. E.g, columnar scan, unsafe row scan, etc. - * Names of these interfaces start with `SupportsScan`. + * Names of these interfaces start with `SupportsScan`. Note that a reader should only + * implement at most one of the special scans, if more than one special scans are implemented, + * only one of them would be respected, according to the priority list from high to low: + * {@link SupportsScanColumnarBatch}, {@link SupportsScanUnsafeRow}. * * If an exception was throw when applying any of these query optimizations, the action would fail * and no Spark job was submitted. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanColumnarBatch.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanColumnarBatch.java new file mode 100644 index 0000000000000..27cf3a77724f0 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanColumnarBatch.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.sources.v2.reader; + +import java.util.List; + +import org.apache.spark.annotation.InterfaceStability; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.vectorized.ColumnarBatch; + +/** + * A mix-in interface for {@link DataSourceV2Reader}. Data source readers can implement this + * interface to output {@link ColumnarBatch} and make the scan faster. + */ +@InterfaceStability.Evolving +public interface SupportsScanColumnarBatch extends DataSourceV2Reader { + @Override + default List> createReadTasks() { + throw new IllegalStateException( + "createReadTasks not supported by default within SupportsScanColumnarBatch."); + } + + /** + * Similar to {@link DataSourceV2Reader#createReadTasks()}, but returns columnar data in batches. + */ + List> createBatchReadTasks(); + + /** + * Returns true if the concrete data source reader can read data in batch according to the scan + * properties like required columns, pushes filters, etc. It's possible that the implementation + * can only support some certain columns with certain types. Users can overwrite this method and + * {@link #createReadTasks()} to fallback to normal read path under some conditions. + */ + default boolean enableBatchRead() { + return true; + } +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java index b90ec880dc85e..2d3ad0eee65ff 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java @@ -35,7 +35,7 @@ public interface SupportsScanUnsafeRow extends DataSourceV2Reader { @Override default List> createReadTasks() { throw new IllegalStateException( - "createReadTasks should not be called with SupportsScanUnsafeRow."); + "createReadTasks not supported by default within SupportsScanUnsafeRow"); } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala index 5617046e1396e..dd68df9686691 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution -import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.catalyst.expressions.{BoundReference, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.types.DataType @@ -25,13 +25,16 @@ import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} /** - * Helper trait for abstracting scan functionality using - * [[ColumnarBatch]]es. + * Helper trait for abstracting scan functionality using [[ColumnarBatch]]es. 
*/ private[sql] trait ColumnarBatchScan extends CodegenSupport { def vectorTypes: Option[Seq[String]] = None + protected def supportsBatch: Boolean = true + + protected def needsUnsafeRowConversion: Boolean = true + override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "scanTime" -> SQLMetrics.createTimingMetric(sparkContext, "scan time")) @@ -71,7 +74,14 @@ private[sql] trait ColumnarBatchScan extends CodegenSupport { // PhysicalRDD always just has one input val input = ctx.addMutableState("scala.collection.Iterator", "input", v => s"$v = inputs[0];") + if (supportsBatch) { + produceBatches(ctx, input) + } else { + produceRows(ctx, input) + } + } + private def produceBatches(ctx: CodegenContext, input: String): String = { // metrics val numOutputRows = metricTerm(ctx, "numOutputRows") val scanTimeMetric = metricTerm(ctx, "scanTime") @@ -137,4 +147,25 @@ private[sql] trait ColumnarBatchScan extends CodegenSupport { """.stripMargin } + private def produceRows(ctx: CodegenContext, input: String): String = { + val numOutputRows = metricTerm(ctx, "numOutputRows") + val row = ctx.freshName("row") + + ctx.INPUT_ROW = row + ctx.currentVars = null + // Always provide `outputVars`, so that the framework can help us build unsafe row if the input + // row is not unsafe row, i.e. `needsUnsafeRowConversion` is true. + val outputVars = output.zipWithIndex.map { case (a, i) => + BoundReference(i, a.dataType, a.nullable).genCode(ctx) + } + val inputRow = if (needsUnsafeRowConversion) null else row + s""" + |while ($input.hasNext()) { + | InternalRow $row = (InternalRow) $input.next(); + | $numOutputRows.add(1); + | ${consume(ctx, outputVars, inputRow).trim} + | if (shouldStop()) return; + |} + """.stripMargin + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index d1ff82c7c06bc..7c7d79c2bbd7c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -164,13 +164,15 @@ case class FileSourceScanExec( override val tableIdentifier: Option[TableIdentifier]) extends DataSourceScanExec with ColumnarBatchScan { - val supportsBatch: Boolean = relation.fileFormat.supportBatch( + override val supportsBatch: Boolean = relation.fileFormat.supportBatch( relation.sparkSession, StructType.fromAttributes(output)) - val needsUnsafeRowConversion: Boolean = if (relation.fileFormat.isInstanceOf[ParquetSource]) { - SparkSession.getActiveSession.get.sessionState.conf.parquetVectorizedReaderEnabled - } else { - false + override val needsUnsafeRowConversion: Boolean = { + if (relation.fileFormat.isInstanceOf[ParquetSource]) { + SparkSession.getActiveSession.get.sessionState.conf.parquetVectorizedReaderEnabled + } else { + false + } } override def vectorTypes: Option[Seq[String]] = @@ -346,33 +348,6 @@ case class FileSourceScanExec( override val nodeNamePrefix: String = "File" - override protected def doProduce(ctx: CodegenContext): String = { - if (supportsBatch) { - return super.doProduce(ctx) - } - val numOutputRows = metricTerm(ctx, "numOutputRows") - // PhysicalRDD always just has one input - val input = ctx.addMutableState("scala.collection.Iterator", "input", v => s"$v = inputs[0];") - val row = ctx.freshName("row") - - ctx.INPUT_ROW = row - ctx.currentVars = null - // Always provide `outputVars`, so that the 
framework can help us build unsafe row if the input - // row is not unsafe row, i.e. `needsUnsafeRowConversion` is true. - val outputVars = output.zipWithIndex.map{ case (a, i) => - BoundReference(i, a.dataType, a.nullable).genCode(ctx) - } - val inputRow = if (needsUnsafeRowConversion) null else row - s""" - |while ($input.hasNext()) { - | InternalRow $row = (InternalRow) $input.next(); - | $numOutputRows.add(1); - | ${consume(ctx, outputVars, inputRow).trim} - | if (shouldStop()) return; - |} - """.stripMargin - } - /** * Create an RDD for bucketed reads. * The non-bucketed variant of this function is [[createNonBucketedReadRDD]]. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index 933b9753faa61..3565ee3af1b9f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -49,9 +49,9 @@ case class InMemoryTableScanExec( /** * If true, get data from ColumnVector in ColumnarBatch, which are generally faster. - * If false, get data from UnsafeRow build from ColumnVector + * If false, get data from UnsafeRow build from CachedBatch */ - override val supportCodegen: Boolean = { + override val supportsBatch: Boolean = { // In the initial implementation, for ease of review // support only primitive data types and # of fields is less than wholeStageMaxNumFields relation.schema.fields.forall(f => f.dataType match { @@ -61,6 +61,8 @@ case class InMemoryTableScanExec( }) && !WholeStageCodegenExec.isTooManyFields(conf, relation.schema) } + override protected def needsUnsafeRowConversion: Boolean = false + private val columnIndices = attributes.map(a => relation.output.map(o => o.exprId).indexOf(a.exprId)).toArray @@ -90,14 +92,56 @@ case class InMemoryTableScanExec( columnarBatch } - override def inputRDDs(): Seq[RDD[InternalRow]] = { - assert(supportCodegen) + private lazy val inputRDD: RDD[InternalRow] = { val buffers = filteredCachedBatches() - // HACK ALERT: This is actually an RDD[ColumnarBatch]. - // We're taking advantage of Scala's type erasure here to pass these batches along. - Seq(buffers.map(createAndDecompressColumn(_)).asInstanceOf[RDD[InternalRow]]) + if (supportsBatch) { + // HACK ALERT: This is actually an RDD[ColumnarBatch]. + // We're taking advantage of Scala's type erasure here to pass these batches along. + buffers.map(createAndDecompressColumn).asInstanceOf[RDD[InternalRow]] + } else { + val numOutputRows = longMetric("numOutputRows") + + if (enableAccumulatorsForTest) { + readPartitions.setValue(0) + readBatches.setValue(0) + } + + // Using these variables here to avoid serialization of entire objects (if referenced + // directly) within the map Partitions closure. + val relOutput: AttributeSeq = relation.output + + filteredCachedBatches().mapPartitionsInternal { cachedBatchIterator => + // Find the ordinals and data types of the requested columns. 
+ val (requestedColumnIndices, requestedColumnDataTypes) = + attributes.map { a => + relOutput.indexOf(a.exprId) -> a.dataType + }.unzip + + // update SQL metrics + val withMetrics = cachedBatchIterator.map { batch => + if (enableAccumulatorsForTest) { + readBatches.add(1) + } + numOutputRows += batch.numRows + batch + } + + val columnTypes = requestedColumnDataTypes.map { + case udt: UserDefinedType[_] => udt.sqlType + case other => other + }.toArray + val columnarIterator = GenerateColumnAccessor.generate(columnTypes) + columnarIterator.initialize(withMetrics, columnTypes, requestedColumnIndices.toArray) + if (enableAccumulatorsForTest && columnarIterator.hasNext) { + readPartitions.add(1) + } + columnarIterator + } + } } + override def inputRDDs(): Seq[RDD[InternalRow]] = Seq(inputRDD) + override def output: Seq[Attribute] = attributes private def updateAttribute(expr: Expression): Expression = { @@ -185,7 +229,7 @@ case class InMemoryTableScanExec( } } - lazy val enableAccumulators: Boolean = + lazy val enableAccumulatorsForTest: Boolean = sqlContext.getConf("spark.sql.inMemoryTableScanStatistics.enable", "false").toBoolean // Accumulators used for testing purposes @@ -230,43 +274,10 @@ case class InMemoryTableScanExec( } protected override def doExecute(): RDD[InternalRow] = { - val numOutputRows = longMetric("numOutputRows") - - if (enableAccumulators) { - readPartitions.setValue(0) - readBatches.setValue(0) - } - - // Using these variables here to avoid serialization of entire objects (if referenced directly) - // within the map Partitions closure. - val relOutput: AttributeSeq = relation.output - - filteredCachedBatches().mapPartitionsInternal { cachedBatchIterator => - // Find the ordinals and data types of the requested columns. - val (requestedColumnIndices, requestedColumnDataTypes) = - attributes.map { a => - relOutput.indexOf(a.exprId) -> a.dataType - }.unzip - - // update SQL metrics - val withMetrics = cachedBatchIterator.map { batch => - if (enableAccumulators) { - readBatches.add(1) - } - numOutputRows += batch.numRows - batch - } - - val columnTypes = requestedColumnDataTypes.map { - case udt: UserDefinedType[_] => udt.sqlType - case other => other - }.toArray - val columnarIterator = GenerateColumnAccessor.generate(columnTypes) - columnarIterator.initialize(withMetrics, columnTypes, requestedColumnIndices.toArray) - if (enableAccumulators && columnarIterator.hasNext) { - readPartitions.add(1) - } - columnarIterator + if (supportsBatch) { + WholeStageCodegenExec(this).execute() + } else { + inputRDD } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala index 5f30be5ed4af1..ac104d7cd0cb3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala @@ -18,19 +18,19 @@ package org.apache.spark.sql.execution.datasources.v2 import scala.collection.JavaConverters._ +import scala.reflect.ClassTag import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.sources.v2.reader.ReadTask -class DataSourceRDDPartition(val index: Int, val readTask: ReadTask[UnsafeRow]) +class DataSourceRDDPartition[T : ClassTag](val index: Int, val readTask: ReadTask[T]) extends 
Partition with Serializable -class DataSourceRDD( +class DataSourceRDD[T: ClassTag]( sc: SparkContext, - @transient private val readTasks: java.util.List[ReadTask[UnsafeRow]]) - extends RDD[UnsafeRow](sc, Nil) { + @transient private val readTasks: java.util.List[ReadTask[T]]) + extends RDD[T](sc, Nil) { override protected def getPartitions: Array[Partition] = { readTasks.asScala.zipWithIndex.map { @@ -38,10 +38,10 @@ class DataSourceRDD( }.toArray } - override def compute(split: Partition, context: TaskContext): Iterator[UnsafeRow] = { - val reader = split.asInstanceOf[DataSourceRDDPartition].readTask.createDataReader() + override def compute(split: Partition, context: TaskContext): Iterator[T] = { + val reader = split.asInstanceOf[DataSourceRDDPartition[T]].readTask.createDataReader() context.addTaskCompletionListener(_ => reader.close()) - val iter = new Iterator[UnsafeRow] { + val iter = new Iterator[T] { private[this] var valuePrepared = false override def hasNext: Boolean = { @@ -51,7 +51,7 @@ class DataSourceRDD( valuePrepared } - override def next(): UnsafeRow = { + override def next(): T = { if (!hasNext) { throw new java.util.NoSuchElementException("End of stream") } @@ -63,6 +63,6 @@ class DataSourceRDD( } override def getPreferredLocations(split: Partition): Seq[String] = { - split.asInstanceOf[DataSourceRDDPartition].readTask.preferredLocations() + split.asInstanceOf[DataSourceRDDPartition[T]].readTask.preferredLocations() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala index 49c506bc560cf..8c64df080242f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala @@ -24,10 +24,8 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.execution.LeafExecNode -import org.apache.spark.sql.execution.metric.SQLMetrics -import org.apache.spark.sql.execution.streaming.StreamExecution -import org.apache.spark.sql.execution.streaming.continuous.{ContinuousDataSourceRDD, ContinuousExecution, EpochCoordinatorRef, SetReaderPartitions} +import org.apache.spark.sql.execution.{ColumnarBatchScan, LeafExecNode, WholeStageCodegenExec} +import org.apache.spark.sql.execution.streaming.continuous._ import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.sources.v2.streaming.reader.ContinuousReader import org.apache.spark.sql.types.StructType @@ -37,40 +35,56 @@ import org.apache.spark.sql.types.StructType */ case class DataSourceV2ScanExec( fullOutput: Seq[AttributeReference], - @transient reader: DataSourceV2Reader) extends LeafExecNode with DataSourceReaderHolder { + @transient reader: DataSourceV2Reader) + extends LeafExecNode with DataSourceReaderHolder with ColumnarBatchScan { override def canEqual(other: Any): Boolean = other.isInstanceOf[DataSourceV2ScanExec] - override def references: AttributeSet = AttributeSet.empty + override def producedAttributes: AttributeSet = AttributeSet(fullOutput) - override lazy val metrics = Map( - "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) + private lazy val readTasks: java.util.List[ReadTask[UnsafeRow]] = 
reader match { + case r: SupportsScanUnsafeRow => r.createUnsafeRowReadTasks() + case _ => + reader.createReadTasks().asScala.map { + new RowToUnsafeRowReadTask(_, reader.readSchema()): ReadTask[UnsafeRow] + }.asJava + } - override protected def doExecute(): RDD[InternalRow] = { - val readTasks: java.util.List[ReadTask[UnsafeRow]] = reader match { - case r: SupportsScanUnsafeRow => r.createUnsafeRowReadTasks() - case _ => - reader.createReadTasks().asScala.map { - new RowToUnsafeRowReadTask(_, reader.readSchema()): ReadTask[UnsafeRow] - }.asJava - } + private lazy val inputRDD: RDD[InternalRow] = reader match { + case r: SupportsScanColumnarBatch if r.enableBatchRead() => + assert(!reader.isInstanceOf[ContinuousReader], + "continuous stream reader does not support columnar read yet.") + new DataSourceRDD(sparkContext, r.createBatchReadTasks()).asInstanceOf[RDD[InternalRow]] + + case _: ContinuousReader => + EpochCoordinatorRef.get( + sparkContext.getLocalProperty(ContinuousExecution.RUN_ID_KEY), sparkContext.env) + .askSync[Unit](SetReaderPartitions(readTasks.size())) + new ContinuousDataSourceRDD(sparkContext, sqlContext, readTasks) + .asInstanceOf[RDD[InternalRow]] + + case _ => + new DataSourceRDD(sparkContext, readTasks).asInstanceOf[RDD[InternalRow]] + } - val inputRDD = reader match { - case _: ContinuousReader => - EpochCoordinatorRef.get( - sparkContext.getLocalProperty(ContinuousExecution.RUN_ID_KEY), sparkContext.env) - .askSync[Unit](SetReaderPartitions(readTasks.size())) + override def inputRDDs(): Seq[RDD[InternalRow]] = Seq(inputRDD) - new ContinuousDataSourceRDD(sparkContext, sqlContext, readTasks) + override val supportsBatch: Boolean = reader match { + case r: SupportsScanColumnarBatch if r.enableBatchRead() => true + case _ => false + } - case _ => - new DataSourceRDD(sparkContext, readTasks) - } + override protected def needsUnsafeRowConversion: Boolean = false - val numOutputRows = longMetric("numOutputRows") - inputRDD.asInstanceOf[RDD[InternalRow]].map { r => - numOutputRows += 1 - r + override protected def doExecute(): RDD[InternalRow] = { + if (supportsBatch) { + WholeStageCodegenExec(this).execute() + } else { + val numOutputRows = longMetric("numOutputRows") + inputRDD.map { r => + numOutputRows += 1 + r + } } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala index d79e4bd65f563..b3f1a1a1aaab3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala @@ -52,7 +52,7 @@ class ContinuousDataSourceRDD( } override def compute(split: Partition, context: TaskContext): Iterator[UnsafeRow] = { - val reader = split.asInstanceOf[DataSourceRDDPartition].readTask.createDataReader() + val reader = split.asInstanceOf[DataSourceRDDPartition[UnsafeRow]].readTask.createDataReader() val runId = context.getLocalProperty(ContinuousExecution.RUN_ID_KEY) @@ -132,7 +132,7 @@ class ContinuousDataSourceRDD( } override def getPreferredLocations(split: Partition): Seq[String] = { - split.asInstanceOf[DataSourceRDDPartition].readTask.preferredLocations() + split.asInstanceOf[DataSourceRDDPartition[UnsafeRow]].readTask.preferredLocations() } } diff --git 
a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java new file mode 100644 index 0000000000000..44e5146d7c553 --- /dev/null +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package test.org.apache.spark.sql.sources.v2; + +import java.io.IOException; +import java.util.List; + +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector; +import org.apache.spark.sql.sources.v2.DataSourceV2; +import org.apache.spark.sql.sources.v2.DataSourceV2Options; +import org.apache.spark.sql.sources.v2.ReadSupport; +import org.apache.spark.sql.sources.v2.reader.*; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.vectorized.ColumnVector; +import org.apache.spark.sql.vectorized.ColumnarBatch; + + +public class JavaBatchDataSourceV2 implements DataSourceV2, ReadSupport { + + class Reader implements DataSourceV2Reader, SupportsScanColumnarBatch { + private final StructType schema = new StructType().add("i", "int").add("j", "int"); + + @Override + public StructType readSchema() { + return schema; + } + + @Override + public List> createBatchReadTasks() { + return java.util.Arrays.asList(new JavaBatchReadTask(0, 50), new JavaBatchReadTask(50, 90)); + } + } + + static class JavaBatchReadTask implements ReadTask, DataReader { + private int start; + private int end; + + private static final int BATCH_SIZE = 20; + + private OnHeapColumnVector i; + private OnHeapColumnVector j; + private ColumnarBatch batch; + + JavaBatchReadTask(int start, int end) { + this.start = start; + this.end = end; + } + + @Override + public DataReader createDataReader() { + this.i = new OnHeapColumnVector(BATCH_SIZE, DataTypes.IntegerType); + this.j = new OnHeapColumnVector(BATCH_SIZE, DataTypes.IntegerType); + ColumnVector[] vectors = new ColumnVector[2]; + vectors[0] = i; + vectors[1] = j; + this.batch = new ColumnarBatch(new StructType().add("i", "int").add("j", "int"), vectors, BATCH_SIZE); + return this; + } + + @Override + public boolean next() { + i.reset(); + j.reset(); + int count = 0; + while (start < end && count < BATCH_SIZE) { + i.putInt(count, start); + j.putInt(count, -start); + start += 1; + count += 1; + } + + if (count == 0) { + return false; + } else { + batch.setNumRows(count); + return true; + } + } + + @Override + public ColumnarBatch get() { + return batch; + } + + @Override + public void close() throws IOException { + batch.close(); + } + } + + + @Override + public DataSourceV2Reader createReader(DataSourceV2Options options) { + return new Reader(); + } 
+} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index bc05dca578c47..22ca128c27768 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -121,31 +121,23 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext { test("cache for primitive type should be in WholeStageCodegen with InMemoryTableScanExec") { import testImplicits._ - val dsInt = spark.range(3).cache - dsInt.count + val dsInt = spark.range(3).cache() + dsInt.count() val dsIntFilter = dsInt.filter(_ > 0) val planInt = dsIntFilter.queryExecution.executedPlan - assert(planInt.find(p => - p.isInstanceOf[WholeStageCodegenExec] && - p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec] && - p.asInstanceOf[WholeStageCodegenExec].child.asInstanceOf[FilterExec].child - .isInstanceOf[InMemoryTableScanExec] && - p.asInstanceOf[WholeStageCodegenExec].child.asInstanceOf[FilterExec].child - .asInstanceOf[InMemoryTableScanExec].supportCodegen).isDefined - ) + assert(planInt.collect { + case WholeStageCodegenExec(FilterExec(_, i: InMemoryTableScanExec)) if i.supportsBatch => () + }.length == 1) assert(dsIntFilter.collect() === Array(1, 2)) // cache for string type is not supported for InMemoryTableScanExec - val dsString = spark.range(3).map(_.toString).cache - dsString.count + val dsString = spark.range(3).map(_.toString).cache() + dsString.count() val dsStringFilter = dsString.filter(_ == "1") val planString = dsStringFilter.queryExecution.executedPlan - assert(planString.find(p => - p.isInstanceOf[WholeStageCodegenExec] && - p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec] && - !p.asInstanceOf[WholeStageCodegenExec].child.asInstanceOf[FilterExec].child - .isInstanceOf[InMemoryTableScanExec]).isDefined - ) + assert(planString.collect { + case WholeStageCodegenExec(FilterExec(_, i: InMemoryTableScanExec)) if !i.supportsBatch => () + }.length == 1) assert(dsStringFilter.collect() === Array("1")) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index ab37e4984bd1f..a89f7c55bf4f7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala @@ -24,10 +24,12 @@ import test.org.apache.spark.sql.sources.v2._ import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector import org.apache.spark.sql.sources.{Filter, GreaterThan} import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.test.SharedSQLContext -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{IntegerType, StructType} +import org.apache.spark.sql.vectorized.ColumnarBatch class DataSourceV2Suite extends QueryTest with SharedSQLContext { import testImplicits._ @@ -56,7 +58,7 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { } } - test("unsafe row implementation") { + test("unsafe row scan implementation") { Seq(classOf[UnsafeRowDataSourceV2], classOf[JavaUnsafeRowDataSourceV2]).foreach { cls => 
withClue(cls.getName) { val df = spark.read.format(cls.getName).load() @@ -67,6 +69,17 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { } } + test("columnar batch scan implementation") { + Seq(classOf[BatchDataSourceV2], classOf[JavaBatchDataSourceV2]).foreach { cls => + withClue(cls.getName) { + val df = spark.read.format(cls.getName).load() + checkAnswer(df, (0 until 90).map(i => Row(i, -i))) + checkAnswer(df.select('j), (0 until 90).map(i => Row(-i))) + checkAnswer(df.filter('i > 50), (51 until 90).map(i => Row(i, -i))) + } + } + } + test("schema required data source") { Seq(classOf[SchemaRequiredDataSource], classOf[JavaSchemaRequiredDataSource]).foreach { cls => withClue(cls.getName) { @@ -275,7 +288,7 @@ class UnsafeRowReadTask(start: Int, end: Int) private var current = start - 1 - override def createDataReader(): DataReader[UnsafeRow] = new UnsafeRowReadTask(start, end) + override def createDataReader(): DataReader[UnsafeRow] = this override def next(): Boolean = { current += 1 @@ -300,3 +313,56 @@ class SchemaRequiredDataSource extends DataSourceV2 with ReadSupportWithSchema { override def createReader(schema: StructType, options: DataSourceV2Options): DataSourceV2Reader = new Reader(schema) } + +class BatchDataSourceV2 extends DataSourceV2 with ReadSupport { + + class Reader extends DataSourceV2Reader with SupportsScanColumnarBatch { + override def readSchema(): StructType = new StructType().add("i", "int").add("j", "int") + + override def createBatchReadTasks(): JList[ReadTask[ColumnarBatch]] = { + java.util.Arrays.asList(new BatchReadTask(0, 50), new BatchReadTask(50, 90)) + } + } + + override def createReader(options: DataSourceV2Options): DataSourceV2Reader = new Reader +} + +class BatchReadTask(start: Int, end: Int) + extends ReadTask[ColumnarBatch] with DataReader[ColumnarBatch] { + + private final val BATCH_SIZE = 20 + private lazy val i = new OnHeapColumnVector(BATCH_SIZE, IntegerType) + private lazy val j = new OnHeapColumnVector(BATCH_SIZE, IntegerType) + private lazy val batch = new ColumnarBatch( + new StructType().add("i", "int").add("j", "int"), Array(i, j), BATCH_SIZE) + + private var current = start + + override def createDataReader(): DataReader[ColumnarBatch] = this + + override def next(): Boolean = { + i.reset() + j.reset() + + var count = 0 + while (current < end && count < BATCH_SIZE) { + i.putInt(count, current) + j.putInt(count, -current) + current += 1 + count += 1 + } + + if (count == 0) { + false + } else { + batch.setNumRows(count) + true + } + } + + override def get(): ColumnarBatch = { + batch + } + + override def close(): Unit = batch.close() +} From 833a584bb675bf7c643bde62eb51963814c12384 Mon Sep 17 00:00:00 2001 From: Bago Amirbekian Date: Tue, 16 Jan 2018 12:56:57 -0800 Subject: [PATCH 103/566] [SPARK-23045][ML][SPARKR] Update RFormula to use OneHotEncoderEstimator. ## What changes were proposed in this pull request? RFormula should use VectorSizeHint & OneHotEncoderEstimator in its pipeline to avoid using the deprecated OneHotEncoder & to ensure the model produced can be used in streaming. ## How was this patch tested? Unit tests. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Bago Amirbekian Closes #20229 from MrBago/rFormula. (cherry picked from commit 4371466b3f06ca171b10568e776c9446f7bae6dd) Signed-off-by: Joseph K. 
Bradley --- R/pkg/R/mllib_utils.R | 1 - .../apache/spark/ml/feature/RFormula.scala | 20 +++++-- .../spark/ml/feature/RFormulaSuite.scala | 53 +++++++++++-------- 3 files changed, 46 insertions(+), 28 deletions(-) diff --git a/R/pkg/R/mllib_utils.R b/R/pkg/R/mllib_utils.R index 23dda42c325be..a53c92c2c4815 100644 --- a/R/pkg/R/mllib_utils.R +++ b/R/pkg/R/mllib_utils.R @@ -130,4 +130,3 @@ read.ml <- function(path) { stop("Unsupported model: ", jobj) } } - diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index f384ffbf578bc..1155ea5fdd85b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -199,6 +199,7 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) val parsedFormula = RFormulaParser.parse($(formula)) val resolvedFormula = parsedFormula.resolve(dataset.schema) val encoderStages = ArrayBuffer[PipelineStage]() + val oneHotEncodeColumns = ArrayBuffer[(String, String)]() val prefixesToRewrite = mutable.Map[String, String]() val tempColumns = ArrayBuffer[String]() @@ -242,16 +243,17 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) val encodedTerms = resolvedFormula.terms.map { case Seq(term) if dataset.schema(term).dataType == StringType => val encodedCol = tmpColumn("onehot") - var encoder = new OneHotEncoder() - .setInputCol(indexed(term)) - .setOutputCol(encodedCol) // Formula w/o intercept, one of the categories in the first category feature is // being used as reference category, we will not drop any category for that feature. if (!hasIntercept && !keepReferenceCategory) { - encoder = encoder.setDropLast(false) + encoderStages += new OneHotEncoderEstimator(uid) + .setInputCols(Array(indexed(term))) + .setOutputCols(Array(encodedCol)) + .setDropLast(false) keepReferenceCategory = true + } else { + oneHotEncodeColumns += indexed(term) -> encodedCol } - encoderStages += encoder prefixesToRewrite(encodedCol + "_") = term + "_" encodedCol case Seq(term) => @@ -265,6 +267,14 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) interactionCol } + if (oneHotEncodeColumns.nonEmpty) { + val (inputCols, outputCols) = oneHotEncodeColumns.toArray.unzip + encoderStages += new OneHotEncoderEstimator(uid) + .setInputCols(inputCols) + .setOutputCols(outputCols) + .setDropLast(true) + } + encoderStages += new VectorAssembler(uid) .setInputCols(encodedTerms.toArray) .setOutputCol($(featuresCol)) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala index f3f4b5a3d0233..bfe38d32dd77d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala @@ -29,6 +29,17 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ + def testRFormulaTransform[A: Encoder]( + dataframe: DataFrame, + formulaModel: RFormulaModel, + expected: DataFrame): Unit = { + val (first +: rest) = expected.schema.fieldNames.toSeq + val expectedRows = expected.collect() + testTransformerByGlobalCheckFunc[A](dataframe, formulaModel, first, rest: _*) { rows => + assert(rows === expectedRows) + } + } + test("params") { ParamsSuite.checkParams(new RFormula()) } @@ -47,7 +58,7 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { // TODO(ekl) make 
schema comparisons ignore metadata, to avoid .toString assert(result.schema.toString == resultSchema.toString) assert(resultSchema == expected.schema) - assert(result.collect() === expected.collect()) + testRFormulaTransform[(Int, Double, Double)](original, model, expected) } test("features column already exists") { @@ -109,7 +120,7 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { (7, 8.0, 9.0, Vectors.dense(8.0, 9.0)) ).toDF("id", "a", "b", "features") assert(result.schema.toString == resultSchema.toString) - assert(result.collect() === expected.collect()) + testRFormulaTransform[(Int, Double, Double)](original, model, expected) } test("encodes string terms") { @@ -126,7 +137,7 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { (4, "baz", 5, Vectors.dense(0.0, 0.0, 5.0), 4.0) ).toDF("id", "a", "b", "features", "label") assert(result.schema.toString == resultSchema.toString) - assert(result.collect() === expected.collect()) + testRFormulaTransform[(Int, String, Int)](original, model, expected) } test("encodes string terms with string indexer order type") { @@ -167,7 +178,7 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { val result = model.transform(original) val resultSchema = model.transformSchema(original.schema) assert(result.schema.toString == resultSchema.toString) - assert(result.collect() === expected(idx).collect()) + testRFormulaTransform[(Int, String, Int)](original, model, expected(idx)) idx += 1 } } @@ -210,7 +221,7 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { val result = model.transform(original) val resultSchema = model.transformSchema(original.schema) assert(result.schema.toString == resultSchema.toString) - assert(result.collect() === expected.collect()) + testRFormulaTransform[(Int, String, Int)](original, model, expected) } test("formula w/o intercept, we should output reference category when encoding string terms") { @@ -253,7 +264,7 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { (4, "baz", "zz", 5, Vectors.dense(0.0, 1.0, 0.0, 1.0, 5.0), 4.0) ).toDF("id", "a", "b", "c", "features", "label") assert(result1.schema.toString == resultSchema1.toString) - assert(result1.collect() === expected1.collect()) + testRFormulaTransform[(Int, String, String, Int)](original, model1, expected1) val attrs1 = AttributeGroup.fromStructField(result1.schema("features")) val expectedAttrs1 = new AttributeGroup( @@ -280,7 +291,7 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { (4, "baz", "zz", 5, Vectors.sparse(7, Array(2, 6), Array(1.0, 5.0)), 4.0) ).toDF("id", "a", "b", "c", "features", "label") assert(result2.schema.toString == resultSchema2.toString) - assert(result2.collect() === expected2.collect()) + testRFormulaTransform[(Int, String, String, Int)](original, model2, expected2) val attrs2 = AttributeGroup.fromStructField(result2.schema("features")) val expectedAttrs2 = new AttributeGroup( @@ -302,7 +313,6 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { Seq(("male", "foo", 4), ("female", "bar", 4), ("female", "bar", 5), ("male", "baz", 5)) .toDF("id", "a", "b") val model = formula.fit(original) - val result = model.transform(original) val expected = Seq( ("male", "foo", 4, Vectors.dense(0.0, 1.0, 4.0), 1.0), ("female", "bar", 4, Vectors.dense(1.0, 0.0, 4.0), 0.0), @@ -310,7 +320,7 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { ("male", "baz", 5, Vectors.dense(0.0, 0.0, 5.0), 1.0) ).toDF("id", "a", "b", "features", "label") // 
assert(result.schema.toString == resultSchema.toString) - assert(result.collect() === expected.collect()) + testRFormulaTransform[(String, String, Int)](original, model, expected) } test("force to index label even it is numeric type") { @@ -319,7 +329,6 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { Seq((1.0, "foo", 4), (1.0, "bar", 4), (0.0, "bar", 5), (1.0, "baz", 5)) ).toDF("id", "a", "b") val model = formula.fit(original) - val result = model.transform(original) val expected = spark.createDataFrame( Seq( (1.0, "foo", 4, Vectors.dense(0.0, 1.0, 4.0), 0.0), @@ -327,7 +336,7 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { (0.0, "bar", 5, Vectors.dense(1.0, 0.0, 5.0), 1.0), (1.0, "baz", 5, Vectors.dense(0.0, 0.0, 5.0), 0.0)) ).toDF("id", "a", "b", "features", "label") - assert(result.collect() === expected.collect()) + testRFormulaTransform[(Double, String, Int)](original, model, expected) } test("attribute generation") { @@ -391,7 +400,7 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { (1, 2, 4, 2, Vectors.dense(16.0), 1.0), (2, 3, 4, 1, Vectors.dense(12.0), 2.0) ).toDF("a", "b", "c", "d", "features", "label") - assert(result.collect() === expected.collect()) + testRFormulaTransform[(Int, Int, Int, Int)](original, model, expected) val attrs = AttributeGroup.fromStructField(result.schema("features")) val expectedAttrs = new AttributeGroup( "features", @@ -414,7 +423,7 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { (4, "baz", 5, Vectors.dense(5.0, 0.0, 0.0), 4.0), (4, "baz", 5, Vectors.dense(5.0, 0.0, 0.0), 4.0) ).toDF("id", "a", "b", "features", "label") - assert(result.collect() === expected.collect()) + testRFormulaTransform[(Int, String, Int)](original, model, expected) val attrs = AttributeGroup.fromStructField(result.schema("features")) val expectedAttrs = new AttributeGroup( "features", @@ -436,7 +445,7 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { (2, "bar", "zq", Vectors.dense(1.0, 0.0, 0.0, 0.0), 2.0), (3, "bar", "zz", Vectors.dense(0.0, 1.0, 0.0, 0.0), 3.0) ).toDF("id", "a", "b", "features", "label") - assert(result.collect() === expected.collect()) + testRFormulaTransform[(Int, String, String)](original, model, expected) val attrs = AttributeGroup.fromStructField(result.schema("features")) val expectedAttrs = new AttributeGroup( "features", @@ -511,8 +520,8 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { intercept[SparkException] { formula1.fit(df1).transform(df2).collect() } - val result1 = formula1.setHandleInvalid("skip").fit(df1).transform(df2) - val result2 = formula1.setHandleInvalid("keep").fit(df1).transform(df2) + val model1 = formula1.setHandleInvalid("skip").fit(df1) + val model2 = formula1.setHandleInvalid("keep").fit(df1) val expected1 = Seq( (1, "foo", "zq", Vectors.dense(0.0, 1.0), 1.0), @@ -524,16 +533,16 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { (3, "bar", "zy", Vectors.dense(1.0, 0.0, 0.0, 0.0), 3.0) ).toDF("id", "a", "b", "features", "label") - assert(result1.collect() === expected1.collect()) - assert(result2.collect() === expected2.collect()) + testRFormulaTransform[(Int, String, String)](df2, model1, expected1) + testRFormulaTransform[(Int, String, String)](df2, model2, expected2) // Handle unseen labels. 
val formula2 = new RFormula().setFormula("b ~ a + id") intercept[SparkException] { formula2.fit(df1).transform(df2).collect() } - val result3 = formula2.setHandleInvalid("skip").fit(df1).transform(df2) - val result4 = formula2.setHandleInvalid("keep").fit(df1).transform(df2) + val model3 = formula2.setHandleInvalid("skip").fit(df1) + val model4 = formula2.setHandleInvalid("keep").fit(df1) val expected3 = Seq( (1, "foo", "zq", Vectors.dense(0.0, 1.0), 0.0), @@ -545,8 +554,8 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { (3, "bar", "zy", Vectors.dense(1.0, 0.0, 3.0), 2.0) ).toDF("id", "a", "b", "features", "label") - assert(result3.collect() === expected3.collect()) - assert(result4.collect() === expected4.collect()) + testRFormulaTransform[(Int, String, String)](df2, model3, expected3) + testRFormulaTransform[(Int, String, String)](df2, model4, expected4) } test("Use Vectors as inputs to formula.") { From 41d1a323c0e876352060d81f8c281a1565619ca1 Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Wed, 17 Jan 2018 09:57:30 +0800 Subject: [PATCH 104/566] [SPARK-23095][SQL] Decorrelation of scalar subquery fails with java.util.NoSuchElementException ## What changes were proposed in this pull request? The following SQL, which involves a correlated scalar subquery, fails with a java.util.NoSuchElementException (a failed map lookup). ``` SQL SELECT t1a FROM t1 WHERE t1a = (SELECT count(*) FROM t2 WHERE t2c = t1c HAVING count(*) >= 1) ``` ``` SQL key not found: ExprId(278,786682bb-41f9-4bd5-a397-928272cc8e4e) java.util.NoSuchElementException: key not found: ExprId(278,786682bb-41f9-4bd5-a397-928272cc8e4e) at scala.collection.MapLike$class.default(MapLike.scala:228) at scala.collection.AbstractMap.default(Map.scala:59) at scala.collection.MapLike$class.apply(MapLike.scala:141) at scala.collection.AbstractMap.apply(Map.scala:59) at org.apache.spark.sql.catalyst.optimizer.RewriteCorrelatedScalarSubquery$.org$apache$spark$sql$catalyst$optimizer$RewriteCorrelatedScalarSubquery$$evalSubqueryOnZeroTups(subquery.scala:378) at org.apache.spark.sql.catalyst.optimizer.RewriteCorrelatedScalarSubquery$$anonfun$org$apache$spark$sql$catalyst$optimizer$RewriteCorrelatedScalarSubquery$$constructLeftJoins$1.apply(subquery.scala:430) at org.apache.spark.sql.catalyst.optimizer.RewriteCorrelatedScalarSubquery$$anonfun$org$apache$spark$sql$catalyst$optimizer$RewriteCorrelatedScalarSubquery$$constructLeftJoins$1.apply(subquery.scala:426) ``` In this case, after evaluating the HAVING clause "count(*) >= 1" statically against the binding of the aggregation result on empty input, we determine that this query will not have the count bug. We should simply have evalSubqueryOnZeroTups return an empty value in that case. ## How was this patch tested? A new test was added in the Subquery bucket. Author: Dilip Biswal Closes #20283 from dilipbiswal/scalar-count-defect.
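To make the fix concrete, here is a minimal, self-contained Scala sketch of the lookup pattern it adopts. The names (ZeroTupleLookupSketch, subqueryValueOnZeroTups) are hypothetical and the types are heavily simplified; this is not the actual Catalyst code, only an illustration of why the lookup must tolerate a missing key.

```scala
// Simplified sketch: when a HAVING predicate statically filters out the aggregate produced on
// empty input, the zero-tuple evaluation map may hold no binding for the subquery's output
// expression. The lookup therefore has to fall back to None instead of throwing.
object ZeroTupleLookupSketch {
  type ExprId = Int  // stand-in for Catalyst's ExprId

  def subqueryValueOnZeroTups(resultMap: Map[ExprId, Option[Any]], outputId: ExprId): Option[Any] = {
    // Before the fix this was effectively resultMap(outputId), which throws
    // java.util.NoSuchElementException when the key is absent.
    resultMap.getOrElse(outputId, None)
  }

  def main(args: Array[String]): Unit = {
    val bindings = Map.empty[ExprId, Option[Any]]               // HAVING eliminated the only binding
    println(subqueryValueOnZeroTups(bindings, outputId = 278))  // prints None instead of failing
  }
}
```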
(cherry picked from commit 0c2ba427bc7323729e6ffb34f1f06a97f0bf0c1d) Signed-off-by: gatorsmile --- .../sql/catalyst/optimizer/subquery.scala | 5 +- .../scalar-subquery-predicate.sql | 10 ++++ .../scalar-subquery-predicate.sql.out | 57 ++++++++++++------- 3 files changed, 49 insertions(+), 23 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala index 2673bea648d09..709db6d8bec7d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala @@ -369,13 +369,14 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] { case ne => (ne.exprId, evalAggOnZeroTups(ne)) }.toMap - case _ => sys.error(s"Unexpected operator in scalar subquery: $lp") + case _ => + sys.error(s"Unexpected operator in scalar subquery: $lp") } val resultMap = evalPlan(plan) // By convention, the scalar subquery result is the leftmost field. - resultMap(plan.output.head.exprId) + resultMap.getOrElse(plan.output.head.exprId, None) } /** diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql index fb0d07fbdace7..1661209093fc4 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql @@ -173,6 +173,16 @@ WHERE t1a = (SELECT max(t2a) HAVING count(*) >= 0) OR t1i > '2014-12-31'; +-- TC 02.03.01 +SELECT t1a +FROM t1 +WHERE t1a = (SELECT max(t2a) + FROM t2 + WHERE t2c = t1c + GROUP BY t2c + HAVING count(*) >= 1) +OR t1i > '2014-12-31'; + -- TC 02.04 -- t1 on the right of an outer join -- can be reduced to inner join diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out index 8b29300e71f90..a2b86db3e4f4c 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 26 +-- Number of queries: 29 -- !query 0 @@ -293,6 +293,21 @@ val1d -- !query 19 +SELECT t1a +FROM t1 +WHERE t1a = (SELECT max(t2a) + FROM t2 + WHERE t2c = t1c + GROUP BY t2c + HAVING count(*) >= 1) +OR t1i > '2014-12-31' +-- !query 19 schema +struct +-- !query 19 output +val1c +val1d + +-- !query 22 SELECT count(t1a) FROM t1 RIGHT JOIN t2 ON t1d = t2d @@ -300,13 +315,13 @@ WHERE t1a < (SELECT max(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 19 schema +-- !query 22 schema struct --- !query 19 output +-- !query 22 output 7 --- !query 20 +-- !query 23 SELECT t1a FROM t1 WHERE t1b <= (SELECT max(t2b) @@ -317,14 +332,14 @@ AND t1b >= (SELECT min(t2b) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 20 schema +-- !query 23 schema struct --- !query 20 output +-- !query 23 output val1b val1c --- !query 21 +-- !query 24 SELECT t1a FROM t1 WHERE t1a <= (SELECT max(t2a) @@ -338,14 +353,14 @@ WHERE t1a >= (SELECT min(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 21 
schema +-- !query 24 schema struct --- !query 21 output +-- !query 24 output val1b val1c --- !query 22 +-- !query 25 SELECT t1a FROM t1 WHERE t1a <= (SELECT max(t2a) @@ -359,9 +374,9 @@ WHERE t1a >= (SELECT min(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 22 schema +-- !query 25 schema struct --- !query 22 output +-- !query 25 output val1a val1a val1b @@ -372,7 +387,7 @@ val1d val1d --- !query 23 +-- !query 26 SELECT t1a FROM t1 WHERE t1a <= (SELECT max(t2a) @@ -386,16 +401,16 @@ WHERE t1a >= (SELECT min(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 23 schema +-- !query 26 schema struct --- !query 23 output +-- !query 26 output val1a val1b val1c val1d --- !query 24 +-- !query 27 SELECT t1a FROM t1 WHERE t1a <= (SELECT max(t2a) @@ -409,13 +424,13 @@ WHERE t1a >= (SELECT min(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 24 schema +-- !query 27 schema struct --- !query 24 output +-- !query 27 output val1a --- !query 25 +-- !query 28 SELECT t1a FROM t1 GROUP BY t1a, t1c @@ -423,8 +438,8 @@ HAVING max(t1b) <= (SELECT max(t2b) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 25 schema +-- !query 28 schema struct --- !query 25 output +-- !query 28 output val1b val1c From 08252bb38e48a940b4d8fb975fe12a020fe36b97 Mon Sep 17 00:00:00 2001 From: Gabor Somogyi Date: Wed, 17 Jan 2018 10:03:25 +0800 Subject: [PATCH 105/566] [SPARK-22361][SQL][TEST] Add unit test for Window Frames ## What changes were proposed in this pull request? There are already quite a few integration tests using window frames, but the unit test coverage is not ideal. In this PR the existing tests are reorganized and extended, and additional cases are added where gaps were found. ## How was this patch tested? Automated: Pass the Jenkins. Author: Gabor Somogyi Closes #20019 from gaborgsomogyi/SPARK-22361.
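For readers skimming the patch, the frame API under test boils down to the rowsBetween/rangeBetween specifications sketched below. This is an illustrative example, not part of the patch; it assumes a local SparkSession and mirrors the bounds and data used in the suite's sliding-frame tests.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.avg

object WindowFrameSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("window-frames").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "1"), (2, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value")

    // ROWS frame: one row before the current row through two rows after it.
    val rowsFrame = Window.partitionBy($"value").orderBy($"key").rowsBetween(-1, 2)
    // RANGE frame: all rows whose key lies within [current key - 1, current key + 1].
    val rangeFrame = Window.partitionBy($"value").orderBy($"key").rangeBetween(-1, 1)

    df.select($"key", avg("key").over(rowsFrame), avg("key").over(rangeFrame)).show()
    spark.stop()
  }
}
```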
(cherry picked from commit a9b845ebb5b51eb619cfa7d73b6153024a6a420d) Signed-off-by: gatorsmile --- .../parser/ExpressionParserSuite.scala | 57 ++- .../sql/DataFrameWindowFramesSuite.scala | 405 ++++++++++++++++++ .../sql/DataFrameWindowFunctionsSuite.scala | 243 ----------- 3 files changed, 454 insertions(+), 251 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala index 2b9783a3295c6..cb8a1fecb80a7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala @@ -249,8 +249,8 @@ class ExpressionParserSuite extends PlanTest { assertEqual("foo(*) over (partition by a, b)", windowed(Seq('a, 'b))) assertEqual("foo(*) over (distribute by a, b)", windowed(Seq('a, 'b))) assertEqual("foo(*) over (cluster by a, b)", windowed(Seq('a, 'b))) - assertEqual("foo(*) over (order by a desc, b asc)", windowed(Seq.empty, Seq('a.desc, 'b.asc ))) - assertEqual("foo(*) over (sort by a desc, b asc)", windowed(Seq.empty, Seq('a.desc, 'b.asc ))) + assertEqual("foo(*) over (order by a desc, b asc)", windowed(Seq.empty, Seq('a.desc, 'b.asc))) + assertEqual("foo(*) over (sort by a desc, b asc)", windowed(Seq.empty, Seq('a.desc, 'b.asc))) assertEqual("foo(*) over (partition by a, b order by c)", windowed(Seq('a, 'b), Seq('c.asc))) assertEqual("foo(*) over (distribute by a, b sort by c)", windowed(Seq('a, 'b), Seq('c.asc))) @@ -263,21 +263,62 @@ class ExpressionParserSuite extends PlanTest { "sum(product + 1) over (partition by ((product / 2) + 1) order by 2)", WindowExpression('sum.function('product + 1), WindowSpecDefinition(Seq('product / 2 + 1), Seq(Literal(2).asc), UnspecifiedFrame))) + } + + test("range/rows window function expressions") { + val func = 'foo.function(star()) + def windowed( + partitioning: Seq[Expression] = Seq.empty, + ordering: Seq[SortOrder] = Seq.empty, + frame: WindowFrame = UnspecifiedFrame): Expression = { + WindowExpression(func, WindowSpecDefinition(partitioning, ordering, frame)) + } - // Range/Row val frameTypes = Seq(("rows", RowFrame), ("range", RangeFrame)) val boundaries = Seq( - ("10 preceding", -Literal(10), CurrentRow), + // No between combinations + ("unbounded preceding", UnboundedPreceding, CurrentRow), ("2147483648 preceding", -Literal(2147483648L), CurrentRow), + ("10 preceding", -Literal(10), CurrentRow), + ("3 + 1 preceding", -Add(Literal(3), Literal(1)), CurrentRow), + ("0 preceding", -Literal(0), CurrentRow), + ("current row", CurrentRow, CurrentRow), + ("0 following", Literal(0), CurrentRow), ("3 + 1 following", Add(Literal(3), Literal(1)), CurrentRow), - ("unbounded preceding", UnboundedPreceding, CurrentRow), + ("10 following", Literal(10), CurrentRow), + ("2147483649 following", Literal(2147483649L), CurrentRow), ("unbounded following", UnboundedFollowing, CurrentRow), // Will fail during analysis + + // Between combinations + ("between unbounded preceding and 5 following", + UnboundedPreceding, Literal(5)), + ("between unbounded preceding and 3 + 1 following", + UnboundedPreceding, Add(Literal(3), Literal(1))), + ("between unbounded preceding and 2147483649 following", + UnboundedPreceding, Literal(2147483649L)), ("between unbounded preceding and current row", 
UnboundedPreceding, CurrentRow), - ("between unbounded preceding and unbounded following", - UnboundedPreceding, UnboundedFollowing), + ("between 2147483648 preceding and current row", -Literal(2147483648L), CurrentRow), ("between 10 preceding and current row", -Literal(10), CurrentRow), + ("between 3 + 1 preceding and current row", -Add(Literal(3), Literal(1)), CurrentRow), + ("between 0 preceding and current row", -Literal(0), CurrentRow), + ("between current row and current row", CurrentRow, CurrentRow), + ("between current row and 0 following", CurrentRow, Literal(0)), ("between current row and 5 following", CurrentRow, Literal(5)), - ("between 10 preceding and 5 following", -Literal(10), Literal(5)) + ("between current row and 3 + 1 following", CurrentRow, Add(Literal(3), Literal(1))), + ("between current row and 2147483649 following", CurrentRow, Literal(2147483649L)), + ("between current row and unbounded following", CurrentRow, UnboundedFollowing), + ("between 2147483648 preceding and unbounded following", + -Literal(2147483648L), UnboundedFollowing), + ("between 10 preceding and unbounded following", + -Literal(10), UnboundedFollowing), + ("between 3 + 1 preceding and unbounded following", + -Add(Literal(3), Literal(1)), UnboundedFollowing), + ("between 0 preceding and unbounded following", -Literal(0), UnboundedFollowing), + + // Between partial and full range + ("between 10 preceding and 5 following", -Literal(10), Literal(5)), + ("between unbounded preceding and unbounded following", + UnboundedPreceding, UnboundedFollowing) ) frameTypes.foreach { case (frameTypeSql, frameType) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala new file mode 100644 index 0000000000000..0ee9b0edc02b2 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala @@ -0,0 +1,405 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.sql.{Date, Timestamp} + +import org.apache.spark.sql.expressions.Window +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.unsafe.types.CalendarInterval + +/** + * Window frame testing for DataFrame API. 
+ */ +class DataFrameWindowFramesSuite extends QueryTest with SharedSQLContext { + import testImplicits._ + + test("lead/lag with empty data frame") { + val df = Seq.empty[(Int, String)].toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value") + + checkAnswer( + df.select( + lead("value", 1).over(window), + lag("value", 1).over(window)), + Nil) + } + + test("lead/lag with positive offset") { + val df = Seq((1, "1"), (2, "2"), (1, "3"), (2, "4")).toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value") + + checkAnswer( + df.select( + $"key", + lead("value", 1).over(window), + lag("value", 1).over(window)), + Row(1, "3", null) :: Row(1, null, "1") :: Row(2, "4", null) :: Row(2, null, "2") :: Nil) + } + + test("reverse lead/lag with positive offset") { + val df = Seq((1, "1"), (2, "2"), (1, "3"), (2, "4")).toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value".desc) + + checkAnswer( + df.select( + $"key", + lead("value", 1).over(window), + lag("value", 1).over(window)), + Row(1, "1", null) :: Row(1, null, "3") :: Row(2, "2", null) :: Row(2, null, "4") :: Nil) + } + + test("lead/lag with negative offset") { + val df = Seq((1, "1"), (2, "2"), (1, "3"), (2, "4")).toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value") + + checkAnswer( + df.select( + $"key", + lead("value", -1).over(window), + lag("value", -1).over(window)), + Row(1, null, "3") :: Row(1, "1", null) :: Row(2, null, "4") :: Row(2, "2", null) :: Nil) + } + + test("reverse lead/lag with negative offset") { + val df = Seq((1, "1"), (2, "2"), (1, "3"), (2, "4")).toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value".desc) + + checkAnswer( + df.select( + $"key", + lead("value", -1).over(window), + lag("value", -1).over(window)), + Row(1, null, "1") :: Row(1, "3", null) :: Row(2, null, "2") :: Row(2, "4", null) :: Nil) + } + + test("lead/lag with default value") { + val default = "n/a" + val df = Seq((1, "1"), (2, "2"), (1, "3"), (2, "4"), (2, "5")).toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value") + + checkAnswer( + df.select( + $"key", + lead("value", 2, default).over(window), + lag("value", 2, default).over(window), + lead("value", -2, default).over(window), + lag("value", -2, default).over(window)), + Row(1, default, default, default, default) :: Row(1, default, default, default, default) :: + Row(2, "5", default, default, "5") :: Row(2, default, "2", "2", default) :: + Row(2, default, default, default, default) :: Nil) + } + + test("rows/range between with empty data frame") { + val df = Seq.empty[(String, Int)].toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value") + + checkAnswer( + df.select( + 'key, + first("value").over( + window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)), + first("value").over( + window.rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing))), + Nil) + } + + test("rows between should accept int/long values as boundary") { + val df = Seq((1L, "1"), (1L, "1"), (2147483650L, "1"), (3L, "2"), (2L, "1"), (2147483650L, "2")) + .toDF("key", "value") + + checkAnswer( + df.select( + $"key", + count("key").over( + Window.partitionBy($"value").orderBy($"key").rowsBetween(0, 2147483647))), + Seq(Row(1, 3), Row(1, 4), Row(2, 2), Row(3, 2), Row(2147483650L, 1), Row(2147483650L, 1)) + ) + + val e = intercept[AnalysisException]( + df.select( + $"key", + count("key").over( + 
Window.partitionBy($"value").orderBy($"key").rowsBetween(0, 2147483648L)))) + assert(e.message.contains("Boundary end is not a valid integer: 2147483648")) + } + + test("range between should accept at most one ORDER BY expression when unbounded") { + val df = Seq((1, 1)).toDF("key", "value") + val window = Window.orderBy($"key", $"value") + + checkAnswer( + df.select( + $"key", + min("key").over( + window.rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing))), + Seq(Row(1, 1)) + ) + + val e1 = intercept[AnalysisException]( + df.select( + min("key").over(window.rangeBetween(Window.unboundedPreceding, 1)))) + assert(e1.message.contains("A range window frame with value boundaries cannot be used in a " + + "window specification with multiple order by expressions")) + + val e2 = intercept[AnalysisException]( + df.select( + min("key").over(window.rangeBetween(-1, Window.unboundedFollowing)))) + assert(e2.message.contains("A range window frame with value boundaries cannot be used in a " + + "window specification with multiple order by expressions")) + + val e3 = intercept[AnalysisException]( + df.select( + min("key").over(window.rangeBetween(-1, 1)))) + assert(e3.message.contains("A range window frame with value boundaries cannot be used in a " + + "window specification with multiple order by expressions")) + } + + test("range between should accept numeric values only when bounded") { + val df = Seq("non_numeric").toDF("value") + val window = Window.orderBy($"value") + + checkAnswer( + df.select( + $"value", + min("value").over( + window.rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing))), + Row("non_numeric", "non_numeric") :: Nil) + + val e1 = intercept[AnalysisException]( + df.select( + min("value").over(window.rangeBetween(Window.unboundedPreceding, 1)))) + assert(e1.message.contains("The data type of the upper bound 'string' " + + "does not match the expected data type")) + + val e2 = intercept[AnalysisException]( + df.select( + min("value").over(window.rangeBetween(-1, Window.unboundedFollowing)))) + assert(e2.message.contains("The data type of the lower bound 'string' " + + "does not match the expected data type")) + + val e3 = intercept[AnalysisException]( + df.select( + min("value").over(window.rangeBetween(-1, 1)))) + assert(e3.message.contains("The data type of the lower bound 'string' " + + "does not match the expected data type")) + } + + test("range between should accept int/long values as boundary") { + val df = Seq((1L, "1"), (1L, "1"), (2147483650L, "1"), (3L, "2"), (2L, "1"), (2147483650L, "2")) + .toDF("key", "value") + + checkAnswer( + df.select( + $"key", + count("key").over( + Window.partitionBy($"value").orderBy($"key").rangeBetween(0, 2147483648L))), + Seq(Row(1, 3), Row(1, 3), Row(2, 2), Row(3, 2), Row(2147483650L, 1), Row(2147483650L, 1)) + ) + checkAnswer( + df.select( + $"key", + count("key").over( + Window.partitionBy($"value").orderBy($"key").rangeBetween(-2147483649L, 0))), + Seq(Row(1, 2), Row(1, 2), Row(2, 3), Row(2147483650L, 2), Row(2147483650L, 4), Row(3, 1)) + ) + + def dt(date: String): Date = Date.valueOf(date) + + val df2 = Seq((dt("2017-08-01"), "1"), (dt("2017-08-01"), "1"), (dt("2020-12-31"), "1"), + (dt("2017-08-03"), "2"), (dt("2017-08-02"), "1"), (dt("2020-12-31"), "2")) + .toDF("key", "value") + val window = Window.partitionBy($"value").orderBy($"key").rangeBetween(lit(0), lit(2)) + + checkAnswer( + df2.select( + $"key", + count("key").over(window)), + Seq(Row(dt("2017-08-01"), 3), Row(dt("2017-08-01"), 3), 
Row(dt("2020-12-31"), 1), + Row(dt("2017-08-03"), 1), Row(dt("2017-08-02"), 1), Row(dt("2020-12-31"), 1)) + ) + } + + test("range between should accept double values as boundary") { + val df = Seq((1.0D, "1"), (1.0D, "1"), (100.001D, "1"), (3.3D, "2"), (2.02D, "1"), + (100.001D, "2")).toDF("key", "value") + val window = Window.partitionBy($"value").orderBy($"key").rangeBetween(currentRow, lit(2.5D)) + + checkAnswer( + df.select( + $"key", + count("key").over(window)), + Seq(Row(1.0, 3), Row(1.0, 3), Row(100.001, 1), Row(3.3, 1), Row(2.02, 1), Row(100.001, 1)) + ) + } + + test("range between should accept interval values as boundary") { + def ts(timestamp: Long): Timestamp = new Timestamp(timestamp * 1000) + + val df = Seq((ts(1501545600), "1"), (ts(1501545600), "1"), (ts(1609372800), "1"), + (ts(1503000000), "2"), (ts(1502000000), "1"), (ts(1609372800), "2")) + .toDF("key", "value") + val window = Window.partitionBy($"value").orderBy($"key") + .rangeBetween(currentRow, lit(CalendarInterval.fromString("interval 23 days 4 hours"))) + + checkAnswer( + df.select( + $"key", + count("key").over(window)), + Seq(Row(ts(1501545600), 3), Row(ts(1501545600), 3), Row(ts(1609372800), 1), + Row(ts(1503000000), 1), Row(ts(1502000000), 1), Row(ts(1609372800), 1)) + ) + } + + test("unbounded rows/range between with aggregation") { + val df = Seq(("one", 1), ("two", 2), ("one", 3), ("two", 4)).toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value") + + checkAnswer( + df.select( + 'key, + sum("value").over(window. + rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)), + sum("value").over(window. + rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing))), + Row("one", 4, 4) :: Row("one", 4, 4) :: Row("two", 6, 6) :: Row("two", 6, 6) :: Nil) + } + + test("unbounded preceding/following rows between with aggregation") { + val df = Seq((1, "1"), (2, "2"), (2, "3"), (1, "3"), (3, "2"), (4, "3")).toDF("key", "value") + val window = Window.partitionBy($"value").orderBy($"key") + + checkAnswer( + df.select( + $"key", + last("key").over( + window.rowsBetween(Window.currentRow, Window.unboundedFollowing)), + last("key").over( + window.rowsBetween(Window.unboundedPreceding, Window.currentRow))), + Row(1, 1, 1) :: Row(2, 3, 2) :: Row(3, 3, 3) :: Row(1, 4, 1) :: Row(2, 4, 2) :: + Row(4, 4, 4) :: Nil) + } + + test("reverse unbounded preceding/following rows between with aggregation") { + val df = Seq((1, "1"), (2, "2"), (2, "3"), (1, "3"), (3, "2"), (4, "3")).toDF("key", "value") + val window = Window.partitionBy($"value").orderBy($"key".desc) + + checkAnswer( + df.select( + $"key", + last("key").over( + window.rowsBetween(Window.currentRow, Window.unboundedFollowing)), + last("key").over( + window.rowsBetween(Window.unboundedPreceding, Window.currentRow))), + Row(1, 1, 1) :: Row(3, 2, 3) :: Row(2, 2, 2) :: Row(4, 1, 4) :: Row(2, 1, 2) :: + Row(1, 1, 1) :: Nil) + } + + test("unbounded preceding/following range between with aggregation") { + val df = Seq((5, "1"), (5, "2"), (4, "2"), (6, "2"), (3, "1"), (2, "2")).toDF("key", "value") + val window = Window.partitionBy("value").orderBy("key") + + checkAnswer( + df.select( + $"key", + avg("key").over(window.rangeBetween(Window.unboundedPreceding, 1)) + .as("avg_key1"), + avg("key").over(window.rangeBetween(Window.currentRow, Window.unboundedFollowing)) + .as("avg_key2")), + Row(3, 3.0d, 4.0d) :: Row(5, 4.0d, 5.0d) :: Row(2, 2.0d, 17.0d / 4.0d) :: + Row(4, 11.0d / 3.0d, 5.0d) :: Row(5, 17.0d / 4.0d, 11.0d / 2.0d) :: + Row(6, 
17.0d / 4.0d, 6.0d) :: Nil) + } + + // This is here to illustrate the fact that reverse order also reverses offsets. + test("reverse preceding/following range between with aggregation") { + val df = Seq(1, 2, 4, 3, 2, 1).toDF("value") + val window = Window.orderBy($"value".desc) + + checkAnswer( + df.select( + $"value", + sum($"value").over(window.rangeBetween(Window.unboundedPreceding, 1)), + sum($"value").over(window.rangeBetween(1, Window.unboundedFollowing))), + Row(1, 13, null) :: Row(2, 13, 2) :: Row(4, 7, 9) :: Row(3, 11, 6) :: + Row(2, 13, 2) :: Row(1, 13, null) :: Nil) + } + + test("sliding rows between with aggregation") { + val df = Seq((1, "1"), (2, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") + val window = Window.partitionBy($"value").orderBy($"key").rowsBetween(-1, 2) + + checkAnswer( + df.select( + $"key", + avg("key").over(window)), + Row(1, 4.0d / 3.0d) :: Row(1, 4.0d / 3.0d) :: Row(2, 3.0d / 2.0d) :: Row(2, 2.0d) :: + Row(2, 2.0d) :: Nil) + } + + test("reverse sliding rows between with aggregation") { + val df = Seq((1, "1"), (2, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") + val window = Window.partitionBy($"value").orderBy($"key".desc).rowsBetween(-1, 2) + + checkAnswer( + df.select( + $"key", + avg("key").over(window)), + Row(1, 1.0d) :: Row(1, 4.0d / 3.0d) :: Row(2, 4.0d / 3.0d) :: Row(2, 2.0d) :: + Row(2, 2.0d) :: Nil) + } + + test("sliding range between with aggregation") { + val df = Seq((1, "1"), (1, "1"), (3, "1"), (2, "2"), (2, "1"), (2, "2")).toDF("key", "value") + val window = Window.partitionBy($"value").orderBy($"key").rangeBetween(-1, 1) + + checkAnswer( + df.select( + $"key", + avg("key").over(window)), + Row(1, 4.0d / 3.0d) :: Row(1, 4.0d / 3.0d) :: Row(2, 7.0d / 4.0d) :: Row(3, 5.0d / 2.0d) :: + Row(2, 2.0d) :: Row(2, 2.0d) :: Nil) + } + + test("reverse sliding range between with aggregation") { + val df = Seq( + (1, "Thin", "Cell Phone", 6000), + (2, "Normal", "Tablet", 1500), + (3, "Mini", "Tablet", 5500), + (4, "Ultra thin", "Cell Phone", 5500), + (5, "Very thin", "Cell Phone", 6000), + (6, "Big", "Tablet", 2500), + (7, "Bendable", "Cell Phone", 3000), + (8, "Foldable", "Cell Phone", 3000), + (9, "Pro", "Tablet", 4500), + (10, "Pro2", "Tablet", 6500)). + toDF("id", "product", "category", "revenue") + val window = Window.partitionBy($"category").orderBy($"revenue".desc). 
+ rangeBetween(-2000L, 1000L) + + checkAnswer( + df.select( + $"id", + avg($"revenue").over(window).cast("int")), + Row(1, 5833) :: Row(2, 2000) :: Row(3, 5500) :: + Row(4, 5833) :: Row(5, 5833) :: Row(6, 2833) :: + Row(7, 3000) :: Row(8, 3000) :: Row(9, 5500) :: + Row(10, 6000) :: Nil) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala index 01c988ecc3726..281147835abde 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala @@ -55,56 +55,6 @@ class DataFrameWindowFunctionsSuite extends QueryTest with SharedSQLContext { Row(1, "1") :: Row(2, "2") :: Row(null, null) :: Row(null, null) :: Nil) } - test("Window.rowsBetween") { - val df = Seq(("one", 1), ("two", 2)).toDF("key", "value") - // Running (cumulative) sum - checkAnswer( - df.select('key, sum("value").over( - Window.rowsBetween(Window.unboundedPreceding, Window.currentRow))), - Row("one", 1) :: Row("two", 3) :: Nil - ) - } - - test("lead") { - val df = Seq((1, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - - checkAnswer( - df.select( - lead("value", 1).over(Window.partitionBy($"key").orderBy($"value"))), - Row("1") :: Row(null) :: Row("2") :: Row(null) :: Nil) - } - - test("lag") { - val df = Seq((1, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - - checkAnswer( - df.select( - lag("value", 1).over(Window.partitionBy($"key").orderBy($"value"))), - Row(null) :: Row("1") :: Row(null) :: Row("2") :: Nil) - } - - test("lead with default value") { - val df = Seq((1, "1"), (1, "1"), (2, "2"), (1, "1"), - (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - lead("value", 2, "n/a").over(Window.partitionBy("key").orderBy("value"))), - Seq(Row("1"), Row("1"), Row("n/a"), Row("n/a"), Row("2"), Row("n/a"), Row("n/a"))) - } - - test("lag with default value") { - val df = Seq((1, "1"), (1, "1"), (2, "2"), (1, "1"), - (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - lag("value", 2, "n/a").over(Window.partitionBy($"key").orderBy($"value"))), - Seq(Row("n/a"), Row("n/a"), Row("1"), Row("1"), Row("n/a"), Row("n/a"), Row("2"))) - } - test("rank functions in unspecific window") { val df = Seq((1, "1"), (2, "2"), (1, "2"), (2, "2")).toDF("key", "value") df.createOrReplaceTempView("window_table") @@ -136,199 +86,6 @@ class DataFrameWindowFunctionsSuite extends QueryTest with SharedSQLContext { assert(e.message.contains("requires window to be ordered")) } - test("aggregation and rows between") { - val df = Seq((1, "1"), (2, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - avg("key").over(Window.partitionBy($"value").orderBy($"key").rowsBetween(-1, 2))), - Seq(Row(4.0d / 3.0d), Row(4.0d / 3.0d), Row(3.0d / 2.0d), Row(2.0d), Row(2.0d))) - } - - test("aggregation and range between") { - val df = Seq((1, "1"), (1, "1"), (3, "1"), (2, "2"), (2, "1"), (2, "2")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - avg("key").over(Window.partitionBy($"value").orderBy($"key").rangeBetween(-1, 1))), - Seq(Row(4.0d / 3.0d), Row(4.0d 
/ 3.0d), Row(7.0d / 4.0d), Row(5.0d / 2.0d), - Row(2.0d), Row(2.0d))) - } - - test("row between should accept integer values as boundary") { - val df = Seq((1L, "1"), (1L, "1"), (2147483650L, "1"), - (3L, "2"), (2L, "1"), (2147483650L, "2")) - .toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - $"key", - count("key").over( - Window.partitionBy($"value").orderBy($"key").rowsBetween(0, 2147483647))), - Seq(Row(1, 3), Row(1, 4), Row(2, 2), Row(3, 2), Row(2147483650L, 1), Row(2147483650L, 1)) - ) - - val e = intercept[AnalysisException]( - df.select( - $"key", - count("key").over( - Window.partitionBy($"value").orderBy($"key").rowsBetween(0, 2147483648L)))) - assert(e.message.contains("Boundary end is not a valid integer: 2147483648")) - } - - test("range between should accept int/long values as boundary") { - val df = Seq((1L, "1"), (1L, "1"), (2147483650L, "1"), - (3L, "2"), (2L, "1"), (2147483650L, "2")) - .toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - $"key", - count("key").over( - Window.partitionBy($"value").orderBy($"key").rangeBetween(0, 2147483648L))), - Seq(Row(1, 3), Row(1, 3), Row(2, 2), Row(3, 2), Row(2147483650L, 1), Row(2147483650L, 1)) - ) - checkAnswer( - df.select( - $"key", - count("key").over( - Window.partitionBy($"value").orderBy($"key").rangeBetween(-2147483649L, 0))), - Seq(Row(1, 2), Row(1, 2), Row(2, 3), Row(2147483650L, 2), Row(2147483650L, 4), Row(3, 1)) - ) - - def dt(date: String): Date = Date.valueOf(date) - - val df2 = Seq((dt("2017-08-01"), "1"), (dt("2017-08-01"), "1"), (dt("2020-12-31"), "1"), - (dt("2017-08-03"), "2"), (dt("2017-08-02"), "1"), (dt("2020-12-31"), "2")) - .toDF("key", "value") - checkAnswer( - df2.select( - $"key", - count("key").over( - Window.partitionBy($"value").orderBy($"key").rangeBetween(lit(0), lit(2)))), - Seq(Row(dt("2017-08-01"), 3), Row(dt("2017-08-01"), 3), Row(dt("2020-12-31"), 1), - Row(dt("2017-08-03"), 1), Row(dt("2017-08-02"), 1), Row(dt("2020-12-31"), 1)) - ) - } - - test("range between should accept double values as boundary") { - val df = Seq((1.0D, "1"), (1.0D, "1"), (100.001D, "1"), - (3.3D, "2"), (2.02D, "1"), (100.001D, "2")) - .toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - $"key", - count("key").over( - Window.partitionBy($"value").orderBy($"key") - .rangeBetween(currentRow, lit(2.5D)))), - Seq(Row(1.0, 3), Row(1.0, 3), Row(100.001, 1), Row(3.3, 1), Row(2.02, 1), Row(100.001, 1)) - ) - } - - test("range between should accept interval values as boundary") { - def ts(timestamp: Long): Timestamp = new Timestamp(timestamp * 1000) - - val df = Seq((ts(1501545600), "1"), (ts(1501545600), "1"), (ts(1609372800), "1"), - (ts(1503000000), "2"), (ts(1502000000), "1"), (ts(1609372800), "2")) - .toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - $"key", - count("key").over( - Window.partitionBy($"value").orderBy($"key") - .rangeBetween(currentRow, - lit(CalendarInterval.fromString("interval 23 days 4 hours"))))), - Seq(Row(ts(1501545600), 3), Row(ts(1501545600), 3), Row(ts(1609372800), 1), - Row(ts(1503000000), 1), Row(ts(1502000000), 1), Row(ts(1609372800), 1)) - ) - } - - test("aggregation and rows between with unbounded") { - val df = Seq((1, "1"), (2, "2"), (2, "3"), (1, "3"), (3, "2"), (4, "3")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - $"key", - last("key").over( - 
Window.partitionBy($"value").orderBy($"key") - .rowsBetween(Window.currentRow, Window.unboundedFollowing)), - last("key").over( - Window.partitionBy($"value").orderBy($"key") - .rowsBetween(Window.unboundedPreceding, Window.currentRow)), - last("key").over(Window.partitionBy($"value").orderBy($"key").rowsBetween(-1, 1))), - Seq(Row(1, 1, 1, 1), Row(2, 3, 2, 3), Row(3, 3, 3, 3), Row(1, 4, 1, 2), Row(2, 4, 2, 4), - Row(4, 4, 4, 4))) - } - - test("aggregation and range between with unbounded") { - val df = Seq((5, "1"), (5, "2"), (4, "2"), (6, "2"), (3, "1"), (2, "2")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - $"key", - last("value").over( - Window.partitionBy($"value").orderBy($"key").rangeBetween(-2, -1)) - .equalTo("2") - .as("last_v"), - avg("key").over(Window.partitionBy("value").orderBy("key").rangeBetween(Long.MinValue, 1)) - .as("avg_key1"), - avg("key").over(Window.partitionBy("value").orderBy("key").rangeBetween(0, Long.MaxValue)) - .as("avg_key2"), - avg("key").over(Window.partitionBy("value").orderBy("key").rangeBetween(-1, 0)) - .as("avg_key3") - ), - Seq(Row(3, null, 3.0d, 4.0d, 3.0d), - Row(5, false, 4.0d, 5.0d, 5.0d), - Row(2, null, 2.0d, 17.0d / 4.0d, 2.0d), - Row(4, true, 11.0d / 3.0d, 5.0d, 4.0d), - Row(5, true, 17.0d / 4.0d, 11.0d / 2.0d, 4.5d), - Row(6, true, 17.0d / 4.0d, 6.0d, 11.0d / 2.0d))) - } - - test("reverse sliding range frame") { - val df = Seq( - (1, "Thin", "Cell Phone", 6000), - (2, "Normal", "Tablet", 1500), - (3, "Mini", "Tablet", 5500), - (4, "Ultra thin", "Cell Phone", 5500), - (5, "Very thin", "Cell Phone", 6000), - (6, "Big", "Tablet", 2500), - (7, "Bendable", "Cell Phone", 3000), - (8, "Foldable", "Cell Phone", 3000), - (9, "Pro", "Tablet", 4500), - (10, "Pro2", "Tablet", 6500)). - toDF("id", "product", "category", "revenue") - val window = Window. - partitionBy($"category"). - orderBy($"revenue".desc). - rangeBetween(-2000L, 1000L) - checkAnswer( - df.select( - $"id", - avg($"revenue").over(window).cast("int")), - Row(1, 5833) :: Row(2, 2000) :: Row(3, 5500) :: - Row(4, 5833) :: Row(5, 5833) :: Row(6, 2833) :: - Row(7, 3000) :: Row(8, 3000) :: Row(9, 5500) :: - Row(10, 6000) :: Nil) - } - - // This is here to illustrate the fact that reverse order also reverses offsets. - test("reverse unbounded range frame") { - val df = Seq(1, 2, 4, 3, 2, 1). - map(Tuple1.apply). - toDF("value") - val window = Window.orderBy($"value".desc) - checkAnswer( - df.select( - $"value", - sum($"value").over(window.rangeBetween(Long.MinValue, 1)), - sum($"value").over(window.rangeBetween(1, Long.MaxValue))), - Row(1, 13, null) :: Row(2, 13, 2) :: Row(4, 7, 9) :: - Row(3, 11, 6) :: Row(2, 13, 2) :: Row(1, 13, null) :: Nil) - } - test("statistical functions") { val df = Seq(("a", 1), ("a", 1), ("a", 2), ("a", 2), ("b", 4), ("b", 3), ("b", 2)). toDF("key", "value") From 0a441d2edb0a3f6c6c7c370db8917e1c07f211e7 Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Tue, 16 Jan 2018 18:11:27 -0800 Subject: [PATCH 106/566] [SPARK-22908][SS] Roll forward continuous processing Kafka support with fix to continuous Kafka data reader ## What changes were proposed in this pull request? The Kafka reader is now interruptible and can close itself. ## How was this patch tested? I locally ran one of the ContinuousKafkaSourceSuite tests in a tight loop. Before the fix, my machine ran out of open file descriptors a few iterations in; now it works fine. Author: Jose Torres Closes #20253 from jose-torres/fix-data-reader. 
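The essence of the fix, abstracted away from Kafka, is the polling pattern sketched below. This is a hypothetical, simplified illustration (the trait and class names are invented; the real reader works against a Kafka consumer and Spark's TaskContext): poll with a short timeout so interruption is noticed quickly, and let the reader close its own consumer so resources such as file descriptors are released.

```scala
import java.io.Closeable

// Minimal stand-in for a blocking source such as a Kafka consumer.
trait PollingSource[A] extends Closeable {
  /** Returns Some(record) if one arrived within `timeoutMs`, otherwise None. */
  def poll(timeoutMs: Long): Option[A]
}

// Sketch of an interruptible reader: it never blocks longer than one short poll
// before re-checking the interruption flag, and it owns (and closes) its source.
final class InterruptibleReader[A](source: PollingSource[A], interrupted: () => Boolean)
    extends Closeable {

  /** Blocks until a record arrives or the task is interrupted; None means "stop reading". */
  def next(): Option[A] = {
    var record: Option[A] = None
    while (record.isEmpty) {
      if (interrupted()) return None          // give up promptly when the task is cancelled
      record = source.poll(timeoutMs = 512L)  // short timeout keeps interruption latency low
    }
    record
  }

  override def close(): Unit = source.close() // releasing the source frees file descriptors
}
```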
(cherry picked from commit 16670578519a7b787b0c63888b7d2873af12d5b9) Signed-off-by: Tathagata Das --- .../sql/kafka010/KafkaContinuousReader.scala | 260 +++++++++ .../sql/kafka010/KafkaContinuousWriter.scala | 119 ++++ .../sql/kafka010/KafkaOffsetReader.scala | 21 +- .../spark/sql/kafka010/KafkaSource.scala | 17 +- .../sql/kafka010/KafkaSourceOffset.scala | 7 +- .../sql/kafka010/KafkaSourceProvider.scala | 105 +++- .../spark/sql/kafka010/KafkaWriteTask.scala | 71 ++- .../spark/sql/kafka010/KafkaWriter.scala | 5 +- .../kafka010/KafkaContinuousSinkSuite.scala | 476 ++++++++++++++++ .../kafka010/KafkaContinuousSourceSuite.scala | 96 ++++ .../sql/kafka010/KafkaContinuousTest.scala | 94 +++ .../spark/sql/kafka010/KafkaSourceSuite.scala | 539 +++++++++--------- .../apache/spark/sql/DataFrameReader.scala | 32 +- .../apache/spark/sql/DataFrameWriter.scala | 25 +- .../datasources/v2/WriteToDataSourceV2.scala | 8 +- .../execution/streaming/StreamExecution.scala | 15 +- .../ContinuousDataSourceRDDIter.scala | 4 +- .../continuous/ContinuousExecution.scala | 67 ++- .../continuous/EpochCoordinator.scala | 21 +- .../sql/streaming/DataStreamWriter.scala | 26 +- .../spark/sql/streaming/StreamTest.scala | 36 +- 21 files changed, 1628 insertions(+), 416 deletions(-) create mode 100644 external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala create mode 100644 external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala create mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala create mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala create mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala new file mode 100644 index 0000000000000..fc977977504f7 --- /dev/null +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala @@ -0,0 +1,260 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.kafka010 + +import java.{util => ju} +import java.util.concurrent.TimeoutException + +import org.apache.kafka.clients.consumer.{ConsumerRecord, OffsetOutOfRangeException} +import org.apache.kafka.common.TopicPartition + +import org.apache.spark.TaskContext +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter} +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.kafka010.KafkaSource.{INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE, INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE} +import org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousDataReader, ContinuousReader, Offset, PartitionOffset} +import org.apache.spark.sql.types.StructType +import org.apache.spark.unsafe.types.UTF8String + +/** + * A [[ContinuousReader]] for data from kafka. + * + * @param offsetReader a reader used to get kafka offsets. Note that the actual data will be + * read by per-task consumers generated later. + * @param kafkaParams String params for per-task Kafka consumers. + * @param sourceOptions The [[org.apache.spark.sql.sources.v2.DataSourceV2Options]] params which + * are not Kafka consumer params. + * @param metadataPath Path to a directory this reader can use for writing metadata. + * @param initialOffsets The Kafka offsets to start reading data at. + * @param failOnDataLoss Flag indicating whether reading should fail in data loss + * scenarios, where some offsets after the specified initial ones can't be + * properly read. + */ +class KafkaContinuousReader( + offsetReader: KafkaOffsetReader, + kafkaParams: ju.Map[String, Object], + sourceOptions: Map[String, String], + metadataPath: String, + initialOffsets: KafkaOffsetRangeLimit, + failOnDataLoss: Boolean) + extends ContinuousReader with SupportsScanUnsafeRow with Logging { + + private lazy val session = SparkSession.getActiveSession.get + private lazy val sc = session.sparkContext + + private val pollTimeoutMs = sourceOptions.getOrElse("kafkaConsumer.pollTimeoutMs", "512").toLong + + // Initialized when creating read tasks. If this diverges from the partitions at the latest + // offsets, we need to reconfigure. + // Exposed outside this object only for unit tests. 
+ private[sql] var knownPartitions: Set[TopicPartition] = _ + + override def readSchema: StructType = KafkaOffsetReader.kafkaSchema + + private var offset: Offset = _ + override def setOffset(start: ju.Optional[Offset]): Unit = { + offset = start.orElse { + val offsets = initialOffsets match { + case EarliestOffsetRangeLimit => KafkaSourceOffset(offsetReader.fetchEarliestOffsets()) + case LatestOffsetRangeLimit => KafkaSourceOffset(offsetReader.fetchLatestOffsets()) + case SpecificOffsetRangeLimit(p) => offsetReader.fetchSpecificOffsets(p, reportDataLoss) + } + logInfo(s"Initial offsets: $offsets") + offsets + } + } + + override def getStartOffset(): Offset = offset + + override def deserializeOffset(json: String): Offset = { + KafkaSourceOffset(JsonUtils.partitionOffsets(json)) + } + + override def createUnsafeRowReadTasks(): ju.List[ReadTask[UnsafeRow]] = { + import scala.collection.JavaConverters._ + + val oldStartPartitionOffsets = KafkaSourceOffset.getPartitionOffsets(offset) + + val currentPartitionSet = offsetReader.fetchEarliestOffsets().keySet + val newPartitions = currentPartitionSet.diff(oldStartPartitionOffsets.keySet) + val newPartitionOffsets = offsetReader.fetchEarliestOffsets(newPartitions.toSeq) + + val deletedPartitions = oldStartPartitionOffsets.keySet.diff(currentPartitionSet) + if (deletedPartitions.nonEmpty) { + reportDataLoss(s"Some partitions were deleted: $deletedPartitions") + } + + val startOffsets = newPartitionOffsets ++ + oldStartPartitionOffsets.filterKeys(!deletedPartitions.contains(_)) + knownPartitions = startOffsets.keySet + + startOffsets.toSeq.map { + case (topicPartition, start) => + KafkaContinuousReadTask( + topicPartition, start, kafkaParams, pollTimeoutMs, failOnDataLoss) + .asInstanceOf[ReadTask[UnsafeRow]] + }.asJava + } + + /** Stop this source and free any resources it has allocated. */ + def stop(): Unit = synchronized { + offsetReader.close() + } + + override def commit(end: Offset): Unit = {} + + override def mergeOffsets(offsets: Array[PartitionOffset]): Offset = { + val mergedMap = offsets.map { + case KafkaSourcePartitionOffset(p, o) => Map(p -> o) + }.reduce(_ ++ _) + KafkaSourceOffset(mergedMap) + } + + override def needsReconfiguration(): Boolean = { + knownPartitions != null && offsetReader.fetchLatestOffsets().keySet != knownPartitions + } + + override def toString(): String = s"KafkaSource[$offsetReader]" + + /** + * If `failOnDataLoss` is true, this method will throw an `IllegalStateException`. + * Otherwise, just log a warning. + */ + private def reportDataLoss(message: String): Unit = { + if (failOnDataLoss) { + throw new IllegalStateException(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE") + } else { + logWarning(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE") + } + } +} + +/** + * A read task for continuous Kafka processing. This will be serialized and transformed into a + * full reader on executors. + * + * @param topicPartition The (topic, partition) pair this task is responsible for. + * @param startOffset The offset to start reading from within the partition. + * @param kafkaParams Kafka consumer params to use. + * @param pollTimeoutMs The timeout for Kafka consumer polling. + * @param failOnDataLoss Flag indicating whether data reader should fail if some offsets + * are skipped. 
+ */ +case class KafkaContinuousReadTask( + topicPartition: TopicPartition, + startOffset: Long, + kafkaParams: ju.Map[String, Object], + pollTimeoutMs: Long, + failOnDataLoss: Boolean) extends ReadTask[UnsafeRow] { + override def createDataReader(): KafkaContinuousDataReader = { + new KafkaContinuousDataReader( + topicPartition, startOffset, kafkaParams, pollTimeoutMs, failOnDataLoss) + } +} + +/** + * A per-task data reader for continuous Kafka processing. + * + * @param topicPartition The (topic, partition) pair this data reader is responsible for. + * @param startOffset The offset to start reading from within the partition. + * @param kafkaParams Kafka consumer params to use. + * @param pollTimeoutMs The timeout for Kafka consumer polling. + * @param failOnDataLoss Flag indicating whether data reader should fail if some offsets + * are skipped. + */ +class KafkaContinuousDataReader( + topicPartition: TopicPartition, + startOffset: Long, + kafkaParams: ju.Map[String, Object], + pollTimeoutMs: Long, + failOnDataLoss: Boolean) extends ContinuousDataReader[UnsafeRow] { + private val topic = topicPartition.topic + private val kafkaPartition = topicPartition.partition + private val consumer = CachedKafkaConsumer.createUncached(topic, kafkaPartition, kafkaParams) + + private val sharedRow = new UnsafeRow(7) + private val bufferHolder = new BufferHolder(sharedRow) + private val rowWriter = new UnsafeRowWriter(bufferHolder, 7) + + private var nextKafkaOffset = startOffset + private var currentRecord: ConsumerRecord[Array[Byte], Array[Byte]] = _ + + override def next(): Boolean = { + var r: ConsumerRecord[Array[Byte], Array[Byte]] = null + while (r == null) { + if (TaskContext.get().isInterrupted() || TaskContext.get().isCompleted()) return false + // Our consumer.get is not interruptible, so we have to set a low poll timeout, leaving + // interrupt points to end the query rather than waiting for new data that might never come. + try { + r = consumer.get( + nextKafkaOffset, + untilOffset = Long.MaxValue, + pollTimeoutMs, + failOnDataLoss) + } catch { + // We didn't read within the timeout. We're supposed to block indefinitely for new data, so + // swallow and ignore this. + case _: TimeoutException => + + // This is a failOnDataLoss exception. Retry if nextKafkaOffset is within the data range, + // or if it's the endpoint of the data range (i.e. the "true" next offset). 
+ case e: IllegalStateException if e.getCause.isInstanceOf[OffsetOutOfRangeException] => + val range = consumer.getAvailableOffsetRange() + if (range.latest >= nextKafkaOffset && range.earliest <= nextKafkaOffset) { + // retry + } else { + throw e + } + } + } + nextKafkaOffset = r.offset + 1 + currentRecord = r + true + } + + override def get(): UnsafeRow = { + bufferHolder.reset() + + if (currentRecord.key == null) { + rowWriter.setNullAt(0) + } else { + rowWriter.write(0, currentRecord.key) + } + rowWriter.write(1, currentRecord.value) + rowWriter.write(2, UTF8String.fromString(currentRecord.topic)) + rowWriter.write(3, currentRecord.partition) + rowWriter.write(4, currentRecord.offset) + rowWriter.write(5, + DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(currentRecord.timestamp))) + rowWriter.write(6, currentRecord.timestampType.id) + sharedRow.setTotalSize(bufferHolder.totalSize) + sharedRow + } + + override def getOffset(): KafkaSourcePartitionOffset = { + KafkaSourcePartitionOffset(topicPartition, nextKafkaOffset) + } + + override def close(): Unit = { + consumer.close() + } +} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala new file mode 100644 index 0000000000000..9843f469c5b25 --- /dev/null +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import org.apache.kafka.clients.producer.{Callback, ProducerRecord, RecordMetadata} +import scala.collection.JavaConverters._ + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal, UnsafeProjection} +import org.apache.spark.sql.kafka010.KafkaSourceProvider.{kafkaParamsForProducer, TOPIC_OPTION_KEY} +import org.apache.spark.sql.kafka010.KafkaWriter.validateQuery +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter +import org.apache.spark.sql.sources.v2.writer._ +import org.apache.spark.sql.streaming.OutputMode +import org.apache.spark.sql.types.{BinaryType, StringType, StructType} + +/** + * Dummy commit message. The DataSourceV2 framework requires a commit message implementation but we + * don't need to really send one. + */ +case object KafkaWriterCommitMessage extends WriterCommitMessage + +/** + * A [[ContinuousWriter]] for Kafka writing. Responsible for generating the writer factory. + * @param topic The topic this writer is responsible for. 
If None, topic will be inferred from + * a `topic` field in the incoming data. + * @param producerParams Parameters for Kafka producers in each task. + * @param schema The schema of the input data. + */ +class KafkaContinuousWriter( + topic: Option[String], producerParams: Map[String, String], schema: StructType) + extends ContinuousWriter with SupportsWriteInternalRow { + + validateQuery(schema.toAttributes, producerParams.toMap[String, Object].asJava, topic) + + override def createInternalRowWriterFactory(): KafkaContinuousWriterFactory = + KafkaContinuousWriterFactory(topic, producerParams, schema) + + override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} + override def abort(messages: Array[WriterCommitMessage]): Unit = {} +} + +/** + * A [[DataWriterFactory]] for Kafka writing. Will be serialized and sent to executors to generate + * the per-task data writers. + * @param topic The topic that should be written to. If None, topic will be inferred from + * a `topic` field in the incoming data. + * @param producerParams Parameters for Kafka producers in each task. + * @param schema The schema of the input data. + */ +case class KafkaContinuousWriterFactory( + topic: Option[String], producerParams: Map[String, String], schema: StructType) + extends DataWriterFactory[InternalRow] { + + override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[InternalRow] = { + new KafkaContinuousDataWriter(topic, producerParams, schema.toAttributes) + } +} + +/** + * A [[DataWriter]] for Kafka writing. One data writer will be created in each partition to + * process incoming rows. + * + * @param targetTopic The topic that this data writer is targeting. If None, topic will be inferred + * from a `topic` field in the incoming data. + * @param producerParams Parameters to use for the Kafka producer. + * @param inputSchema The attributes in the input data. + */ +class KafkaContinuousDataWriter( + targetTopic: Option[String], producerParams: Map[String, String], inputSchema: Seq[Attribute]) + extends KafkaRowWriter(inputSchema, targetTopic) with DataWriter[InternalRow] { + import scala.collection.JavaConverters._ + + private lazy val producer = CachedKafkaProducer.getOrCreate( + new java.util.HashMap[String, Object](producerParams.asJava)) + + def write(row: InternalRow): Unit = { + checkForErrors() + sendRow(row, producer) + } + + def commit(): WriterCommitMessage = { + // Send is asynchronous, but we can't commit until all rows are actually in Kafka. + // This requires flushing and then checking that no callbacks produced errors. + // We also check for errors before to fail as soon as possible - the check is cheap. 
+ checkForErrors() + producer.flush() + checkForErrors() + KafkaWriterCommitMessage + } + + def abort(): Unit = {} + + def close(): Unit = { + checkForErrors() + if (producer != null) { + producer.flush() + checkForErrors() + CachedKafkaProducer.close(new java.util.HashMap[String, Object](producerParams.asJava)) + } + } +} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala index 3e65949a6fd1b..551641cfdbca8 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala @@ -117,10 +117,14 @@ private[kafka010] class KafkaOffsetReader( * Resolves the specific offsets based on Kafka seek positions. * This method resolves offset value -1 to the latest and -2 to the * earliest Kafka seek position. + * + * @param partitionOffsets the specific offsets to resolve + * @param reportDataLoss callback to either report or log data loss depending on setting */ def fetchSpecificOffsets( - partitionOffsets: Map[TopicPartition, Long]): Map[TopicPartition, Long] = - runUninterruptibly { + partitionOffsets: Map[TopicPartition, Long], + reportDataLoss: String => Unit): KafkaSourceOffset = { + val fetched = runUninterruptibly { withRetriesWithoutInterrupt { // Poll to get the latest assigned partitions consumer.poll(0) @@ -145,6 +149,19 @@ private[kafka010] class KafkaOffsetReader( } } + partitionOffsets.foreach { + case (tp, off) if off != KafkaOffsetRangeLimit.LATEST && + off != KafkaOffsetRangeLimit.EARLIEST => + if (fetched(tp) != off) { + reportDataLoss( + s"startingOffsets for $tp was $off but consumer reset to ${fetched(tp)}") + } + case _ => + // no real way to check that beginning or end is reasonable + } + KafkaSourceOffset(fetched) + } + /** * Fetch the earliest offsets for the topic partitions that are indicated * in the [[ConsumerStrategy]]. 
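For context, the reader and writer classes above plug into the DataSourceV2 continuous APIs that KafkaSourceProvider wires up below. A minimal usage sketch of the kind of query they enable (not part of this patch, mirroring the new test suites further down; the broker address, topic names and checkpoint path are placeholders):

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.streaming.Trigger

    val spark = SparkSession.builder().appName("kafka-continuous-sketch").getOrCreate()

    // Continuous read: resolved through createContinuousReader to a KafkaContinuousReader.
    val input = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "broker:9092")  // placeholder
      .option("subscribe", "in-topic")                    // placeholder
      .option("startingOffsets", "earliest")
      .load()

    // Continuous write: resolved through createContinuousWriter to a KafkaContinuousWriter,
    // which creates one KafkaContinuousDataWriter per partition.
    val query = input
      .selectExpr("CAST(key AS STRING) AS key", "CAST(value AS STRING) AS value")
      .writeStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "broker:9092")                 // placeholder
      .option("topic", "out-topic")                                      // placeholder
      .option("checkpointLocation", "/tmp/kafka-continuous-checkpoint")  // placeholder
      .trigger(Trigger.Continuous(1000))  // same trigger interval the new tests use
      .start()
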
diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index 864a92b8f813f..169a5d006fb04 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -130,7 +130,7 @@ private[kafka010] class KafkaSource( val offsets = startingOffsets match { case EarliestOffsetRangeLimit => KafkaSourceOffset(kafkaReader.fetchEarliestOffsets()) case LatestOffsetRangeLimit => KafkaSourceOffset(kafkaReader.fetchLatestOffsets()) - case SpecificOffsetRangeLimit(p) => fetchAndVerify(p) + case SpecificOffsetRangeLimit(p) => kafkaReader.fetchSpecificOffsets(p, reportDataLoss) } metadataLog.add(0, offsets) logInfo(s"Initial offsets: $offsets") @@ -138,21 +138,6 @@ private[kafka010] class KafkaSource( }.partitionToOffsets } - private def fetchAndVerify(specificOffsets: Map[TopicPartition, Long]) = { - val result = kafkaReader.fetchSpecificOffsets(specificOffsets) - specificOffsets.foreach { - case (tp, off) if off != KafkaOffsetRangeLimit.LATEST && - off != KafkaOffsetRangeLimit.EARLIEST => - if (result(tp) != off) { - reportDataLoss( - s"startingOffsets for $tp was $off but consumer reset to ${result(tp)}") - } - case _ => - // no real way to check that beginning or end is reasonable - } - KafkaSourceOffset(result) - } - private var currentPartitionOffsets: Option[Map[TopicPartition, Long]] = None override def schema: StructType = KafkaOffsetReader.kafkaSchema diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala index b5da415b3097e..c82154cfbad7f 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala @@ -20,17 +20,22 @@ package org.apache.spark.sql.kafka010 import org.apache.kafka.common.TopicPartition import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset} +import org.apache.spark.sql.sources.v2.streaming.reader.{Offset => OffsetV2, PartitionOffset} /** * An [[Offset]] for the [[KafkaSource]]. This one tracks all partitions of subscribed topics and * their offsets. 
*/ private[kafka010] -case class KafkaSourceOffset(partitionToOffsets: Map[TopicPartition, Long]) extends Offset { +case class KafkaSourceOffset(partitionToOffsets: Map[TopicPartition, Long]) extends OffsetV2 { override val json = JsonUtils.partitionOffsets(partitionToOffsets) } +private[kafka010] +case class KafkaSourcePartitionOffset(topicPartition: TopicPartition, partitionOffset: Long) + extends PartitionOffset + /** Companion object of the [[KafkaSourceOffset]] */ private[kafka010] object KafkaSourceOffset { diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala index 3cb4d8cad12cc..3914370a96595 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.kafka010 import java.{util => ju} -import java.util.{Locale, UUID} +import java.util.{Locale, Optional, UUID} import scala.collection.JavaConverters._ @@ -27,9 +27,12 @@ import org.apache.kafka.clients.producer.ProducerConfig import org.apache.kafka.common.serialization.{ByteArrayDeserializer, ByteArraySerializer} import org.apache.spark.internal.Logging -import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} -import org.apache.spark.sql.execution.streaming.{Sink, Source} +import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SparkSession, SQLContext} +import org.apache.spark.sql.execution.streaming.{Offset, Sink, Source} import org.apache.spark.sql.sources._ +import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, ContinuousWriteSupport} +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType @@ -43,6 +46,8 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister with StreamSinkProvider with RelationProvider with CreatableRelationProvider + with ContinuousWriteSupport + with ContinuousReadSupport with Logging { import KafkaSourceProvider._ @@ -101,6 +106,43 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister failOnDataLoss(caseInsensitiveParams)) } + override def createContinuousReader( + schema: Optional[StructType], + metadataPath: String, + options: DataSourceV2Options): KafkaContinuousReader = { + val parameters = options.asMap().asScala.toMap + validateStreamOptions(parameters) + // Each running query should use its own group id. Otherwise, the query may be only assigned + // partial data since Kafka will assign partitions to multiple consumers having the same group + // id. Hence, we should generate a unique id for each query. 
+ val uniqueGroupId = s"spark-kafka-source-${UUID.randomUUID}-${metadataPath.hashCode}" + + val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } + val specifiedKafkaParams = + parameters + .keySet + .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) + .map { k => k.drop(6).toString -> parameters(k) } + .toMap + + val startingStreamOffsets = KafkaSourceProvider.getKafkaOffsetRangeLimit(caseInsensitiveParams, + STARTING_OFFSETS_OPTION_KEY, LatestOffsetRangeLimit) + + val kafkaOffsetReader = new KafkaOffsetReader( + strategy(caseInsensitiveParams), + kafkaParamsForDriver(specifiedKafkaParams), + parameters, + driverGroupIdPrefix = s"$uniqueGroupId-driver") + + new KafkaContinuousReader( + kafkaOffsetReader, + kafkaParamsForExecutors(specifiedKafkaParams, uniqueGroupId), + parameters, + metadataPath, + startingStreamOffsets, + failOnDataLoss(caseInsensitiveParams)) + } + /** * Returns a new base relation with the given parameters. * @@ -181,26 +223,22 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister } } - private def kafkaParamsForProducer(parameters: Map[String, String]): Map[String, String] = { - val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } - if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}")) { - throw new IllegalArgumentException( - s"Kafka option '${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}' is not supported as keys " - + "are serialized with ByteArraySerializer.") - } + override def createContinuousWriter( + queryId: String, + schema: StructType, + mode: OutputMode, + options: DataSourceV2Options): Optional[ContinuousWriter] = { + import scala.collection.JavaConverters._ - if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}")) - { - throw new IllegalArgumentException( - s"Kafka option '${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}' is not supported as " - + "value are serialized with ByteArraySerializer.") - } - parameters - .keySet - .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) - .map { k => k.drop(6).toString -> parameters(k) } - .toMap + (ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, - ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName) + val spark = SparkSession.getActiveSession.get + val topic = Option(options.get(TOPIC_OPTION_KEY).orElse(null)).map(_.trim) + // We convert the options argument from V2 -> Java map -> scala mutable -> scala immutable. 
+ val producerParams = kafkaParamsForProducer(options.asMap.asScala.toMap) + + KafkaWriter.validateQuery( + schema.toAttributes, new java.util.HashMap[String, Object](producerParams.asJava), topic) + + Optional.of(new KafkaContinuousWriter(topic, producerParams, schema)) } private def strategy(caseInsensitiveParams: Map[String, String]) = @@ -450,4 +488,27 @@ private[kafka010] object KafkaSourceProvider extends Logging { def build(): ju.Map[String, Object] = map } + + private[kafka010] def kafkaParamsForProducer( + parameters: Map[String, String]): Map[String, String] = { + val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } + if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}")) { + throw new IllegalArgumentException( + s"Kafka option '${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}' is not supported as keys " + + "are serialized with ByteArraySerializer.") + } + + if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}")) + { + throw new IllegalArgumentException( + s"Kafka option '${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}' is not supported as " + + "value are serialized with ByteArraySerializer.") + } + parameters + .keySet + .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) + .map { k => k.drop(6).toString -> parameters(k) } + .toMap + (ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, + ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName) + } } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala index 6fd333e2f43ba..baa60febf661d 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala @@ -33,10 +33,8 @@ import org.apache.spark.sql.types.{BinaryType, StringType} private[kafka010] class KafkaWriteTask( producerConfiguration: ju.Map[String, Object], inputSchema: Seq[Attribute], - topic: Option[String]) { + topic: Option[String]) extends KafkaRowWriter(inputSchema, topic) { // used to synchronize with Kafka callbacks - @volatile private var failedWrite: Exception = null - private val projection = createProjection private var producer: KafkaProducer[Array[Byte], Array[Byte]] = _ /** @@ -46,23 +44,7 @@ private[kafka010] class KafkaWriteTask( producer = CachedKafkaProducer.getOrCreate(producerConfiguration) while (iterator.hasNext && failedWrite == null) { val currentRow = iterator.next() - val projectedRow = projection(currentRow) - val topic = projectedRow.getUTF8String(0) - val key = projectedRow.getBinary(1) - val value = projectedRow.getBinary(2) - if (topic == null) { - throw new NullPointerException(s"null topic present in the data. 
Use the " + - s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a default topic.") - } - val record = new ProducerRecord[Array[Byte], Array[Byte]](topic.toString, key, value) - val callback = new Callback() { - override def onCompletion(recordMetadata: RecordMetadata, e: Exception): Unit = { - if (failedWrite == null && e != null) { - failedWrite = e - } - } - } - producer.send(record, callback) + sendRow(currentRow, producer) } } @@ -74,8 +56,49 @@ private[kafka010] class KafkaWriteTask( producer = null } } +} + +private[kafka010] abstract class KafkaRowWriter( + inputSchema: Seq[Attribute], topic: Option[String]) { + + // used to synchronize with Kafka callbacks + @volatile protected var failedWrite: Exception = _ + protected val projection = createProjection + + private val callback = new Callback() { + override def onCompletion(recordMetadata: RecordMetadata, e: Exception): Unit = { + if (failedWrite == null && e != null) { + failedWrite = e + } + } + } - private def createProjection: UnsafeProjection = { + /** + * Send the specified row to the producer, with a callback that will save any exception + * to failedWrite. Note that send is asynchronous; subclasses must flush() their producer before + * assuming the row is in Kafka. + */ + protected def sendRow( + row: InternalRow, producer: KafkaProducer[Array[Byte], Array[Byte]]): Unit = { + val projectedRow = projection(row) + val topic = projectedRow.getUTF8String(0) + val key = projectedRow.getBinary(1) + val value = projectedRow.getBinary(2) + if (topic == null) { + throw new NullPointerException(s"null topic present in the data. Use the " + + s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a default topic.") + } + val record = new ProducerRecord[Array[Byte], Array[Byte]](topic.toString, key, value) + producer.send(record, callback) + } + + protected def checkForErrors(): Unit = { + if (failedWrite != null) { + throw failedWrite + } + } + + private def createProjection = { val topicExpression = topic.map(Literal(_)).orElse { inputSchema.find(_.name == KafkaWriter.TOPIC_ATTRIBUTE_NAME) }.getOrElse { @@ -112,11 +135,5 @@ private[kafka010] class KafkaWriteTask( Seq(topicExpression, Cast(keyExpression, BinaryType), Cast(valueExpression, BinaryType)), inputSchema) } - - private def checkForErrors(): Unit = { - if (failedWrite != null) { - throw failedWrite - } - } } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala index 5e9ae35b3f008..15cd44812cb0c 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala @@ -43,10 +43,9 @@ private[kafka010] object KafkaWriter extends Logging { override def toString: String = "KafkaWriter" def validateQuery( - queryExecution: QueryExecution, + schema: Seq[Attribute], kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { - val schema = queryExecution.analyzed.output schema.find(_.name == TOPIC_ATTRIBUTE_NAME).getOrElse( if (topic.isEmpty) { throw new AnalysisException(s"topic option required when no " + @@ -84,7 +83,7 @@ private[kafka010] object KafkaWriter extends Logging { kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { val schema = queryExecution.analyzed.output - validateQuery(queryExecution, kafkaParameters, topic) + validateQuery(schema, 
kafkaParameters, topic) queryExecution.toRdd.foreachPartition { iter => val writeTask = new KafkaWriteTask(kafkaParameters, schema, topic) Utils.tryWithSafeFinally(block = writeTask.execute(iter))( diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala new file mode 100644 index 0000000000000..8487a69851237 --- /dev/null +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala @@ -0,0 +1,476 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import java.util.Locale +import java.util.concurrent.atomic.AtomicInteger + +import org.apache.kafka.clients.producer.ProducerConfig +import org.apache.kafka.common.serialization.ByteArraySerializer +import org.scalatest.time.SpanSugar._ +import scala.collection.JavaConverters._ + +import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SaveMode} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, SpecificInternalRow, UnsafeProjection} +import org.apache.spark.sql.execution.streaming.MemoryStream +import org.apache.spark.sql.streaming._ +import org.apache.spark.sql.types.{BinaryType, DataType} +import org.apache.spark.util.Utils + +/** + * This is a temporary port of KafkaSinkSuite, since we do not yet have a V2 memory stream. + * Once we have one, this will be changed to a specialization of KafkaSinkSuite and we won't have + * to duplicate all the code. 
+ */ +class KafkaContinuousSinkSuite extends KafkaContinuousTest { + import testImplicits._ + + override val streamingTimeout = 30.seconds + + override def beforeAll(): Unit = { + super.beforeAll() + testUtils = new KafkaTestUtils( + withBrokerProps = Map("auto.create.topics.enable" -> "false")) + testUtils.setup() + } + + override def afterAll(): Unit = { + if (testUtils != null) { + testUtils.teardown() + testUtils = null + } + super.afterAll() + } + + test("streaming - write to kafka with topic field") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + + val topic = newTopic() + testUtils.createTopic(topic) + + val writer = createKafkaWriter( + input.toDF(), + withTopic = None, + withOutputMode = Some(OutputMode.Append))( + withSelectExpr = s"'$topic' as topic", "value") + + val reader = createKafkaReader(topic) + .selectExpr("CAST(key as STRING) key", "CAST(value as STRING) value") + .selectExpr("CAST(key as INT) key", "CAST(value as INT) value") + .as[(Int, Int)] + .map(_._2) + + try { + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) + } + testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) + eventually(timeout(streamingTimeout)) { + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + } + } finally { + writer.stop() + } + } + + test("streaming - write w/o topic field, with topic option") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + + val topic = newTopic() + testUtils.createTopic(topic) + + val writer = createKafkaWriter( + input.toDF(), + withTopic = Some(topic), + withOutputMode = Some(OutputMode.Append()))() + + val reader = createKafkaReader(topic) + .selectExpr("CAST(key as STRING) key", "CAST(value as STRING) value") + .selectExpr("CAST(key as INT) key", "CAST(value as INT) value") + .as[(Int, Int)] + .map(_._2) + + try { + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) + } + testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) + eventually(timeout(streamingTimeout)) { + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + } + } finally { + writer.stop() + } + } + + test("streaming - topic field and topic option") { + /* The purpose of this test is to ensure that the topic option + * overrides the topic field. We begin by writing some data that + * includes a topic field and value (e.g., 'foo') along with a topic + * option. 
Then when we read from the topic specified in the option + * we should see the data i.e., the data was written to the topic + * option, and not to the topic in the data e.g., foo + */ + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + + val topic = newTopic() + testUtils.createTopic(topic) + + val writer = createKafkaWriter( + input.toDF(), + withTopic = Some(topic), + withOutputMode = Some(OutputMode.Append()))( + withSelectExpr = "'foo' as topic", "CAST(value as STRING) value") + + val reader = createKafkaReader(topic) + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .selectExpr("CAST(key AS INT)", "CAST(value AS INT)") + .as[(Int, Int)] + .map(_._2) + + try { + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) + } + testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) + eventually(timeout(streamingTimeout)) { + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + } + } finally { + writer.stop() + } + } + + test("null topic attribute") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + val topic = newTopic() + testUtils.createTopic(topic) + + /* No topic field or topic option */ + var writer: StreamingQuery = null + var ex: Exception = null + try { + writer = createKafkaWriter(input.toDF())( + withSelectExpr = "CAST(null as STRING) as topic", "value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + ex = writer.exception.get + } + } finally { + writer.stop() + } + assert(ex.getCause.getCause.getMessage + .toLowerCase(Locale.ROOT) + .contains("null topic present in the data.")) + } + + test("streaming - write data with bad schema") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + val topic = newTopic() + testUtils.createTopic(topic) + + /* No topic field or topic option */ + var writer: StreamingQuery = null + var ex: Exception = null + try { + writer = createKafkaWriter(input.toDF())( + withSelectExpr = "value as key", "value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + ex = writer.exception.get + } + } finally { + writer.stop() + } + assert(ex.getMessage + .toLowerCase(Locale.ROOT) + .contains("topic option required when no 'topic' attribute is present")) + + try { + /* No value field */ + writer = createKafkaWriter(input.toDF())( + withSelectExpr = s"'$topic' as topic", "value as key" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + ex = writer.exception.get + } + } finally { + writer.stop() + } + 
assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "required attribute 'value' not found")) + } + + test("streaming - write data with valid schema but wrong types") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + .selectExpr("CAST(value as STRING) value") + val topic = newTopic() + testUtils.createTopic(topic) + + var writer: StreamingQuery = null + var ex: Exception = null + try { + /* topic field wrong type */ + writer = createKafkaWriter(input.toDF())( + withSelectExpr = s"CAST('1' as INT) as topic", "value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + ex = writer.exception.get + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains("topic type must be a string")) + + try { + /* value field wrong type */ + writer = createKafkaWriter(input.toDF())( + withSelectExpr = s"'$topic' as topic", "CAST(value as INT) as value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + ex = writer.exception.get + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "value attribute type must be a string or binarytype")) + + try { + /* key field wrong type */ + writer = createKafkaWriter(input.toDF())( + withSelectExpr = s"'$topic' as topic", "CAST(value as INT) as key", "value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + ex = writer.exception.get + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "key attribute type must be a string or binarytype")) + } + + test("streaming - write to non-existing topic") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + val topic = newTopic() + + var writer: StreamingQuery = null + var ex: Exception = null + try { + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter(input.toDF(), withTopic = Some(topic))() + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + } + throw writer.exception.get + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains("job aborted")) + } + + test("streaming - exception on config serializer") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + testUtils.sendMessages(inputTopic, Array("0")) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .load() + var writer: StreamingQuery = null + var ex: Exception = null + try { + writer = createKafkaWriter( + input.toDF(), + withOptions = Map("kafka.key.serializer" -> "foo"))() + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + ex = writer.exception.get + } + 
assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "kafka option 'key.serializer' is not supported")) + } finally { + writer.stop() + } + + try { + writer = createKafkaWriter( + input.toDF(), + withOptions = Map("kafka.value.serializer" -> "foo"))() + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + ex = writer.exception.get + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "kafka option 'value.serializer' is not supported")) + } finally { + writer.stop() + } + } + + test("generic - write big data with small producer buffer") { + /* This test ensures that we understand the semantics of Kafka when + * is comes to blocking on a call to send when the send buffer is full. + * This test will configure the smallest possible producer buffer and + * indicate that we should block when it is full. Thus, no exception should + * be thrown in the case of a full buffer. + */ + val topic = newTopic() + testUtils.createTopic(topic, 1) + val options = new java.util.HashMap[String, String] + options.put("bootstrap.servers", testUtils.brokerAddress) + options.put("buffer.memory", "16384") // min buffer size + options.put("block.on.buffer.full", "true") + options.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[ByteArraySerializer].getName) + options.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[ByteArraySerializer].getName) + val inputSchema = Seq(AttributeReference("value", BinaryType)()) + val data = new Array[Byte](15000) // large value + val writeTask = new KafkaContinuousDataWriter(Some(topic), options.asScala.toMap, inputSchema) + try { + val fieldTypes: Array[DataType] = Array(BinaryType) + val converter = UnsafeProjection.create(fieldTypes) + val row = new SpecificInternalRow(fieldTypes) + row.update(0, data) + val iter = Seq.fill(1000)(converter.apply(row)).iterator + iter.foreach(writeTask.write(_)) + writeTask.commit() + } finally { + writeTask.close() + } + } + + private def createKafkaReader(topic: String): DataFrame = { + spark.read + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("startingOffsets", "earliest") + .option("endingOffsets", "latest") + .option("subscribe", topic) + .load() + } + + private def createKafkaWriter( + input: DataFrame, + withTopic: Option[String] = None, + withOutputMode: Option[OutputMode] = None, + withOptions: Map[String, String] = Map[String, String]()) + (withSelectExpr: String*): StreamingQuery = { + var stream: DataStreamWriter[Row] = null + val checkpointDir = Utils.createTempDir() + var df = input.toDF() + if (withSelectExpr.length > 0) { + df = df.selectExpr(withSelectExpr: _*) + } + stream = df.writeStream + .format("kafka") + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + // We need to reduce blocking time to efficiently test non-existent partition behavior. 
+ .option("kafka.max.block.ms", "1000") + .trigger(Trigger.Continuous(1000)) + .queryName("kafkaStream") + withTopic.foreach(stream.option("topic", _)) + withOutputMode.foreach(stream.outputMode(_)) + withOptions.foreach(opt => stream.option(opt._1, opt._2)) + stream.start() + } +} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala new file mode 100644 index 0000000000000..b3dade414f625 --- /dev/null +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import java.util.Properties +import java.util.concurrent.atomic.AtomicInteger + +import org.scalatest.time.SpanSugar._ +import scala.collection.mutable +import scala.util.Random + +import org.apache.spark.SparkContext +import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row} +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.execution.streaming.StreamExecution +import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution +import org.apache.spark.sql.streaming.{StreamTest, Trigger} +import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession} + +// Run tests in KafkaSourceSuiteBase in continuous execution mode. 
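+// The mixed-in KafkaContinuousTest trait overrides the default trigger to Trigger.Continuous and
+// enables the V2 sink path (defaultUseV2Sink), so the shared tests in KafkaSourceSuiteBase run
+// against the continuous reader without further changes.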
+class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest + +class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest { + import testImplicits._ + + override val brokerProps = Map("auto.create.topics.enable" -> "false") + + test("subscribing topic by pattern with topic deletions") { + val topicPrefix = newTopic() + val topic = topicPrefix + "-seems" + val topic2 = topicPrefix + "-bad" + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("subscribePattern", s"$topicPrefix-.*") + .option("failOnDataLoss", "false") + + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped = kafka.map(kv => kv._2.toInt + 1) + + testStream(mapped)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + Execute { query => + testUtils.deleteTopic(topic) + testUtils.createTopic(topic2, partitions = 5) + eventually(timeout(streamingTimeout)) { + assert( + query.lastExecution.logical.collectFirst { + case DataSourceV2Relation(_, r: KafkaContinuousReader) => r + }.exists { r => + // Ensure the new topic is present and the old topic is gone. + r.knownPartitions.exists(_.topic == topic2) + }, + s"query never reconfigured to new topic $topic2") + } + }, + AddKafkaData(Set(topic2), 4, 5, 6), + CheckAnswer(2, 3, 4, 5, 6, 7) + ) + } +} + +class KafkaContinuousSourceStressForDontFailOnDataLossSuite + extends KafkaSourceStressForDontFailOnDataLossSuite { + override protected def startStream(ds: Dataset[Int]) = { + ds.writeStream + .format("memory") + .queryName("memory") + .start() + } +} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala new file mode 100644 index 0000000000000..5a1a14f7a307a --- /dev/null +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.kafka010 + +import java.util.concurrent.atomic.AtomicInteger + +import org.apache.spark.SparkContext +import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd, SparkListenerTaskStart} +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.execution.streaming.StreamExecution +import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution +import org.apache.spark.sql.streaming.Trigger +import org.apache.spark.sql.test.TestSparkSession + +// Trait to configure StreamTest for kafka continuous execution tests. +trait KafkaContinuousTest extends KafkaSourceTest { + override val defaultTrigger = Trigger.Continuous(1000) + override val defaultUseV2Sink = true + + // We need more than the default local[2] to be able to schedule all partitions simultaneously. + override protected def createSparkSession = new TestSparkSession( + new SparkContext( + "local[10]", + "continuous-stream-test-sql-context", + sparkConf.set("spark.sql.testkey", "true"))) + + // In addition to setting the partitions in Kafka, we have to wait until the query has + // reconfigured to the new count so the test framework can hook in properly. + override protected def setTopicPartitions( + topic: String, newCount: Int, query: StreamExecution) = { + testUtils.addPartitions(topic, newCount) + eventually(timeout(streamingTimeout)) { + assert( + query.lastExecution.logical.collectFirst { + case DataSourceV2Relation(_, r: KafkaContinuousReader) => r + }.exists(_.knownPartitions.size == newCount), + s"query never reconfigured to $newCount partitions") + } + } + + // Continuous processing tasks end asynchronously, so test that they actually end. + private val tasksEndedListener = new SparkListener() { + val activeTaskIdCount = new AtomicInteger(0) + + override def onTaskStart(start: SparkListenerTaskStart): Unit = { + activeTaskIdCount.incrementAndGet() + } + + override def onTaskEnd(end: SparkListenerTaskEnd): Unit = { + activeTaskIdCount.decrementAndGet() + } + } + + override def beforeEach(): Unit = { + super.beforeEach() + spark.sparkContext.addSparkListener(tasksEndedListener) + } + + override def afterEach(): Unit = { + eventually(timeout(streamingTimeout)) { + assert(tasksEndedListener.activeTaskIdCount.get() == 0) + } + spark.sparkContext.removeSparkListener(tasksEndedListener) + super.afterEach() + } + + + test("ensure continuous stream is being used") { + val query = spark.readStream + .format("rate") + .option("numPartitions", "1") + .option("rowsPerSecond", "1") + .load() + + testStream(query)( + Execute(q => assert(q.isInstanceOf[ContinuousExecution])) + ) + } +} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index a0f5695fc485c..1acff61e11d2a 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -34,11 +34,14 @@ import org.scalatest.concurrent.PatienceConfiguration.Timeout import org.scalatest.time.SpanSugar._ import org.apache.spark.SparkContext -import org.apache.spark.sql.ForeachWriter +import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row} +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, WriteToDataSourceV2Exec} import org.apache.spark.sql.execution.streaming._ 
+import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution +import org.apache.spark.sql.execution.streaming.sources.ContinuousMemoryWriter import org.apache.spark.sql.functions.{count, window} import org.apache.spark.sql.kafka010.KafkaSourceProvider._ -import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest} +import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest, Trigger} import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession} import org.apache.spark.util.Utils @@ -49,9 +52,11 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { override val streamingTimeout = 30.seconds + protected val brokerProps = Map[String, Object]() + override def beforeAll(): Unit = { super.beforeAll() - testUtils = new KafkaTestUtils + testUtils = new KafkaTestUtils(brokerProps) testUtils.setup() } @@ -59,18 +64,25 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { if (testUtils != null) { testUtils.teardown() testUtils = null - super.afterAll() } + super.afterAll() } protected def makeSureGetOffsetCalled = AssertOnQuery { q => // Because KafkaSource's initialPartitionOffsets is set lazily, we need to make sure - // its "getOffset" is called before pushing any data. Otherwise, because of the race contion, + // its "getOffset" is called before pushing any data. Otherwise, because of the race condition, // we don't know which data should be fetched when `startingOffsets` is latest. - q.processAllAvailable() + q match { + case c: ContinuousExecution => c.awaitEpoch(0) + case m: MicroBatchExecution => m.processAllAvailable() + } true } + protected def setTopicPartitions(topic: String, newCount: Int, query: StreamExecution) : Unit = { + testUtils.addPartitions(topic, newCount) + } + /** * Add data to Kafka. * @@ -82,10 +94,11 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { message: String = "", topicAction: (String, Option[Int]) => Unit = (_, _) => {}) extends AddData { - override def addData(query: Option[StreamExecution]): (Source, Offset) = { - if (query.get.isActive) { + override def addData(query: Option[StreamExecution]): (BaseStreamingSource, Offset) = { + query match { // Make sure no Spark job is running when deleting a topic - query.get.processAllAvailable() + case Some(m: MicroBatchExecution) => m.processAllAvailable() + case _ => } val existingTopics = testUtils.getAllTopicsAndPartitionSize().toMap @@ -97,16 +110,18 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { topicAction(existingTopicPartitions._1, Some(existingTopicPartitions._2)) } - // Read all topics again in case some topics are delete. 
- val allTopics = testUtils.getAllTopicsAndPartitionSize().toMap.keys require( query.nonEmpty, "Cannot add data when there is no query for finding the active kafka source") val sources = query.get.logicalPlan.collect { - case StreamingExecutionRelation(source, _) if source.isInstanceOf[KafkaSource] => - source.asInstanceOf[KafkaSource] - } + case StreamingExecutionRelation(source: KafkaSource, _) => source + } ++ (query.get.lastExecution match { + case null => Seq() + case e => e.logical.collect { + case DataSourceV2Relation(_, reader: KafkaContinuousReader) => reader + } + }) if (sources.isEmpty) { throw new Exception( "Could not find Kafka source in the StreamExecution logical plan to add data to") @@ -137,14 +152,158 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { override def toString: String = s"AddKafkaData(topics = $topics, data = $data, message = $message)" } -} + private val topicId = new AtomicInteger(0) + protected def newTopic(): String = s"topic-${topicId.getAndIncrement()}" +} -class KafkaSourceSuite extends KafkaSourceTest { +class KafkaMicroBatchSourceSuite extends KafkaSourceSuiteBase { import testImplicits._ - private val topicId = new AtomicInteger(0) + test("(de)serialization of initial offsets") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 5) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", topic) + + testStream(reader.load)( + makeSureGetOffsetCalled, + StopStream, + StartStream(), + StopStream) + } + + test("maxOffsetsPerTrigger") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 3) + testUtils.sendMessages(topic, (100 to 200).map(_.toString).toArray, Some(0)) + testUtils.sendMessages(topic, (10 to 20).map(_.toString).toArray, Some(1)) + testUtils.sendMessages(topic, Array("1"), Some(2)) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("maxOffsetsPerTrigger", 10) + .option("subscribe", topic) + .option("startingOffsets", "earliest") + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped: org.apache.spark.sql.Dataset[_] = kafka.map(kv => kv._2.toInt) + + val clock = new StreamManualClock + + val waitUntilBatchProcessed = AssertOnQuery { q => + eventually(Timeout(streamingTimeout)) { + if (!q.exception.isDefined) { + assert(clock.isStreamWaitingAt(clock.getTimeMillis())) + } + } + if (q.exception.isDefined) { + throw q.exception.get + } + true + } + + testStream(mapped)( + StartStream(ProcessingTime(100), clock), + waitUntilBatchProcessed, + // 1 from smallest, 1 from middle, 8 from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116 + ), + StopStream, + StartStream(ProcessingTime(100), clock), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 12, 117, 118, 119, 120, 121, 122, 123, 124, 125 + ), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + 
CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 12, 117, 118, 119, 120, 121, 122, 123, 124, 125, + 13, 126, 127, 128, 129, 130, 131, 132, 133, 134 + ) + ) + } + + test("input row metrics") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val kafka = spark + .readStream + .format("kafka") + .option("subscribe", topic) + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + + val mapped = kafka.map(kv => kv._2.toInt + 1) + testStream(mapped)( + StartStream(trigger = ProcessingTime(1)), + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + AssertOnQuery { query => + val recordsRead = query.recentProgress.map(_.numInputRows).sum + recordsRead == 3 + } + ) + } + + test("subscribing topic by pattern with topic deletions") { + val topicPrefix = newTopic() + val topic = topicPrefix + "-seems" + val topic2 = topicPrefix + "-bad" + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("subscribePattern", s"$topicPrefix-.*") + .option("failOnDataLoss", "false") + + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped = kafka.map(kv => kv._2.toInt + 1) + + testStream(mapped)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + Assert { + testUtils.deleteTopic(topic) + testUtils.createTopic(topic2, partitions = 5) + true + }, + AddKafkaData(Set(topic2), 4, 5, 6), + CheckAnswer(2, 3, 4, 5, 6, 7) + ) + } testWithUninterruptibleThread( "deserialization of initial offset with Spark 2.1.0") { @@ -237,86 +396,94 @@ class KafkaSourceSuite extends KafkaSourceTest { } } - test("(de)serialization of initial offsets") { + test("KafkaSource with watermark") { + val now = System.currentTimeMillis() val topic = newTopic() - testUtils.createTopic(topic, partitions = 64) + testUtils.createTopic(newTopic(), partitions = 1) + testUtils.sendMessages(topic, Array(1).map(_.toString)) - val reader = spark + val kafka = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("startingOffsets", s"earliest") .option("subscribe", topic) + .load() - testStream(reader.load)( - makeSureGetOffsetCalled, - StopStream, - StartStream(), - StopStream) + val windowedAggregation = kafka + .withWatermark("timestamp", "10 seconds") + .groupBy(window($"timestamp", "5 seconds") as 'window) + .agg(count("*") as 'count) + .select($"window".getField("start") as 'window, $"count") + + val query = windowedAggregation + .writeStream + .format("memory") + .outputMode("complete") + .queryName("kafkaWatermark") + .start() + query.processAllAvailable() + val rows = spark.table("kafkaWatermark").collect() + assert(rows.length === 1, s"Unexpected results: ${rows.toList}") + val row = rows(0) + // We cannot check the exact window start time as it depands on the time that messages were + // inserted by the producer. 
So here we just use a low bound to make sure the internal + // conversion works. + assert( + row.getAs[java.sql.Timestamp]("window").getTime >= now - 5 * 1000, + s"Unexpected results: $row") + assert(row.getAs[Int]("count") === 1, s"Unexpected results: $row") + query.stop() } - test("maxOffsetsPerTrigger") { + test("delete a topic when a Spark job is running") { + KafkaSourceSuite.collectedData.clear() + val topic = newTopic() - testUtils.createTopic(topic, partitions = 3) - testUtils.sendMessages(topic, (100 to 200).map(_.toString).toArray, Some(0)) - testUtils.sendMessages(topic, (10 to 20).map(_.toString).toArray, Some(1)) - testUtils.sendMessages(topic, Array("1"), Some(2)) + testUtils.createTopic(topic, partitions = 1) + testUtils.sendMessages(topic, (1 to 10).map(_.toString).toArray) val reader = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", testUtils.brokerAddress) .option("kafka.metadata.max.age.ms", "1") - .option("maxOffsetsPerTrigger", 10) .option("subscribe", topic) + // If a topic is deleted and we try to poll data starting from offset 0, + // the Kafka consumer will just block until timeout and return an empty result. + // So set the timeout to 1 second to make this test fast. + .option("kafkaConsumer.pollTimeoutMs", "1000") .option("startingOffsets", "earliest") + .option("failOnDataLoss", "false") val kafka = reader.load() .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") .as[(String, String)] - val mapped: org.apache.spark.sql.Dataset[_] = kafka.map(kv => kv._2.toInt) - - val clock = new StreamManualClock - - val waitUntilBatchProcessed = AssertOnQuery { q => - eventually(Timeout(streamingTimeout)) { - if (!q.exception.isDefined) { - assert(clock.isStreamWaitingAt(clock.getTimeMillis())) - } + KafkaSourceSuite.globalTestUtils = testUtils + // The following ForeachWriter will delete the topic before fetching data from Kafka + // in executors. 
+ val query = kafka.map(kv => kv._2.toInt).writeStream.foreach(new ForeachWriter[Int] { + override def open(partitionId: Long, version: Long): Boolean = { + KafkaSourceSuite.globalTestUtils.deleteTopic(topic) + true } - if (q.exception.isDefined) { - throw q.exception.get + + override def process(value: Int): Unit = { + KafkaSourceSuite.collectedData.add(value) } - true - } - testStream(mapped)( - StartStream(ProcessingTime(100), clock), - waitUntilBatchProcessed, - // 1 from smallest, 1 from middle, 8 from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107), - AdvanceManualClock(100), - waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116 - ), - StopStream, - StartStream(ProcessingTime(100), clock), - waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, - 12, 117, 118, 119, 120, 121, 122, 123, 124, 125 - ), - AdvanceManualClock(100), - waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, - 12, 117, 118, 119, 120, 121, 122, 123, 124, 125, - 13, 126, 127, 128, 129, 130, 131, 132, 133, 134 - ) - ) + override def close(errorOrNull: Throwable): Unit = {} + }).start() + query.processAllAvailable() + query.stop() + // `failOnDataLoss` is `false`, we should not fail the query + assert(query.exception.isEmpty) } +} + +class KafkaSourceSuiteBase extends KafkaSourceTest { + + import testImplicits._ test("SPARK-22956: currentPartitionOffsets should be set when no new data comes in") { def getSpecificDF(range: Range.Inclusive): org.apache.spark.sql.Dataset[Int] = { @@ -393,7 +560,7 @@ class KafkaSourceSuite extends KafkaSourceTest { .format("kafka") .option("kafka.bootstrap.servers", testUtils.brokerAddress) .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", s"topic-.*") + .option("subscribePattern", s"$topic.*") val kafka = reader.load() .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") @@ -487,65 +654,6 @@ class KafkaSourceSuite extends KafkaSourceTest { } } - test("subscribing topic by pattern with topic deletions") { - val topicPrefix = newTopic() - val topic = topicPrefix + "-seems" - val topic2 = topicPrefix + "-bad" - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, Array("-1")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", s"$topicPrefix-.*") - .option("failOnDataLoss", "false") - - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - val mapped = kafka.map(kv => kv._2.toInt + 1) - - testStream(mapped)( - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(2, 3, 4), - Assert { - testUtils.deleteTopic(topic) - testUtils.createTopic(topic2, partitions = 5) - true - }, - AddKafkaData(Set(topic2), 4, 5, 6), - CheckAnswer(2, 3, 4, 5, 6, 7) - ) - } - - test("starting offset is latest by default") { - val topic = newTopic() - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, 
Array("0")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", topic) - - val kafka = reader.load() - .selectExpr("CAST(value AS STRING)") - .as[String] - val mapped = kafka.map(_.toInt) - - testStream(mapped)( - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(1, 2, 3) // should not have 0 - ) - } - test("bad source options") { def testBadOptions(options: (String, String)*)(expectedMsgs: String*): Unit = { val ex = intercept[IllegalArgumentException] { @@ -605,77 +713,6 @@ class KafkaSourceSuite extends KafkaSourceTest { testUnsupportedConfig("kafka.auto.offset.reset", "latest") } - test("input row metrics") { - val topic = newTopic() - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, Array("-1")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - - val kafka = spark - .readStream - .format("kafka") - .option("subscribe", topic) - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - - val mapped = kafka.map(kv => kv._2.toInt + 1) - testStream(mapped)( - StartStream(trigger = ProcessingTime(1)), - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(2, 3, 4), - AssertOnQuery { query => - val recordsRead = query.recentProgress.map(_.numInputRows).sum - recordsRead == 3 - } - ) - } - - test("delete a topic when a Spark job is running") { - KafkaSourceSuite.collectedData.clear() - - val topic = newTopic() - testUtils.createTopic(topic, partitions = 1) - testUtils.sendMessages(topic, (1 to 10).map(_.toString).toArray) - - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("subscribe", topic) - // If a topic is deleted and we try to poll data starting from offset 0, - // the Kafka consumer will just block until timeout and return an empty result. - // So set the timeout to 1 second to make this test fast. - .option("kafkaConsumer.pollTimeoutMs", "1000") - .option("startingOffsets", "earliest") - .option("failOnDataLoss", "false") - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - KafkaSourceSuite.globalTestUtils = testUtils - // The following ForeachWriter will delete the topic before fetching data from Kafka - // in executors. 
- val query = kafka.map(kv => kv._2.toInt).writeStream.foreach(new ForeachWriter[Int] { - override def open(partitionId: Long, version: Long): Boolean = { - KafkaSourceSuite.globalTestUtils.deleteTopic(topic) - true - } - - override def process(value: Int): Unit = { - KafkaSourceSuite.collectedData.add(value) - } - - override def close(errorOrNull: Throwable): Unit = {} - }).start() - query.processAllAvailable() - query.stop() - // `failOnDataLoss` is `false`, we should not fail the query - assert(query.exception.isEmpty) - } - test("get offsets from case insensitive parameters") { for ((optionKey, optionValue, answer) <- Seq( (STARTING_OFFSETS_OPTION_KEY, "earLiEst", EarliestOffsetRangeLimit), @@ -694,8 +731,6 @@ class KafkaSourceSuite extends KafkaSourceTest { } } - private def newTopic(): String = s"topic-${topicId.getAndIncrement()}" - private def assignString(topic: String, partitions: Iterable[Int]): String = { JsonUtils.partitions(partitions.map(p => new TopicPartition(topic, p))) } @@ -741,6 +776,10 @@ class KafkaSourceSuite extends KafkaSourceTest { testStream(mapped)( makeSureGetOffsetCalled, + Execute { q => + // wait to reach the last offset in every partition + q.awaitOffset(0, KafkaSourceOffset(partitionOffsets.mapValues(_ => 3L))) + }, CheckAnswer(-20, -21, -22, 0, 1, 2, 11, 12, 22), StopStream, StartStream(), @@ -771,10 +810,13 @@ class KafkaSourceSuite extends KafkaSourceTest { .format("memory") .outputMode("append") .queryName("kafkaColumnTypes") + .trigger(defaultTrigger) .start() - query.processAllAvailable() - val rows = spark.table("kafkaColumnTypes").collect() - assert(rows.length === 1, s"Unexpected results: ${rows.toList}") + var rows: Array[Row] = Array() + eventually(timeout(streamingTimeout)) { + rows = spark.table("kafkaColumnTypes").collect() + assert(rows.length === 1, s"Unexpected results: ${rows.toList}") + } val row = rows(0) assert(row.getAs[Array[Byte]]("key") === null, s"Unexpected results: $row") assert(row.getAs[Array[Byte]]("value") === "1".getBytes(UTF_8), s"Unexpected results: $row") @@ -788,47 +830,6 @@ class KafkaSourceSuite extends KafkaSourceTest { query.stop() } - test("KafkaSource with watermark") { - val now = System.currentTimeMillis() - val topic = newTopic() - testUtils.createTopic(newTopic(), partitions = 1) - testUtils.sendMessages(topic, Array(1).map(_.toString)) - - val kafka = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("startingOffsets", s"earliest") - .option("subscribe", topic) - .load() - - val windowedAggregation = kafka - .withWatermark("timestamp", "10 seconds") - .groupBy(window($"timestamp", "5 seconds") as 'window) - .agg(count("*") as 'count) - .select($"window".getField("start") as 'window, $"count") - - val query = windowedAggregation - .writeStream - .format("memory") - .outputMode("complete") - .queryName("kafkaWatermark") - .start() - query.processAllAvailable() - val rows = spark.table("kafkaWatermark").collect() - assert(rows.length === 1, s"Unexpected results: ${rows.toList}") - val row = rows(0) - // We cannot check the exact window start time as it depands on the time that messages were - // inserted by the producer. So here we just use a low bound to make sure the internal - // conversion works. 
- assert( - row.getAs[java.sql.Timestamp]("window").getTime >= now - 5 * 1000, - s"Unexpected results: $row") - assert(row.getAs[Int]("count") === 1, s"Unexpected results: $row") - query.stop() - } - private def testFromLatestOffsets( topic: String, addPartitions: Boolean, @@ -865,9 +866,7 @@ class KafkaSourceSuite extends KafkaSourceTest { AddKafkaData(Set(topic), 7, 8), CheckAnswer(2, 3, 4, 5, 6, 7, 8, 9), AssertOnQuery("Add partitions") { query: StreamExecution => - if (addPartitions) { - testUtils.addPartitions(topic, 10) - } + if (addPartitions) setTopicPartitions(topic, 10, query) true }, AddKafkaData(Set(topic), 9, 10, 11, 12, 13, 14, 15, 16), @@ -908,9 +907,7 @@ class KafkaSourceSuite extends KafkaSourceTest { StartStream(), CheckAnswer(2, 3, 4, 5, 6, 7, 8, 9), AssertOnQuery("Add partitions") { query: StreamExecution => - if (addPartitions) { - testUtils.addPartitions(topic, 10) - } + if (addPartitions) setTopicPartitions(topic, 10, query) true }, AddKafkaData(Set(topic), 9, 10, 11, 12, 13, 14, 15, 16), @@ -1042,20 +1039,8 @@ class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with Shared } } - test("stress test for failOnDataLoss=false") { - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", "failOnDataLoss.*") - .option("startingOffsets", "earliest") - .option("failOnDataLoss", "false") - .option("fetchOffset.retryIntervalMs", "3000") - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - val query = kafka.map(kv => kv._2.toInt).writeStream.foreach(new ForeachWriter[Int] { + protected def startStream(ds: Dataset[Int]) = { + ds.writeStream.foreach(new ForeachWriter[Int] { override def open(partitionId: Long, version: Long): Boolean = { true @@ -1069,6 +1054,22 @@ class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with Shared override def close(errorOrNull: Throwable): Unit = { } }).start() + } + + test("stress test for failOnDataLoss=false") { + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("subscribePattern", "failOnDataLoss.*") + .option("startingOffsets", "earliest") + .option("failOnDataLoss", "false") + .option("fetchOffset.retryIntervalMs", "3000") + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val query = startStream(kafka.map(kv => kv._2.toInt)) val testTime = 1.minutes val startTime = System.currentTimeMillis() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index e8d683a578f35..b714a46b5f786 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -191,6 +191,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { ds = ds.asInstanceOf[DataSourceV2], conf = sparkSession.sessionState.conf)).asJava) + // Streaming also uses the data source V2 API. So it may be that the data source implements + // v2, but has no v2 implementation for batch reads. In that case, we fall back to loading + // the dataframe as a v1 source. 
val reader = (ds, userSpecifiedSchema) match { case (ds: ReadSupportWithSchema, Some(schema)) => ds.createReader(schema, options) @@ -208,23 +211,30 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } reader - case _ => - throw new AnalysisException(s"$cls does not support data reading.") + case _ => null // fall back to v1 } - Dataset.ofRows(sparkSession, DataSourceV2Relation(reader)) + if (reader == null) { + loadV1Source(paths: _*) + } else { + Dataset.ofRows(sparkSession, DataSourceV2Relation(reader)) + } } else { - // Code path for data source v1. - sparkSession.baseRelationToDataFrame( - DataSource.apply( - sparkSession, - paths = paths, - userSpecifiedSchema = userSpecifiedSchema, - className = source, - options = extraOptions.toMap).resolveRelation()) + loadV1Source(paths: _*) } } + private def loadV1Source(paths: String*) = { + // Code path for data source v1. + sparkSession.baseRelationToDataFrame( + DataSource.apply( + sparkSession, + paths = paths, + userSpecifiedSchema = userSpecifiedSchema, + className = source, + options = extraOptions.toMap).resolveRelation()) + } + /** * Construct a `DataFrame` representing the database table accessible via JDBC URL * url named table and connection properties. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 3304f368e1050..97f12ff625c42 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -255,17 +255,24 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } } - case _ => throw new AnalysisException(s"$cls does not support data writing.") + // Streaming also uses the data source V2 API. So it may be that the data source implements + // v2, but has no v2 implementation for batch writes. In that case, we fall back to saving + // as though it's a V1 source. + case _ => saveToV1Source() } } else { - // Code path for data source v1. - runCommand(df.sparkSession, "save") { - DataSource( - sparkSession = df.sparkSession, - className = source, - partitionColumns = partitioningColumns.getOrElse(Nil), - options = extraOptions.toMap).planForWriting(mode, AnalysisBarrier(df.logicalPlan)) - } + saveToV1Source() + } + } + + private def saveToV1Source(): Unit = { + // Code path for data source v1. 
+ runCommand(df.sparkSession, "save") { + DataSource( + sparkSession = df.sparkSession, + className = source, + partitionColumns = partitioningColumns.getOrElse(Nil), + options = extraOptions.toMap).planForWriting(mode, AnalysisBarrier(df.logicalPlan)) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala index f0bdf84bb7a84..a4a857f2d4d9b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala @@ -81,9 +81,11 @@ case class WriteToDataSourceV2Exec(writer: DataSourceV2Writer, query: SparkPlan) (index, message: WriterCommitMessage) => messages(index) = message ) - logInfo(s"Data source writer $writer is committing.") - writer.commit(messages) - logInfo(s"Data source writer $writer committed.") + if (!writer.isInstanceOf[ContinuousWriter]) { + logInfo(s"Data source writer $writer is committing.") + writer.commit(messages) + logInfo(s"Data source writer $writer committed.") + } } catch { case _: InterruptedException if writer.isInstanceOf[ContinuousWriter] => // Interruption is how continuous queries are ended, so accept and ignore the exception. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 24a8b000df0c1..cf27e1a70650a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -142,7 +142,8 @@ abstract class StreamExecution( override val id: UUID = UUID.fromString(streamMetadata.id) - override val runId: UUID = UUID.randomUUID + override def runId: UUID = currentRunId + protected var currentRunId = UUID.randomUUID /** * Pretty identified string of printing in logs. Format is @@ -418,11 +419,17 @@ abstract class StreamExecution( * Blocks the current thread until processing for data from the given `source` has reached at * least the given `Offset`. This method is intended for use primarily when writing tests. */ - private[sql] def awaitOffset(source: BaseStreamingSource, newOffset: Offset): Unit = { + private[sql] def awaitOffset(sourceIndex: Int, newOffset: Offset): Unit = { assertAwaitThread() def notDone = { val localCommittedOffsets = committedOffsets - !localCommittedOffsets.contains(source) || localCommittedOffsets(source) != newOffset + if (sources == null) { + // sources might not be initialized yet + false + } else { + val source = sources(sourceIndex) + !localCommittedOffsets.contains(source) || localCommittedOffsets(source) != newOffset + } } while (notDone) { @@ -436,7 +443,7 @@ abstract class StreamExecution( awaitProgressLock.unlock() } } - logDebug(s"Unblocked at $newOffset for $source") + logDebug(s"Unblocked at $newOffset for ${sources(sourceIndex)}") } /** A flag to indicate that a batch has completed with no new data available. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala index b3f1a1a1aaab3..66eb42d4658f6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala @@ -77,7 +77,6 @@ class ContinuousDataSourceRDD( dataReaderThread.start() context.addTaskCompletionListener(_ => { - reader.close() dataReaderThread.interrupt() epochPollExecutor.shutdown() }) @@ -177,6 +176,7 @@ class DataReaderThread( private[continuous] var failureReason: Throwable = _ override def run(): Unit = { + TaskContext.setTaskContext(context) val baseReader = ContinuousDataSourceRDD.getBaseReader(reader) try { while (!context.isInterrupted && !context.isCompleted()) { @@ -201,6 +201,8 @@ class DataReaderThread( failedFlag.set(true) // Don't rethrow the exception in this thread. It's not needed, and the default Spark // exception handler will kill the executor. + } finally { + reader.close() } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index 9657b5e26d770..667410ef9f1c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -17,7 +17,9 @@ package org.apache.spark.sql.execution.streaming.continuous +import java.util.UUID import java.util.concurrent.TimeUnit +import java.util.function.UnaryOperator import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, Map => MutableMap} @@ -52,7 +54,7 @@ class ContinuousExecution( sparkSession, name, checkpointRoot, analyzedPlan, sink, trigger, triggerClock, outputMode, deleteCheckpointOnStop) { - @volatile protected var continuousSources: Seq[ContinuousReader] = Seq.empty + @volatile protected var continuousSources: Seq[ContinuousReader] = _ override protected def sources: Seq[BaseStreamingSource] = continuousSources override lazy val logicalPlan: LogicalPlan = { @@ -78,15 +80,17 @@ class ContinuousExecution( } override protected def runActivatedStream(sparkSessionForStream: SparkSession): Unit = { - do { - try { - runContinuous(sparkSessionForStream) - } catch { - case _: InterruptedException if state.get().equals(RECONFIGURING) => - // swallow exception and run again - state.set(ACTIVE) + val stateUpdate = new UnaryOperator[State] { + override def apply(s: State) = s match { + // If we ended the query to reconfigure, reset the state to active. + case RECONFIGURING => ACTIVE + case _ => s } - } while (state.get() == ACTIVE) + } + + do { + runContinuous(sparkSessionForStream) + } while (state.updateAndGet(stateUpdate) == ACTIVE) } /** @@ -120,12 +124,16 @@ class ContinuousExecution( } committedOffsets = nextOffsets.toStreamProgress(sources) - // Forcibly align commit and offset logs by slicing off any spurious offset logs from - // a previous run. We can't allow commits to an epoch that a previous run reached but - // this run has not. - offsetLog.purgeAfter(latestEpochId) + // Get to an epoch ID that has definitely never been sent to a sink before. 
Since sink + // commit happens between offset log write and commit log write, this means an epoch ID + // which is not in the offset log. + val (latestOffsetEpoch, _) = offsetLog.getLatest().getOrElse { + throw new IllegalStateException( + s"Offset log had no latest element. This shouldn't be possible because nextOffsets is" + + s"an element.") + } + currentBatchId = latestOffsetEpoch + 1 - currentBatchId = latestEpochId + 1 logDebug(s"Resuming at epoch $currentBatchId with committed offsets $committedOffsets") nextOffsets case None => @@ -141,6 +149,7 @@ class ContinuousExecution( * @param sparkSessionForQuery Isolated [[SparkSession]] to run the continuous query with. */ private def runContinuous(sparkSessionForQuery: SparkSession): Unit = { + currentRunId = UUID.randomUUID // A list of attributes that will need to be updated. val replacements = new ArrayBuffer[(Attribute, Attribute)] // Translate from continuous relation to the underlying data source. @@ -225,13 +234,11 @@ class ContinuousExecution( triggerExecutor.execute(() => { startTrigger() - if (reader.needsReconfiguration()) { - state.set(RECONFIGURING) + if (reader.needsReconfiguration() && state.compareAndSet(ACTIVE, RECONFIGURING)) { stopSources() if (queryExecutionThread.isAlive) { sparkSession.sparkContext.cancelJobGroup(runId.toString) queryExecutionThread.interrupt() - // No need to join - this thread is about to end anyway. } false } else if (isActive) { @@ -259,6 +266,7 @@ class ContinuousExecution( sparkSessionForQuery, lastExecution)(lastExecution.toRdd) } } finally { + epochEndpoint.askSync[Unit](StopContinuousExecutionWrites) SparkEnv.get.rpcEnv.stop(epochEndpoint) epochUpdateThread.interrupt() @@ -273,17 +281,22 @@ class ContinuousExecution( epoch: Long, reader: ContinuousReader, partitionOffsets: Seq[PartitionOffset]): Unit = { assert(continuousSources.length == 1, "only one continuous source supported currently") - if (partitionOffsets.contains(null)) { - // If any offset is null, that means the corresponding partition hasn't seen any data yet, so - // there's nothing meaningful to add to the offset log. - } val globalOffset = reader.mergeOffsets(partitionOffsets.toArray) - synchronized { - if (queryExecutionThread.isAlive) { - offsetLog.add(epoch, OffsetSeq.fill(globalOffset)) - } else { - return - } + val oldOffset = synchronized { + offsetLog.add(epoch, OffsetSeq.fill(globalOffset)) + offsetLog.get(epoch - 1) + } + + // If offset hasn't changed since last epoch, there's been no new data. + if (oldOffset.contains(OffsetSeq.fill(globalOffset))) { + noNewData = true + } + + awaitProgressLock.lock() + try { + awaitProgressLockCondition.signalAll() + } finally { + awaitProgressLock.unlock() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala index 98017c3ac6a33..40dcbecade814 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala @@ -39,6 +39,15 @@ private[continuous] sealed trait EpochCoordinatorMessage extends Serializable */ private[sql] case object IncrementAndGetEpoch extends EpochCoordinatorMessage +/** + * The RpcEndpoint stop() will wait to clear out the message queue before terminating the + * object. 
This can lead to a race condition where the query restarts at epoch n, a new + * EpochCoordinator starts at epoch n, and then the old epoch coordinator commits epoch n + 1. + * The framework doesn't provide a handle to wait on the message queue, so we use a synchronous + * message to stop any writes to the ContinuousExecution object. + */ +private[sql] case object StopContinuousExecutionWrites extends EpochCoordinatorMessage + // Init messages /** * Set the reader and writer partition counts. Tasks may not be started until the coordinator @@ -116,6 +125,8 @@ private[continuous] class EpochCoordinator( override val rpcEnv: RpcEnv) extends ThreadSafeRpcEndpoint with Logging { + private var queryWritesStopped: Boolean = false + private var numReaderPartitions: Int = _ private var numWriterPartitions: Int = _ @@ -147,12 +158,16 @@ private[continuous] class EpochCoordinator( partitionCommits.remove(k) } for (k <- partitionOffsets.keys.filter { case (e, _) => e < epoch }) { - partitionCommits.remove(k) + partitionOffsets.remove(k) } } } override def receive: PartialFunction[Any, Unit] = { + // If we just drop these messages, we won't do any writes to the query. The lame duck tasks + // won't shed errors or anything. + case _ if queryWritesStopped => () + case CommitPartitionEpoch(partitionId, epoch, message) => logDebug(s"Got commit from partition $partitionId at epoch $epoch: $message") if (!partitionCommits.isDefinedAt((epoch, partitionId))) { @@ -188,5 +203,9 @@ private[continuous] class EpochCoordinator( case SetWriterPartitions(numPartitions) => numWriterPartitions = numPartitions context.reply(()) + + case StopContinuousExecutionWrites => + queryWritesStopped = true + context.reply(()) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index db588ae282f38..b5b4a05ab4973 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger import org.apache.spark.sql.execution.streaming.sources.{MemoryPlanV2, MemorySinkV2} +import org.apache.spark.sql.sources.v2.streaming.ContinuousWriteSupport /** * Interface used to write a streaming `Dataset` to external storage systems (e.g. 
file systems, @@ -279,18 +280,29 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { useTempCheckpointLocation = true, trigger = trigger) } else { - val dataSource = - DataSource( - df.sparkSession, - className = source, - options = extraOptions.toMap, - partitionColumns = normalizedParCols.getOrElse(Nil)) + val sink = trigger match { + case _: ContinuousTrigger => + val ds = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf) + ds.newInstance() match { + case w: ContinuousWriteSupport => w + case _ => throw new AnalysisException( + s"Data source $source does not support continuous writing") + } + case _ => + val ds = DataSource( + df.sparkSession, + className = source, + options = extraOptions.toMap, + partitionColumns = normalizedParCols.getOrElse(Nil)) + ds.createSink(outputMode) + } + df.sparkSession.sessionState.streamingQueryManager.startQuery( extraOptions.get("queryName"), extraOptions.get("checkpointLocation"), df, extraOptions.toMap, - dataSource.createSink(outputMode), + sink, outputMode, useTempCheckpointLocation = source == "console", recoverFromCheckpointLocation = true, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index d46461fa9bf6d..0762895fdc620 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -38,8 +38,9 @@ import org.apache.spark.sql.{Dataset, Encoder, QueryTest, Row} import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, EpochCoordinatorRef, IncrementAndGetEpoch} +import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, ContinuousTrigger, EpochCoordinatorRef, IncrementAndGetEpoch} import org.apache.spark.sql.execution.streaming.sources.MemorySinkV2 import org.apache.spark.sql.execution.streaming.state.StateStore import org.apache.spark.sql.streaming.StreamingQueryListener._ @@ -80,6 +81,9 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be StateStore.stop() // stop the state store maintenance thread and unload store providers } + protected val defaultTrigger = Trigger.ProcessingTime(0) + protected val defaultUseV2Sink = false + /** How long to wait for an active stream to catch up when checking a result. */ val streamingTimeout = 10.seconds @@ -189,7 +193,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be /** Starts the stream, resuming if data has already been processed. It must not be running. 
*/ case class StartStream( - trigger: Trigger = Trigger.ProcessingTime(0), + trigger: Trigger = defaultTrigger, triggerClock: Clock = new SystemClock, additionalConfs: Map[String, String] = Map.empty, checkpointLocation: String = null) @@ -276,7 +280,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def testStream( _stream: Dataset[_], outputMode: OutputMode = OutputMode.Append, - useV2Sink: Boolean = false)(actions: StreamAction*): Unit = synchronized { + useV2Sink: Boolean = defaultUseV2Sink)(actions: StreamAction*): Unit = synchronized { import org.apache.spark.sql.streaming.util.StreamManualClock // `synchronized` is added to prevent the user from calling multiple `testStream`s concurrently @@ -403,18 +407,11 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def fetchStreamAnswer(currentStream: StreamExecution, lastOnly: Boolean) = { verify(currentStream != null, "stream not running") - // Get the map of source index to the current source objects - val indexToSource = currentStream - .logicalPlan - .collect { case StreamingExecutionRelation(s, _) => s } - .zipWithIndex - .map(_.swap) - .toMap // Block until all data added has been processed for all the source awaiting.foreach { case (sourceIndex, offset) => failAfter(streamingTimeout) { - currentStream.awaitOffset(indexToSource(sourceIndex), offset) + currentStream.awaitOffset(sourceIndex, offset) } } @@ -473,6 +470,12 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be // after starting the query. try { currentStream.awaitInitialization(streamingTimeout.toMillis) + currentStream match { + case s: ContinuousExecution => eventually("IncrementalExecution was not created") { + s.lastExecution.executedPlan // will fail if lastExecution is null + } + case _ => + } } catch { case _: StreamingQueryException => // Ignore the exception. `StopStream` or `ExpectFailure` will catch it as well. @@ -600,7 +603,10 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def findSourceIndex(plan: LogicalPlan): Option[Int] = { plan - .collect { case StreamingExecutionRelation(s, _) => s } + .collect { + case StreamingExecutionRelation(s, _) => s + case DataSourceV2Relation(_, r) => r + } .zipWithIndex .find(_._1 == source) .map(_._2) @@ -613,9 +619,13 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be findSourceIndex(query.logicalPlan) }.orElse { findSourceIndex(stream.logicalPlan) + }.orElse { + queryToUse.flatMap { q => + findSourceIndex(q.lastExecution.logical) + } }.getOrElse { throw new IllegalArgumentException( - "Could find index of the source to which data was added") + "Could not find index of the source to which data was added") } // Store the expected offset of added data to wait for it later From b9339eee1304c0309be4ea74f8cdc3d37a8048d3 Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Tue, 16 Jan 2018 22:17:37 -0800 Subject: [PATCH 107/566] Revert "[SPARK-23020][CORE] Fix races in launcher code, test." This reverts commit 20c69816a63071b82b1035d4b48798c358206421. 
--- .../spark/launcher/SparkLauncherSuite.java | 49 +++++++------------ .../spark/launcher/AbstractAppHandle.java | 22 ++------- .../spark/launcher/ChildProcAppHandle.java | 18 +++---- .../spark/launcher/InProcessAppHandle.java | 17 +++---- .../spark/launcher/LauncherConnection.java | 14 +++--- .../apache/spark/launcher/LauncherServer.java | 46 +++-------------- .../org/apache/spark/launcher/BaseSuite.java | 42 +++------------- .../spark/launcher/LauncherServerSuite.java | 20 +++++--- 8 files changed, 72 insertions(+), 156 deletions(-) diff --git a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java index a042375c6ae91..9d2f563b2e367 100644 --- a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java +++ b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java @@ -17,7 +17,6 @@ package org.apache.spark.launcher; -import java.time.Duration; import java.util.Arrays; import java.util.ArrayList; import java.util.HashMap; @@ -32,7 +31,6 @@ import static org.mockito.Mockito.*; import org.apache.spark.SparkContext; -import org.apache.spark.SparkContext$; import org.apache.spark.internal.config.package$; import org.apache.spark.util.Utils; @@ -139,9 +137,7 @@ public void testInProcessLauncher() throws Exception { // Here DAGScheduler is stopped, while SparkContext.clearActiveContext may not be called yet. // Wait for a reasonable amount of time to avoid creating two active SparkContext in JVM. // See SPARK-23019 and SparkContext.stop() for details. - eventually(Duration.ofSeconds(5), Duration.ofMillis(10), () -> { - assertTrue("SparkContext is still alive.", SparkContext$.MODULE$.getActive().isEmpty()); - }); + TimeUnit.MILLISECONDS.sleep(500); } } @@ -150,35 +146,26 @@ private void inProcessLauncherTestImpl() throws Exception { SparkAppHandle.Listener listener = mock(SparkAppHandle.Listener.class); doAnswer(invocation -> { SparkAppHandle h = (SparkAppHandle) invocation.getArguments()[0]; - synchronized (transitions) { - transitions.add(h.getState()); - } + transitions.add(h.getState()); return null; }).when(listener).stateChanged(any(SparkAppHandle.class)); - SparkAppHandle handle = null; - try { - handle = new InProcessLauncher() - .setMaster("local") - .setAppResource(SparkLauncher.NO_RESOURCE) - .setMainClass(InProcessTestApp.class.getName()) - .addAppArgs("hello") - .startApplication(listener); - - waitFor(handle); - assertEquals(SparkAppHandle.State.FINISHED, handle.getState()); - - // Matches the behavior of LocalSchedulerBackend. - List expected = Arrays.asList( - SparkAppHandle.State.CONNECTED, - SparkAppHandle.State.RUNNING, - SparkAppHandle.State.FINISHED); - assertEquals(expected, transitions); - } finally { - if (handle != null) { - handle.kill(); - } - } + SparkAppHandle handle = new InProcessLauncher() + .setMaster("local") + .setAppResource(SparkLauncher.NO_RESOURCE) + .setMainClass(InProcessTestApp.class.getName()) + .addAppArgs("hello") + .startApplication(listener); + + waitFor(handle); + assertEquals(SparkAppHandle.State.FINISHED, handle.getState()); + + // Matches the behavior of LocalSchedulerBackend. 
+ List expected = Arrays.asList( + SparkAppHandle.State.CONNECTED, + SparkAppHandle.State.RUNNING, + SparkAppHandle.State.FINISHED); + assertEquals(expected, transitions); } public static class SparkLauncherTestApp { diff --git a/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java index daf0972f824dd..df1e7316861d4 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java @@ -33,7 +33,7 @@ abstract class AbstractAppHandle implements SparkAppHandle { private List listeners; private State state; private String appId; - private volatile boolean disposed; + private boolean disposed; protected AbstractAppHandle(LauncherServer server) { this.server = server; @@ -70,7 +70,8 @@ public void stop() { @Override public synchronized void disconnect() { - if (!isDisposed()) { + if (!disposed) { + disposed = true; if (connection != null) { try { connection.close(); @@ -78,7 +79,7 @@ public synchronized void disconnect() { // no-op. } } - dispose(); + server.unregister(this); } } @@ -94,21 +95,6 @@ boolean isDisposed() { return disposed; } - /** - * Mark the handle as disposed, and set it as LOST in case the current state is not final. - */ - synchronized void dispose() { - if (!isDisposed()) { - // Unregister first to make sure that the connection with the app has been really - // terminated. - server.unregister(this); - if (!getState().isFinal()) { - setState(State.LOST); - } - this.disposed = true; - } - } - void setState(State s) { setState(s, false); } diff --git a/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java index 2b99461652e1f..8b3f427b7750e 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java @@ -48,16 +48,14 @@ public synchronized void disconnect() { @Override public synchronized void kill() { - if (!isDisposed()) { - setState(State.KILLED); - disconnect(); - if (childProc != null) { - if (childProc.isAlive()) { - childProc.destroyForcibly(); - } - childProc = null; + disconnect(); + if (childProc != null) { + if (childProc.isAlive()) { + childProc.destroyForcibly(); } + childProc = null; } + setState(State.KILLED); } void setChildProc(Process childProc, String loggerName, InputStream logStream) { @@ -96,6 +94,8 @@ void monitorChild() { return; } + disconnect(); + int ec; try { ec = proc.exitValue(); @@ -118,8 +118,6 @@ void monitorChild() { if (newState != null) { setState(newState, true); } - - disconnect(); } } diff --git a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java index f04263cb74a58..acd64c962604f 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java @@ -39,16 +39,15 @@ class InProcessAppHandle extends AbstractAppHandle { @Override public synchronized void kill() { - if (!isDisposed()) { - LOG.warning("kill() may leave the underlying app running in in-process mode."); - setState(State.KILLED); - disconnect(); - - // Interrupt the thread. This is not guaranteed to kill the app, though. 
- if (app != null) { - app.interrupt(); - } + LOG.warning("kill() may leave the underlying app running in in-process mode."); + disconnect(); + + // Interrupt the thread. This is not guaranteed to kill the app, though. + if (app != null) { + app.interrupt(); } + + setState(State.KILLED); } synchronized void start(String appName, Method main, String[] args) { diff --git a/launcher/src/main/java/org/apache/spark/launcher/LauncherConnection.java b/launcher/src/main/java/org/apache/spark/launcher/LauncherConnection.java index fd6f229b2349c..b4a8719e26053 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/LauncherConnection.java +++ b/launcher/src/main/java/org/apache/spark/launcher/LauncherConnection.java @@ -95,15 +95,15 @@ protected synchronized void send(Message msg) throws IOException { } @Override - public synchronized void close() throws IOException { + public void close() throws IOException { if (!closed) { - closed = true; - socket.close(); + synchronized (this) { + if (!closed) { + closed = true; + socket.close(); + } + } } } - boolean isOpen() { - return !closed; - } - } diff --git a/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java index 660c4443b20b9..b8999a1d7a4f4 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java +++ b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java @@ -217,33 +217,6 @@ void unregister(AbstractAppHandle handle) { break; } } - - // If there is a live connection for this handle, we need to wait for it to finish before - // returning, otherwise there might be a race between the connection thread processing - // buffered data and the handle cleaning up after itself, leading to potentially the wrong - // state being reported for the handle. - ServerConnection conn = null; - synchronized (clients) { - for (ServerConnection c : clients) { - if (c.handle == handle) { - conn = c; - break; - } - } - } - - if (conn != null) { - synchronized (conn) { - if (conn.isOpen()) { - try { - conn.wait(); - } catch (InterruptedException ie) { - // Ignore. 
- } - } - } - } - unref(); } @@ -315,7 +288,7 @@ private String createSecret() { private class ServerConnection extends LauncherConnection { private TimerTask timeout; - volatile AbstractAppHandle handle; + private AbstractAppHandle handle; ServerConnection(Socket socket, TimerTask timeout) throws IOException { super(socket); @@ -365,21 +338,16 @@ protected void handle(Message msg) throws IOException { @Override public void close() throws IOException { - if (!isOpen()) { - return; - } - synchronized (clients) { clients.remove(this); } - - synchronized (this) { - super.close(); - notifyAll(); - } - + super.close(); if (handle != null) { - handle.dispose(); + if (!handle.getState().isFinal()) { + LOG.log(Level.WARNING, "Lost connection to spark application."); + handle.setState(SparkAppHandle.State.LOST); + } + handle.disconnect(); } } diff --git a/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java b/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java index 3722a59d9438e..3e1a90eae98d4 100644 --- a/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java +++ b/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java @@ -17,7 +17,6 @@ package org.apache.spark.launcher; -import java.time.Duration; import java.util.concurrent.TimeUnit; import org.junit.After; @@ -48,46 +47,19 @@ public void postChecks() { assertNull(server); } - protected void waitFor(final SparkAppHandle handle) throws Exception { + protected void waitFor(SparkAppHandle handle) throws Exception { + long deadline = System.nanoTime() + TimeUnit.SECONDS.toNanos(10); try { - eventually(Duration.ofSeconds(10), Duration.ofMillis(10), () -> { - assertTrue("Handle is not in final state.", handle.getState().isFinal()); - }); + while (!handle.getState().isFinal()) { + assertTrue("Timed out waiting for handle to transition to final state.", + System.nanoTime() < deadline); + TimeUnit.MILLISECONDS.sleep(10); + } } finally { if (!handle.getState().isFinal()) { handle.kill(); } } - - // Wait until the handle has been marked as disposed, to make sure all cleanup tasks - // have been performed. - AbstractAppHandle ahandle = (AbstractAppHandle) handle; - eventually(Duration.ofSeconds(10), Duration.ofMillis(10), () -> { - assertTrue("Handle is still not marked as disposed.", ahandle.isDisposed()); - }); - } - - /** - * Call a closure that performs a check every "period" until it succeeds, or the timeout - * elapses. 
- */ - protected void eventually(Duration timeout, Duration period, Runnable check) throws Exception { - assertTrue("Timeout needs to be larger than period.", timeout.compareTo(period) > 0); - long deadline = System.nanoTime() + timeout.toNanos(); - int count = 0; - while (true) { - try { - count++; - check.run(); - return; - } catch (Throwable t) { - if (System.nanoTime() >= deadline) { - String msg = String.format("Failed check after %d tries: %s.", count, t.getMessage()); - throw new IllegalStateException(msg, t); - } - Thread.sleep(period.toMillis()); - } - } } } diff --git a/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java b/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java index 75c1af0c71e2a..7e2b09ce25c9b 100644 --- a/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java +++ b/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java @@ -23,14 +23,12 @@ import java.net.InetAddress; import java.net.Socket; import java.net.SocketException; -import java.time.Duration; import java.util.Arrays; import java.util.List; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; import org.junit.Test; import static org.junit.Assert.*; @@ -199,20 +197,28 @@ private void close(Closeable c) { * server-side close immediately. */ private void waitForError(TestClient client, String secret) throws Exception { - final AtomicBoolean helloSent = new AtomicBoolean(); - eventually(Duration.ofSeconds(1), Duration.ofMillis(10), () -> { + boolean helloSent = false; + int maxTries = 10; + for (int i = 0; i < maxTries; i++) { try { - if (!helloSent.get()) { + if (!helloSent) { client.send(new Hello(secret, "1.4.0")); - helloSent.set(true); + helloSent = true; } else { client.send(new SetAppId("appId")); } fail("Expected error but message went through."); } catch (IllegalStateException | IOException e) { // Expected. + break; + } catch (AssertionError e) { + if (i < maxTries - 1) { + Thread.sleep(100); + } else { + throw new AssertionError("Test failed after " + maxTries + " attempts.", e); + } } - }); + } } private static class TestClient extends LauncherConnection { From 00c744e40be3a96f1fe7c377725703fc7b9ca3e3 Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Tue, 16 Jan 2018 22:27:28 -0800 Subject: [PATCH 108/566] Fix merge between 07ae39d0ec and 1667057851 ## What changes were proposed in this pull request? The first commit added a new test, and the second refactored the class the test was in. The automatic merge put the test in the wrong place. ## How was this patch tested? - Author: Jose Torres Closes #20289 from jose-torres/fix. 
(cherry picked from commit a963980a6d2b4bef2c546aa33acf0aa501d2507b) Signed-off-by: Shixiong Zhu --- .../apache/spark/sql/kafka010/KafkaSourceSuite.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index 1acff61e11d2a..62f6a34a6b67a 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -479,11 +479,6 @@ class KafkaMicroBatchSourceSuite extends KafkaSourceSuiteBase { // `failOnDataLoss` is `false`, we should not fail the query assert(query.exception.isEmpty) } -} - -class KafkaSourceSuiteBase extends KafkaSourceTest { - - import testImplicits._ test("SPARK-22956: currentPartitionOffsets should be set when no new data comes in") { def getSpecificDF(range: Range.Inclusive): org.apache.spark.sql.Dataset[Int] = { @@ -549,6 +544,11 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { CheckLastBatch(120 to 124: _*) ) } +} + +class KafkaSourceSuiteBase extends KafkaSourceTest { + + import testImplicits._ test("cannot stop Kafka stream") { val topic = newTopic() From 8ef323c572cee181e3bdbddeeb7119eda03d78f4 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Wed, 17 Jan 2018 14:32:18 +0800 Subject: [PATCH 109/566] [SPARK-23072][SQL][TEST] Add a Unicode schema test for file-based data sources ## What changes were proposed in this pull request? After [SPARK-20682](https://github.com/apache/spark/pull/19651), Apache Spark 2.3 is able to read ORC files with Unicode schema. Previously, it raises `org.apache.spark.sql.catalyst.parser.ParseException`. This PR adds a Unicode schema test for CSV/JSON/ORC/Parquet file-based data sources. Note that TEXT data source only has [a single column with a fixed name 'value'](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala#L71). ## How was this patch tested? Pass the newly added test case. Author: Dongjoon Hyun Closes #20266 from dongjoon-hyun/SPARK-23072. (cherry picked from commit a0aedb0ded4183cc33b27e369df1cbf862779e26) Signed-off-by: Wenchen Fan --- .../spark/sql/FileBasedDataSourceSuite.scala | 81 +++++++++++++++++++ .../org/apache/spark/sql/SQLQuerySuite.scala | 16 ---- .../sql/hive/MetastoreDataSourcesSuite.scala | 14 ---- .../sql/hive/execution/SQLQuerySuite.scala | 8 -- 4 files changed, 81 insertions(+), 38 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala new file mode 100644 index 0000000000000..22fb496bc838e --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.test.SharedSQLContext + +class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { + import testImplicits._ + + private val allFileBasedDataSources = Seq("orc", "parquet", "csv", "json", "text") + + allFileBasedDataSources.foreach { format => + test(s"Writing empty datasets should not fail - $format") { + withTempPath { dir => + Seq("str").toDS().limit(0).write.format(format).save(dir.getCanonicalPath) + } + } + } + + // `TEXT` data source always has a single column whose name is `value`. + allFileBasedDataSources.filterNot(_ == "text").foreach { format => + test(s"SPARK-23072 Write and read back unicode column names - $format") { + withTempPath { path => + val dir = path.getCanonicalPath + + // scalastyle:off nonascii + val df = Seq("a").toDF("한글") + // scalastyle:on nonascii + + df.write.format(format).option("header", "true").save(dir) + val answerDf = spark.read.format(format).option("header", "true").load(dir) + + assert(df.schema.sameType(answerDf.schema)) + checkAnswer(df, answerDf) + } + } + } + + // Only ORC/Parquet support this. `CSV` and `JSON` returns an empty schema. + // `TEXT` data source always has a single column whose name is `value`. + Seq("orc", "parquet").foreach { format => + test(s"SPARK-15474 Write and read back non-emtpy schema with empty dataframe - $format") { + withTempPath { file => + val path = file.getCanonicalPath + val emptyDf = Seq((true, 1, "str")).toDF().limit(0) + emptyDf.write.format(format).save(path) + + val df = spark.read.format(format).load(path) + assert(df.schema.sameType(emptyDf.schema)) + checkAnswer(df, emptyDf) + } + } + } + + allFileBasedDataSources.foreach { format => + test(s"SPARK-22146 read files containing special characters using $format") { + val nameWithSpecialChars = s"sp&cial%chars" + withTempDir { dir => + val tmpFile = s"$dir/$nameWithSpecialChars" + spark.createDataset(Seq("a", "b")).write.format(format).save(tmpFile) + val fileContent = spark.read.format(format).load(tmpFile) + checkAnswer(fileContent, Seq(Row("a"), Row("b"))) + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 96bf65fce9c4a..7c9840a34eaa3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2757,20 +2757,4 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } } - - // Only New OrcFileFormat supports this - Seq(classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName, - "parquet").foreach { format => - test(s"SPARK-15474 Write and read back non-emtpy schema with empty dataframe - $format") { - withTempPath { file => - val path = file.getCanonicalPath - val emptyDf = Seq((true, 1, "str")).toDF.limit(0) - emptyDf.write.format(format).save(path) - - val df = spark.read.format(format).load(path) - assert(df.schema.sameType(emptyDf.schema)) - checkAnswer(df, emptyDf) - } - } - } } diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index c8caba83bf365..fade143a1755e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -23,14 +23,12 @@ import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.fs.Path -import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} import org.apache.spark.sql.execution.command.CreateTableCommand import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.hive.HiveExternalCatalog._ -import org.apache.spark.sql.hive.client.HiveClient import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf._ @@ -1344,18 +1342,6 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv } } - Seq("orc", "parquet", "csv", "json", "text").foreach { format => - test(s"SPARK-22146: read files containing special characters using $format") { - val nameWithSpecialChars = s"sp&cial%chars" - withTempDir { dir => - val tmpFile = s"$dir/$nameWithSpecialChars" - spark.createDataset(Seq("a", "b")).write.format(format).save(tmpFile) - val fileContent = spark.read.format(format).load(tmpFile) - checkAnswer(fileContent, Seq(Row("a"), Row("b"))) - } - } - } - private def withDebugMode(f: => Unit): Unit = { val previousValue = sparkSession.sparkContext.conf.get(DEBUG_MODE) try { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 47adc77a52d51..33bcae91fdaf4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -2159,12 +2159,4 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } } - - Seq("orc", "parquet", "csv", "json", "text").foreach { format => - test(s"Writing empty datasets should not fail - $format") { - withTempDir { dir => - Seq("str").toDS.limit(0).write.format(format).save(dir.getCanonicalPath + "/tmp") - } - } - } } From bfbc2d41b8a9278b347b6df2d516fe4679b41076 Mon Sep 17 00:00:00 2001 From: Henry Robinson Date: Wed, 17 Jan 2018 16:01:41 +0800 Subject: [PATCH 110/566] [SPARK-23062][SQL] Improve EXCEPT documentation ## What changes were proposed in this pull request? Make the default behavior of EXCEPT (i.e. EXCEPT DISTINCT) more explicit in the documentation, and call out the change in behavior from 1.x. Author: Henry Robinson Closes #20254 from henryr/spark-23062. 
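For reference, a minimal sketch of the documented behavior (not part of the patch), assuming an existing `SparkSession` named `spark`: `Dataset.except` keeps only the distinct rows of the left side that do not appear on the right side, i.e. `EXCEPT DISTINCT` in SQL.

```scala
import spark.implicits._

val left = Seq(1, 1, 2, 3).toDS()
val right = Seq(3).toDS()

// Equivalent to: SELECT * FROM left EXCEPT DISTINCT SELECT * FROM right.
// Duplicates on the left are removed as well, so this returns the rows 1 and 2.
left.except(right).show()
```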
(cherry picked from commit 1f3d933e0bd2b1e934a233ed699ad39295376e71) Signed-off-by: gatorsmile --- R/pkg/R/DataFrame.R | 2 +- python/pyspark/sql/dataframe.py | 3 ++- sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 6caa125e1e14a..29f3e986eaab6 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2853,7 +2853,7 @@ setMethod("intersect", #' except #' #' Return a new SparkDataFrame containing rows in this SparkDataFrame -#' but not in another SparkDataFrame. This is equivalent to \code{EXCEPT} in SQL. +#' but not in another SparkDataFrame. This is equivalent to \code{EXCEPT DISTINCT} in SQL. #' #' @param x a SparkDataFrame. #' @param y a SparkDataFrame. diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 95eca76fa9888..2d5e9b91468cf 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1364,7 +1364,8 @@ def subtract(self, other): """ Return a new :class:`DataFrame` containing rows in this frame but not in another frame. - This is equivalent to `EXCEPT` in SQL. + This is equivalent to `EXCEPT DISTINCT` in SQL. + """ return DataFrame(getattr(self._jdf, "except")(other._jdf), self.sql_ctx) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 34f0ab5aa6699..912f411fa3845 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1903,7 +1903,7 @@ class Dataset[T] private[sql]( /** * Returns a new Dataset containing rows in this Dataset but not in another Dataset. - * This is equivalent to `EXCEPT` in SQL. + * This is equivalent to `EXCEPT DISTINCT` in SQL. * * @note Equality checking is performed directly on the encoded representation of the data * and thus is not affected by a custom `equals` function defined on `T`. From cbb6bda437b0d2832496b5c45f8264e5527f1cce Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Wed, 17 Jan 2018 21:53:36 +0800 Subject: [PATCH 111/566] [SPARK-21783][SQL] Turn on ORC filter push-down by default ## What changes were proposed in this pull request? ORC filter push-down is disabled by default from the beginning, [SPARK-2883](https://github.com/apache/spark/commit/aa31e431fc09f0477f1c2351c6275769a31aca90#diff-41ef65b9ef5b518f77e2a03559893f4dR149 ). Now, Apache Spark starts to depend on Apache ORC 1.4.1. For Apache Spark 2.3, this PR turns on ORC filter push-down by default like Parquet ([SPARK-9207](https://issues.apache.org/jira/browse/SPARK-21783)) as a part of [SPARK-20901](https://issues.apache.org/jira/browse/SPARK-20901), "Feature parity for ORC with Parquet". ## How was this patch tested? Pass the existing tests. Author: Dongjoon Hyun Closes #20265 from dongjoon-hyun/SPARK-21783. 
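A usage sketch (not part of the patch), assuming an existing `SparkSession` named `spark` and a placeholder ORC path: since `spark.sql.orc.filterPushdown` now defaults to `true`, a job that depends on the old behavior has to disable it explicitly.

```scala
// Restores the pre-2.3 behavior of not pushing filters down to the ORC reader.
spark.conf.set("spark.sql.orc.filterPushdown", "false")

// `/path/to/orc` is a placeholder; with pushdown disabled, the filter below is
// evaluated by Spark after the ORC data is read rather than inside the reader.
spark.read.orc("/path/to/orc").filter("id = 7864320").count()
```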
(cherry picked from commit 0f8a28617a0742d5a99debfbae91222c2e3b5cec) Signed-off-by: Wenchen Fan --- .../apache/spark/sql/internal/SQLConf.scala | 2 +- .../spark/sql/FilterPushdownBenchmark.scala | 243 ++++++++++++++++++ 2 files changed, 244 insertions(+), 1 deletion(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 6746fbcaf2483..16fbb0c3e9e21 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -410,7 +410,7 @@ object SQLConf { val ORC_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.orc.filterPushdown") .doc("When true, enable filter pushdown for ORC files.") .booleanConf - .createWithDefault(false) + .createWithDefault(true) val HIVE_VERIFY_PARTITION_PATH = buildConf("spark.sql.hive.verifyPartitionPath") .doc("When true, check all the partition paths under the table\'s root directory " + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala new file mode 100644 index 0000000000000..c6dd7dadc9d93 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.io.File + +import scala.util.{Random, Try} + +import org.apache.spark.SparkConf +import org.apache.spark.sql.functions.monotonically_increasing_id +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.util.{Benchmark, Utils} + + +/** + * Benchmark to measure read performance with Filter pushdown. 
+ */ +object FilterPushdownBenchmark { + val conf = new SparkConf() + conf.set("orc.compression", "snappy") + conf.set("spark.sql.parquet.compression.codec", "snappy") + + private val spark = SparkSession.builder() + .master("local[1]") + .appName("FilterPushdownBenchmark") + .config(conf) + .getOrCreate() + + def withTempPath(f: File => Unit): Unit = { + val path = Utils.createTempDir() + path.delete() + try f(path) finally Utils.deleteRecursively(path) + } + + def withTempTable(tableNames: String*)(f: => Unit): Unit = { + try f finally tableNames.foreach(spark.catalog.dropTempView) + } + + def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { + val (keys, values) = pairs.unzip + val currentValues = keys.map(key => Try(spark.conf.get(key)).toOption) + (keys, values).zipped.foreach(spark.conf.set) + try f finally { + keys.zip(currentValues).foreach { + case (key, Some(value)) => spark.conf.set(key, value) + case (key, None) => spark.conf.unset(key) + } + } + } + + private def prepareTable(dir: File, numRows: Int, width: Int): Unit = { + import spark.implicits._ + val selectExpr = (1 to width).map(i => s"CAST(value AS STRING) c$i") + val df = spark.range(numRows).map(_ => Random.nextLong).selectExpr(selectExpr: _*) + .withColumn("id", monotonically_increasing_id()) + + val dirORC = dir.getCanonicalPath + "/orc" + val dirParquet = dir.getCanonicalPath + "/parquet" + + df.write.mode("overwrite").orc(dirORC) + df.write.mode("overwrite").parquet(dirParquet) + + spark.read.orc(dirORC).createOrReplaceTempView("orcTable") + spark.read.parquet(dirParquet).createOrReplaceTempView("parquetTable") + } + + def filterPushDownBenchmark( + values: Int, + title: String, + whereExpr: String, + selectExpr: String = "*"): Unit = { + val benchmark = new Benchmark(title, values, minNumIters = 5) + + Seq(false, true).foreach { pushDownEnabled => + val name = s"Parquet Vectorized ${if (pushDownEnabled) s"(Pushdown)" else ""}" + benchmark.addCase(name) { _ => + withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> s"$pushDownEnabled") { + spark.sql(s"SELECT $selectExpr FROM parquetTable WHERE $whereExpr").collect() + } + } + } + + Seq(false, true).foreach { pushDownEnabled => + val name = s"Native ORC Vectorized ${if (pushDownEnabled) s"(Pushdown)" else ""}" + benchmark.addCase(name) { _ => + withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> s"$pushDownEnabled") { + spark.sql(s"SELECT $selectExpr FROM orcTable WHERE $whereExpr").collect() + } + } + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + Select 0 row (id IS NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 7882 / 7957 2.0 501.1 1.0X + Parquet Vectorized (Pushdown) 55 / 60 285.2 3.5 142.9X + Native ORC Vectorized 5592 / 5627 2.8 355.5 1.4X + Native ORC Vectorized (Pushdown) 66 / 70 237.2 4.2 118.9X + + Select 0 row (7864320 < id < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 7884 / 7909 2.0 501.2 1.0X + Parquet Vectorized (Pushdown) 739 / 752 21.3 47.0 10.7X + Native ORC Vectorized 5614 / 5646 2.8 356.9 1.4X + Native ORC Vectorized (Pushdown) 81 / 83 195.2 5.1 97.8X + + Select 1 row (id = 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + 
----------------------------------------------------------------------------------------------- + Parquet Vectorized 7905 / 8027 2.0 502.6 1.0X + Parquet Vectorized (Pushdown) 740 / 766 21.2 47.1 10.7X + Native ORC Vectorized 5684 / 5738 2.8 361.4 1.4X + Native ORC Vectorized (Pushdown) 78 / 81 202.4 4.9 101.7X + + Select 1 row (id <=> 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 7928 / 7993 2.0 504.1 1.0X + Parquet Vectorized (Pushdown) 747 / 772 21.0 47.5 10.6X + Native ORC Vectorized 5728 / 5753 2.7 364.2 1.4X + Native ORC Vectorized (Pushdown) 76 / 78 207.9 4.8 104.8X + + Select 1 row (7864320 <= id <= 7864320):Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 7939 / 8021 2.0 504.8 1.0X + Parquet Vectorized (Pushdown) 746 / 770 21.1 47.4 10.6X + Native ORC Vectorized 5690 / 5734 2.8 361.7 1.4X + Native ORC Vectorized (Pushdown) 76 / 79 206.7 4.8 104.3X + + Select 1 row (7864319 < id < 7864321): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 7972 / 8019 2.0 506.9 1.0X + Parquet Vectorized (Pushdown) 742 / 764 21.2 47.2 10.7X + Native ORC Vectorized 5704 / 5743 2.8 362.6 1.4X + Native ORC Vectorized (Pushdown) 76 / 78 207.9 4.8 105.4X + + Select 10% rows (id < 1572864): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 8733 / 8808 1.8 555.2 1.0X + Parquet Vectorized (Pushdown) 2213 / 2267 7.1 140.7 3.9X + Native ORC Vectorized 6420 / 6463 2.4 408.2 1.4X + Native ORC Vectorized (Pushdown) 1313 / 1331 12.0 83.5 6.7X + + Select 50% rows (id < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 11518 / 11591 1.4 732.3 1.0X + Parquet Vectorized (Pushdown) 7962 / 7991 2.0 506.2 1.4X + Native ORC Vectorized 8927 / 8985 1.8 567.6 1.3X + Native ORC Vectorized (Pushdown) 6102 / 6160 2.6 387.9 1.9X + + Select 90% rows (id < 14155776): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 14255 / 14389 1.1 906.3 1.0X + Parquet Vectorized (Pushdown) 13564 / 13594 1.2 862.4 1.1X + Native ORC Vectorized 11442 / 11608 1.4 727.5 1.2X + Native ORC Vectorized (Pushdown) 10991 / 11029 1.4 698.8 1.3X + + Select all rows (id IS NOT NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 14917 / 14938 1.1 948.4 1.0X + Parquet Vectorized (Pushdown) 14910 / 14964 1.1 948.0 1.0X + Native ORC Vectorized 11986 / 12069 1.3 762.0 1.2X + Native ORC Vectorized (Pushdown) 12037 / 12123 1.3 765.3 1.2X + + Select all rows (id > -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 14951 / 14976 1.1 950.6 1.0X + Parquet Vectorized (Pushdown) 14934 / 15016 1.1 949.5 1.0X + Native ORC Vectorized 12000 / 12156 1.3 763.0 1.2X + Native ORC Vectorized (Pushdown) 12079 / 12113 1.3 
767.9 1.2X + + Select all rows (id != -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 14930 / 14972 1.1 949.3 1.0X + Parquet Vectorized (Pushdown) 15015 / 15047 1.0 954.6 1.0X + Native ORC Vectorized 12090 / 12259 1.3 768.7 1.2X + Native ORC Vectorized (Pushdown) 12021 / 12096 1.3 764.2 1.2X + */ + benchmark.run() + } + + def main(args: Array[String]): Unit = { + val numRows = 1024 * 1024 * 15 + val width = 5 + val mid = numRows / 2 + + withTempPath { dir => + withTempTable("orcTable", "patquetTable") { + prepareTable(dir, numRows, width) + + Seq("id IS NULL", s"$mid < id AND id < $mid").foreach { whereExpr => + val title = s"Select 0 row ($whereExpr)".replace("id AND id", "id") + filterPushDownBenchmark(numRows, title, whereExpr) + } + + Seq( + s"id = $mid", + s"id <=> $mid", + s"$mid <= id AND id <= $mid", + s"${mid - 1} < id AND id < ${mid + 1}" + ).foreach { whereExpr => + val title = s"Select 1 row ($whereExpr)".replace("id AND id", "id") + filterPushDownBenchmark(numRows, title, whereExpr) + } + + val selectExpr = (1 to width).map(i => s"MAX(c$i)").mkString("", ",", ", MAX(id)") + + Seq(10, 50, 90).foreach { percent => + filterPushDownBenchmark( + numRows, + s"Select $percent% rows (id < ${numRows * percent / 100})", + s"id < ${numRows * percent / 100}", + selectExpr + ) + } + + Seq("id IS NOT NULL", "id > -1", "id != -1").foreach { whereExpr => + filterPushDownBenchmark( + numRows, + s"Select all rows ($whereExpr)", + whereExpr, + selectExpr) + } + } + } + } +} From aae73a21a42fa366a09c2be1a4b91308ef211beb Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Thu, 18 Jan 2018 00:05:26 +0800 Subject: [PATCH 112/566] [SPARK-23079][SQL] Fix query constraints propagation with aliases ## What changes were proposed in this pull request? Previously, PR #19201 fix the problem of non-converging constraints. After that PR #19149 improve the loop and constraints is inferred only once. So the problem of non-converging constraints is gone. However, the case below will fail. ``` spark.range(5).write.saveAsTable("t") val t = spark.read.table("t") val left = t.withColumn("xid", $"id" + lit(1)).as("x") val right = t.withColumnRenamed("id", "xid").as("y") val df = left.join(right, "xid").filter("id = 3").toDF() checkAnswer(df, Row(4, 3)) ``` Because `aliasMap` replace all the aliased child. See the test case in PR for details. This PR is to fix this bug by removing useless code for preventing non-converging constraints. It can be also fixed with #20270, but this is much simpler and clean up the code. ## How was this patch tested? Unit test Author: Wang Gengliang Closes #20278 from gengliangwang/FixConstraintSimple. 
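To see what the optimizer now infers for the aliased columns, the plan can be printed with extended explain (purely a debugging aid; it reuses the `left`/`right` frames from the repro above):

```
// The exact inferred predicates depend on which optimizer rules are enabled, so read the
// optimized plan's Filter nodes rather than relying on a fixed expected string.
left.join(right, "xid").filter("id = 3").explain(true)
```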
(cherry picked from commit 8598a982b4147abe5f1aae005fea0fd5ae395ac4) Signed-off-by: Wenchen Fan --- .../catalyst/plans/logical/LogicalPlan.scala | 1 + .../plans/logical/QueryPlanConstraints.scala | 37 +----------- .../InferFiltersFromConstraintsSuite.scala | 59 +------------------ .../plans/ConstraintPropagationSuite.scala | 2 + .../org/apache/spark/sql/SQLQuerySuite.scala | 11 ++++ 5 files changed, 17 insertions(+), 93 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index ff2a0ec588567..c8ccd9bd03994 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -255,6 +255,7 @@ abstract class UnaryNode extends LogicalPlan { case expr: Expression if expr.semanticEquals(e) => a.toAttribute }) + allConstraints += EqualNullSafe(e, a.toAttribute) case _ => // Don't change. } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala index 9c0a30a47f839..5c7b8e5b97883 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala @@ -94,25 +94,16 @@ trait QueryPlanConstraints { self: LogicalPlan => case _ => Seq.empty[Attribute] } - // Collect aliases from expressions of the whole tree rooted by the current QueryPlan node, so - // we may avoid producing recursive constraints. - private lazy val aliasMap: AttributeMap[Expression] = AttributeMap( - expressions.collect { - case a: Alias if !a.child.isInstanceOf[Literal] => (a.toAttribute, a.child) - } ++ children.flatMap(_.asInstanceOf[QueryPlanConstraints].aliasMap)) - // Note: the explicit cast is necessary, since Scala compiler fails to infer the type. - /** * Infers an additional set of constraints from a given set of equality constraints. * For e.g., if an operator has constraints of the form (`a = 5`, `a = b`), this returns an * additional constraint of the form `b = 5`. */ private def inferAdditionalConstraints(constraints: Set[Expression]): Set[Expression] = { - val aliasedConstraints = eliminateAliasedExpressionInConstraints(constraints) var inferredConstraints = Set.empty[Expression] - aliasedConstraints.foreach { + constraints.foreach { case eq @ EqualTo(l: Attribute, r: Attribute) => - val candidateConstraints = aliasedConstraints - eq + val candidateConstraints = constraints - eq inferredConstraints ++= replaceConstraints(candidateConstraints, l, r) inferredConstraints ++= replaceConstraints(candidateConstraints, r, l) case _ => // No inference @@ -120,30 +111,6 @@ trait QueryPlanConstraints { self: LogicalPlan => inferredConstraints -- constraints } - /** - * Replace the aliased expression in [[Alias]] with the alias name if both exist in constraints. - * Thus non-converging inference can be prevented. - * E.g. `Alias(b, f(a)), a = b` infers `f(a) = f(f(a))` without eliminating aliased expressions. - * Also, the size of constraints is reduced without losing any information. - * When the inferred filters are pushed down the operators that generate the alias, - * the alias names used in filters are replaced by the aliased expressions. 
- */ - private def eliminateAliasedExpressionInConstraints(constraints: Set[Expression]) - : Set[Expression] = { - val attributesInEqualTo = constraints.flatMap { - case EqualTo(l: Attribute, r: Attribute) => l :: r :: Nil - case _ => Nil - } - var aliasedConstraints = constraints - attributesInEqualTo.foreach { a => - if (aliasMap.contains(a)) { - val child = aliasMap.get(a).get - aliasedConstraints = replaceConstraints(aliasedConstraints, child, a) - } - } - aliasedConstraints - } - private def replaceConstraints( constraints: Set[Expression], source: Expression, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala index a0708bf7eee9a..178c4b8c270a0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala @@ -34,6 +34,7 @@ class InferFiltersFromConstraintsSuite extends PlanTest { PushDownPredicate, InferFiltersFromConstraints, CombineFilters, + SimplifyBinaryComparison, BooleanSimplification) :: Nil } @@ -160,64 +161,6 @@ class InferFiltersFromConstraintsSuite extends PlanTest { comparePlans(optimized, correctAnswer) } - test("inner join with alias: don't generate constraints for recursive functions") { - val t1 = testRelation.subquery('t1) - val t2 = testRelation.subquery('t2) - - // We should prevent `Coalese(a, b)` from recursively creating complicated constraints through - // the constraint inference procedure. - val originalQuery = t1.select('a, 'b.as('d), Coalesce(Seq('a, 'b)).as('int_col)) - // We hide an `Alias` inside the child's child's expressions, to cover the situation reported - // in [SPARK-20700]. - .select('int_col, 'd, 'a).as("t") - .join(t2, Inner, - Some("t.a".attr === "t2.a".attr - && "t.d".attr === "t2.a".attr - && "t.int_col".attr === "t2.a".attr)) - .analyze - val correctAnswer = t1 - .where(IsNotNull('a) && IsNotNull(Coalesce(Seq('a, 'a))) && IsNotNull(Coalesce(Seq('b, 'a))) - && IsNotNull('b) && IsNotNull(Coalesce(Seq('b, 'b))) && IsNotNull(Coalesce(Seq('a, 'b))) - && 'a === 'b && 'a === Coalesce(Seq('a, 'a)) && 'a === Coalesce(Seq('a, 'b)) - && 'a === Coalesce(Seq('b, 'a)) && 'b === Coalesce(Seq('a, 'b)) - && 'b === Coalesce(Seq('b, 'a)) && 'b === Coalesce(Seq('b, 'b))) - .select('a, 'b.as('d), Coalesce(Seq('a, 'b)).as('int_col)) - .select('int_col, 'd, 'a).as("t") - .join( - t2.where(IsNotNull('a) && IsNotNull(Coalesce(Seq('a, 'a))) && - 'a === Coalesce(Seq('a, 'a))), - Inner, - Some("t.a".attr === "t2.a".attr && "t.d".attr === "t2.a".attr - && "t.int_col".attr === "t2.a".attr)) - .analyze - val optimized = Optimize.execute(originalQuery) - comparePlans(optimized, correctAnswer) - } - - test("inner join with EqualTo expressions containing part of each other: don't generate " + - "constraints for recursive functions") { - val t1 = testRelation.subquery('t1) - val t2 = testRelation.subquery('t2) - - // We should prevent `c = Coalese(a, b)` and `a = Coalese(b, c)` from recursively creating - // complicated constraints through the constraint inference procedure. 
- val originalQuery = t1 - .select('a, 'b, 'c, Coalesce(Seq('b, 'c)).as('d), Coalesce(Seq('a, 'b)).as('e)) - .where('a === 'd && 'c === 'e) - .join(t2, Inner, Some("t1.a".attr === "t2.a".attr && "t1.c".attr === "t2.c".attr)) - .analyze - val correctAnswer = t1 - .where(IsNotNull('a) && IsNotNull('c) && 'a === Coalesce(Seq('b, 'c)) && - 'c === Coalesce(Seq('a, 'b))) - .select('a, 'b, 'c, Coalesce(Seq('b, 'c)).as('d), Coalesce(Seq('a, 'b)).as('e)) - .join(t2.where(IsNotNull('a) && IsNotNull('c)), - Inner, - Some("t1.a".attr === "t2.a".attr && "t1.c".attr === "t2.c".attr)) - .analyze - val optimized = Optimize.execute(originalQuery) - comparePlans(optimized, correctAnswer) - } - test("generate correct filters for alias that don't produce recursive constraints") { val t1 = testRelation.subquery('t1) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala index 866ff0d33cbb2..a37e06d922642 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala @@ -134,6 +134,8 @@ class ConstraintPropagationSuite extends SparkFunSuite with PlanTest { verifyConstraints(aliasedRelation.analyze.constraints, ExpressionSet(Seq(resolveColumn(aliasedRelation.analyze, "x") > 10, IsNotNull(resolveColumn(aliasedRelation.analyze, "x")), + resolveColumn(aliasedRelation.analyze, "b") <=> resolveColumn(aliasedRelation.analyze, "y"), + resolveColumn(aliasedRelation.analyze, "z") <=> resolveColumn(aliasedRelation.analyze, "x"), resolveColumn(aliasedRelation.analyze, "z") > 10, IsNotNull(resolveColumn(aliasedRelation.analyze, "z"))))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 7c9840a34eaa3..d4d0aa4f5f5eb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2717,6 +2717,17 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } + test("SPARK-23079: constraints should be inferred correctly with aliases") { + withTable("t") { + spark.range(5).write.saveAsTable("t") + val t = spark.read.table("t") + val left = t.withColumn("xid", $"id" + lit(1)).as("x") + val right = t.withColumnRenamed("id", "xid").as("y") + val df = left.join(right, "xid").filter("id = 3").toDF() + checkAnswer(df, Row(4, 3)) + } + } + test("SRARK-22266: the same aggregate function was calculated multiple times") { val query = "SELECT a, max(b+1), max(b+1) + 1 FROM testData2 GROUP BY a" val df = sql(query) From 1a6dfaf25f507545debdf4cb1d427b9cc78c3cc8 Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Wed, 17 Jan 2018 09:27:49 -0800 Subject: [PATCH 113/566] [SPARK-23020] Ignore Flaky Test: SparkLauncherSuite.testInProcessLauncher ## What changes were proposed in this pull request? Temporarily ignoring flaky test `SparkLauncherSuite.testInProcessLauncher` to de-flake the builds. This should be re-enabled when SPARK-23020 is merged. ## How was this patch tested? N/A (Test Only Change) Author: Sameer Agarwal Closes #20291 from sameeragarwal/disable-test-2. 
(cherry picked from commit c132538a164cd8b55dbd7e8ffdc0c0782a0b588c) Signed-off-by: Sameer Agarwal --- .../java/org/apache/spark/launcher/SparkLauncherSuite.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java index 9d2f563b2e367..dffa609f1cbdf 100644 --- a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java +++ b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java @@ -25,6 +25,7 @@ import java.util.Properties; import java.util.concurrent.TimeUnit; +import org.junit.Ignore; import org.junit.Test; import static org.junit.Assert.*; import static org.junit.Assume.*; @@ -120,7 +121,8 @@ public void testChildProcLauncher() throws Exception { assertEquals(0, app.waitFor()); } - @Test + // TODO: [SPARK-23020] Re-enable this + @Ignore public void testInProcessLauncher() throws Exception { // Because this test runs SparkLauncher in process and in client mode, it pollutes the system // properties, and that can cause test failures down the test pipeline. So restore the original From dbd2a5566d8924ab340c3c840d31e83e5af92242 Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Wed, 17 Jan 2018 13:52:51 -0800 Subject: [PATCH 114/566] [SPARK-23033][SS] Don't use task level retry for continuous processing ## What changes were proposed in this pull request? Continuous processing tasks will fail on any attempt number greater than 0. ContinuousExecution will catch these failures and restart globally from the last recorded checkpoints. ## How was this patch tested? unit test Author: Jose Torres Closes #20225 from jose-torres/no-retry. (cherry picked from commit 86a845031824a5334db6a5299c6f5dcc982bc5b8) Signed-off-by: Tathagata Das --- .../spark/sql/kafka010/KafkaSourceSuite.scala | 8 +-- .../ContinuousDataSourceRDDIter.scala | 5 ++ .../continuous/ContinuousExecution.scala | 2 +- .../ContinuousTaskRetryException.scala | 26 +++++++ .../spark/sql/streaming/StreamTest.scala | 9 ++- .../continuous/ContinuousSuite.scala | 71 +++++++++++-------- 6 files changed, 84 insertions(+), 37 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTaskRetryException.scala diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index 62f6a34a6b67a..27dbb3f7a8f31 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -808,16 +808,14 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { val query = kafka .writeStream .format("memory") - .outputMode("append") .queryName("kafkaColumnTypes") .trigger(defaultTrigger) .start() - var rows: Array[Row] = Array() eventually(timeout(streamingTimeout)) { - rows = spark.table("kafkaColumnTypes").collect() - assert(rows.length === 1, s"Unexpected results: ${rows.toList}") + assert(spark.table("kafkaColumnTypes").count == 1, + s"Unexpected results: ${spark.table("kafkaColumnTypes").collectAsList()}") } - val row = rows(0) + val row = spark.table("kafkaColumnTypes").head() assert(row.getAs[Array[Byte]]("key") === null, s"Unexpected results: $row") assert(row.getAs[Array[Byte]]("value") === "1".getBytes(UTF_8), s"Unexpected results: $row") 
assert(row.getAs[String]("topic") === topic, s"Unexpected results: $row") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala index 66eb42d4658f6..dcb3b54c4e160 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala @@ -52,6 +52,11 @@ class ContinuousDataSourceRDD( } override def compute(split: Partition, context: TaskContext): Iterator[UnsafeRow] = { + // If attempt number isn't 0, this is a task retry, which we don't support. + if (context.attemptNumber() != 0) { + throw new ContinuousTaskRetryException() + } + val reader = split.asInstanceOf[DataSourceRDDPartition[UnsafeRow]].readTask.createDataReader() val runId = context.getLocalProperty(ContinuousExecution.RUN_ID_KEY) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index 667410ef9f1c6..45b794c70a50a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -24,7 +24,7 @@ import java.util.function.UnaryOperator import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, Map => MutableMap} -import org.apache.spark.SparkEnv +import org.apache.spark.{SparkEnv, SparkException} import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTaskRetryException.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTaskRetryException.scala new file mode 100644 index 0000000000000..e0a6f6dd50bb3 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTaskRetryException.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming.continuous + +import org.apache.spark.SparkException + +/** + * An exception thrown when a continuous processing task runs with a nonzero attempt ID. 
+ */ +class ContinuousTaskRetryException + extends SparkException("Continuous execution does not support task retry", null) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index 0762895fdc620..c75247e0f6ed8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -472,8 +472,8 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be currentStream.awaitInitialization(streamingTimeout.toMillis) currentStream match { case s: ContinuousExecution => eventually("IncrementalExecution was not created") { - s.lastExecution.executedPlan // will fail if lastExecution is null - } + s.lastExecution.executedPlan // will fail if lastExecution is null + } case _ => } } catch { @@ -645,7 +645,10 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be } case CheckAnswerRowsContains(expectedAnswer, lastOnly) => - val sparkAnswer = fetchStreamAnswer(currentStream, lastOnly) + val sparkAnswer = currentStream match { + case null => fetchStreamAnswer(lastStream, lastOnly) + case s => fetchStreamAnswer(s, lastOnly) + } QueryTest.includesRows(expectedAnswer, sparkAnswer).foreach { error => failTest(error) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala index 9562c10feafe9..4b4ed82dc6520 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala @@ -17,36 +17,18 @@ package org.apache.spark.sql.streaming.continuous -import java.io.{File, InterruptedIOException, IOException, UncheckedIOException} -import java.nio.channels.ClosedByInterruptException -import java.util.concurrent.{CountDownLatch, ExecutionException, TimeoutException, TimeUnit} +import java.util.UUID -import scala.reflect.ClassTag -import scala.util.control.ControlThrowable - -import com.google.common.util.concurrent.UncheckedExecutionException -import org.apache.commons.io.FileUtils -import org.apache.hadoop.conf.Configuration - -import org.apache.spark.{SparkContext, SparkEnv} -import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} +import org.apache.spark.{SparkContext, SparkEnv, SparkException} +import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart, SparkListenerTaskStart} import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.plans.logical.Range -import org.apache.spark.sql.catalyst.streaming.InternalOutputModes -import org.apache.spark.sql.execution.command.ExplainCommand import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2ScanExec, WriteToDataSourceV2Exec} import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous._ import org.apache.spark.sql.execution.streaming.sources.MemorySinkV2 -import org.apache.spark.sql.execution.streaming.state.{StateStore, StateStoreConf, StateStoreId, StateStoreProvider} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.StreamSourceProvider import org.apache.spark.sql.streaming.{StreamTest, Trigger} -import org.apache.spark.sql.streaming.util.StreamManualClock import 
org.apache.spark.sql.test.TestSparkSession -import org.apache.spark.sql.types._ -import org.apache.spark.util.Utils class ContinuousSuiteBase extends StreamTest { // We need more than the default local[2] to be able to schedule all partitions simultaneously. @@ -219,6 +201,41 @@ class ContinuousSuite extends ContinuousSuiteBase { StopStream) } + test("task failure kills the query") { + val df = spark.readStream + .format("rate") + .option("numPartitions", "5") + .option("rowsPerSecond", "5") + .load() + .select('value) + + // Get an arbitrary task from this query to kill. It doesn't matter which one. + var taskId: Long = -1 + val listener = new SparkListener() { + override def onTaskStart(start: SparkListenerTaskStart): Unit = { + taskId = start.taskInfo.taskId + } + } + spark.sparkContext.addSparkListener(listener) + try { + testStream(df, useV2Sink = true)( + StartStream(Trigger.Continuous(100)), + Execute(waitForRateSourceTriggers(_, 2)), + Execute { _ => + // Wait until a task is started, then kill its first attempt. + eventually(timeout(streamingTimeout)) { + assert(taskId != -1) + } + spark.sparkContext.killTaskAttempt(taskId) + }, + ExpectFailure[SparkException] { e => + e.getCause != null && e.getCause.getCause.isInstanceOf[ContinuousTaskRetryException] + }) + } finally { + spark.sparkContext.removeSparkListener(listener) + } + } + test("query without test harness") { val df = spark.readStream .format("rate") @@ -258,13 +275,9 @@ class ContinuousStressSuite extends ContinuousSuiteBase { AwaitEpoch(0), Execute(waitForRateSourceTriggers(_, 201)), IncrementEpoch(), - Execute { query => - val data = query.sink.asInstanceOf[MemorySinkV2].allData - val vals = data.map(_.getLong(0)).toSet - assert(scala.Range(0, 25000).forall { i => - vals.contains(i) - }) - }) + StopStream, + CheckAnswerRowsContains(scala.Range(0, 25000).map(Row(_))) + ) } test("automatic epoch advancement") { @@ -280,6 +293,7 @@ class ContinuousStressSuite extends ContinuousSuiteBase { AwaitEpoch(0), Execute(waitForRateSourceTriggers(_, 201)), IncrementEpoch(), + StopStream, CheckAnswerRowsContains(scala.Range(0, 25000).map(Row(_)))) } @@ -311,6 +325,7 @@ class ContinuousStressSuite extends ContinuousSuiteBase { StopStream, StartStream(Trigger.Continuous(2012)), AwaitEpoch(50), + StopStream, CheckAnswerRowsContains(scala.Range(0, 25000).map(Row(_)))) } } From 79ccd0cadf09c41c0f4b5853a54798be17a20584 Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Wed, 17 Jan 2018 13:58:44 -0800 Subject: [PATCH 115/566] [SPARK-23093][SS] Don't change run id when reconfiguring a continuous processing query. ## What changes were proposed in this pull request? Keep the run ID static, using a different ID for the epoch coordinator to avoid cross-execution message contamination. ## How was this patch tested? new and existing unit tests Author: Jose Torres Closes #20282 from jose-torres/fix-runid. 
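A small sketch of the user-visible effect of keeping the run id static (the rate-source/console-sink pairing mirrors the continuous-mode examples used in Spark's tests and docs; treat it as illustrative, not as code from this patch, and `spark` as an assumed active SparkSession):

```
import org.apache.spark.sql.streaming.Trigger

val query = spark.readStream
  .format("rate")
  .option("rowsPerSecond", "5")
  .load()
  .writeStream
  .format("console")
  .trigger(Trigger.Continuous("1 second"))
  .start()

val initialRunId = query.runId
// Reconfigurations restart the continuous execution internally; with this patch they keep the
// same run id, and only the internal epoch coordinator id gets a fresh random suffix.
assert(query.runId == initialRunId)
query.stop()
```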
(cherry picked from commit e946c63dd56d121cf898084ed7e9b5b0868b226e) Signed-off-by: Shixiong Zhu --- .../datasources/v2/DataSourceV2ScanExec.scala | 3 ++- .../datasources/v2/WriteToDataSourceV2.scala | 5 ++-- .../execution/streaming/StreamExecution.scala | 3 +-- .../ContinuousDataSourceRDDIter.scala | 10 ++++---- .../continuous/ContinuousExecution.scala | 18 ++++++++----- .../continuous/EpochCoordinator.scala | 9 ++++--- .../spark/sql/streaming/StreamTest.scala | 2 +- .../StreamingQueryListenerSuite.scala | 25 +++++++++++++++++++ 8 files changed, 54 insertions(+), 21 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala index 8c64df080242f..beb66738732be 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala @@ -58,7 +58,8 @@ case class DataSourceV2ScanExec( case _: ContinuousReader => EpochCoordinatorRef.get( - sparkContext.getLocalProperty(ContinuousExecution.RUN_ID_KEY), sparkContext.env) + sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), + sparkContext.env) .askSync[Unit](SetReaderPartitions(readTasks.size())) new ContinuousDataSourceRDD(sparkContext, sqlContext, readTasks) .asInstanceOf[RDD[InternalRow]] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala index a4a857f2d4d9b..3dbdae7b4df9f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala @@ -64,7 +64,8 @@ case class WriteToDataSourceV2Exec(writer: DataSourceV2Writer, query: SparkPlan) val runTask = writer match { case w: ContinuousWriter => EpochCoordinatorRef.get( - sparkContext.getLocalProperty(ContinuousExecution.RUN_ID_KEY), sparkContext.env) + sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), + sparkContext.env) .askSync[Unit](SetWriterPartitions(rdd.getNumPartitions)) (context: TaskContext, iter: Iterator[InternalRow]) => @@ -135,7 +136,7 @@ object DataWritingSparkTask extends Logging { iter: Iterator[InternalRow]): WriterCommitMessage = { val dataWriter = writeTask.createDataWriter(context.partitionId(), context.attemptNumber()) val epochCoordinator = EpochCoordinatorRef.get( - context.getLocalProperty(ContinuousExecution.RUN_ID_KEY), + context.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), SparkEnv.get) val currentMsg: WriterCommitMessage = null var currentEpoch = context.getLocalProperty(ContinuousExecution.START_EPOCH_KEY).toLong diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index cf27e1a70650a..e7982d7880ceb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -142,8 +142,7 @@ abstract class StreamExecution( override val id: UUID = UUID.fromString(streamMetadata.id) - override def runId: UUID = currentRunId - protected var currentRunId = 
UUID.randomUUID + override val runId: UUID = UUID.randomUUID /** * Pretty identified string of printing in logs. Format is diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala index dcb3b54c4e160..cd7065f5e6601 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala @@ -59,7 +59,7 @@ class ContinuousDataSourceRDD( val reader = split.asInstanceOf[DataSourceRDDPartition[UnsafeRow]].readTask.createDataReader() - val runId = context.getLocalProperty(ContinuousExecution.RUN_ID_KEY) + val coordinatorId = context.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY) // This queue contains two types of messages: // * (null, null) representing an epoch boundary. @@ -68,7 +68,7 @@ class ContinuousDataSourceRDD( val epochPollFailed = new AtomicBoolean(false) val epochPollExecutor = ThreadUtils.newDaemonSingleThreadScheduledExecutor( - s"epoch-poll--${runId}--${context.partitionId()}") + s"epoch-poll--$coordinatorId--${context.partitionId()}") val epochPollRunnable = new EpochPollRunnable(queue, context, epochPollFailed) epochPollExecutor.scheduleWithFixedDelay( epochPollRunnable, 0, epochPollIntervalMs, TimeUnit.MILLISECONDS) @@ -86,7 +86,7 @@ class ContinuousDataSourceRDD( epochPollExecutor.shutdown() }) - val epochEndpoint = EpochCoordinatorRef.get(runId, SparkEnv.get) + val epochEndpoint = EpochCoordinatorRef.get(coordinatorId, SparkEnv.get) new Iterator[UnsafeRow] { private val POLL_TIMEOUT_MS = 1000 @@ -150,7 +150,7 @@ class EpochPollRunnable( private[continuous] var failureReason: Throwable = _ private val epochEndpoint = EpochCoordinatorRef.get( - context.getLocalProperty(ContinuousExecution.RUN_ID_KEY), SparkEnv.get) + context.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), SparkEnv.get) private var currentEpoch = context.getLocalProperty(ContinuousExecution.START_EPOCH_KEY).toLong override def run(): Unit = { @@ -177,7 +177,7 @@ class DataReaderThread( failedFlag: AtomicBoolean) extends Thread( s"continuous-reader--${context.partitionId()}--" + - s"${context.getLocalProperty(ContinuousExecution.RUN_ID_KEY)}") { + s"${context.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY)}") { private[continuous] var failureReason: Throwable = _ override def run(): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index 45b794c70a50a..c0507224f9be8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -57,6 +57,9 @@ class ContinuousExecution( @volatile protected var continuousSources: Seq[ContinuousReader] = _ override protected def sources: Seq[BaseStreamingSource] = continuousSources + // For use only in test harnesses. 
+ private[sql] var currentEpochCoordinatorId: String = _ + override lazy val logicalPlan: LogicalPlan = { assert(queryExecutionThread eq Thread.currentThread, "logicalPlan must be initialized in StreamExecutionThread " + @@ -149,7 +152,6 @@ class ContinuousExecution( * @param sparkSessionForQuery Isolated [[SparkSession]] to run the continuous query with. */ private def runContinuous(sparkSessionForQuery: SparkSession): Unit = { - currentRunId = UUID.randomUUID // A list of attributes that will need to be updated. val replacements = new ArrayBuffer[(Attribute, Attribute)] // Translate from continuous relation to the underlying data source. @@ -219,15 +221,19 @@ class ContinuousExecution( lastExecution.executedPlan // Force the lazy generation of execution plan } - sparkSession.sparkContext.setLocalProperty( + sparkSessionForQuery.sparkContext.setLocalProperty( ContinuousExecution.START_EPOCH_KEY, currentBatchId.toString) - sparkSession.sparkContext.setLocalProperty( - ContinuousExecution.RUN_ID_KEY, runId.toString) + // Add another random ID on top of the run ID, to distinguish epoch coordinators across + // reconfigurations. + val epochCoordinatorId = s"$runId--${UUID.randomUUID}" + currentEpochCoordinatorId = epochCoordinatorId + sparkSessionForQuery.sparkContext.setLocalProperty( + ContinuousExecution.EPOCH_COORDINATOR_ID_KEY, epochCoordinatorId) // Use the parent Spark session for the endpoint since it's where this query ID is registered. val epochEndpoint = EpochCoordinatorRef.create( - writer.get(), reader, this, currentBatchId, sparkSession, SparkEnv.get) + writer.get(), reader, this, epochCoordinatorId, currentBatchId, sparkSession, SparkEnv.get) val epochUpdateThread = new Thread(new Runnable { override def run: Unit = { try { @@ -359,5 +365,5 @@ class ContinuousExecution( object ContinuousExecution { val START_EPOCH_KEY = "__continuous_start_epoch" - val RUN_ID_KEY = "__run_id" + val EPOCH_COORDINATOR_ID_KEY = "__epoch_coordinator_id" } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala index 40dcbecade814..90b3584aa0436 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala @@ -79,7 +79,7 @@ private[sql] case class ReportPartitionOffset( /** Helper object used to create reference to [[EpochCoordinator]]. */ private[sql] object EpochCoordinatorRef extends Logging { - private def endpointName(runId: String) = s"EpochCoordinator-$runId" + private def endpointName(id: String) = s"EpochCoordinator-$id" /** * Create a reference to a new [[EpochCoordinator]]. 
@@ -88,18 +88,19 @@ private[sql] object EpochCoordinatorRef extends Logging { writer: ContinuousWriter, reader: ContinuousReader, query: ContinuousExecution, + epochCoordinatorId: String, startEpoch: Long, session: SparkSession, env: SparkEnv): RpcEndpointRef = synchronized { val coordinator = new EpochCoordinator( writer, reader, query, startEpoch, session, env.rpcEnv) - val ref = env.rpcEnv.setupEndpoint(endpointName(query.runId.toString()), coordinator) + val ref = env.rpcEnv.setupEndpoint(endpointName(epochCoordinatorId), coordinator) logInfo("Registered EpochCoordinator endpoint") ref } - def get(runId: String, env: SparkEnv): RpcEndpointRef = synchronized { - val rpcEndpointRef = RpcUtils.makeDriverRef(endpointName(runId), env.conf, env.rpcEnv) + def get(id: String, env: SparkEnv): RpcEndpointRef = synchronized { + val rpcEndpointRef = RpcUtils.makeDriverRef(endpointName(id), env.conf, env.rpcEnv) logDebug("Retrieved existing EpochCoordinator endpoint") rpcEndpointRef } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index c75247e0f6ed8..efdb0e0e7cf1c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -263,7 +263,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def apply(): AssertOnQuery = Execute { case s: ContinuousExecution => - val newEpoch = EpochCoordinatorRef.get(s.runId.toString, SparkEnv.get) + val newEpoch = EpochCoordinatorRef.get(s.currentEpochCoordinatorId, SparkEnv.get) .askSync[Long](IncrementAndGetEpoch) s.awaitEpoch(newEpoch - 1) case _ => throw new IllegalStateException("microbatch cannot increment epoch") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala index 9ff02dee288fb..79d65192a14aa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala @@ -174,6 +174,31 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { } } + test("continuous processing listeners should receive QueryTerminatedEvent") { + val df = spark.readStream.format("rate").load() + val listeners = (1 to 5).map(_ => new EventCollector) + try { + listeners.foreach(listener => spark.streams.addListener(listener)) + testStream(df, OutputMode.Append, useV2Sink = true)( + StartStream(Trigger.Continuous(1000)), + StopStream, + AssertOnQuery { query => + eventually(Timeout(streamingTimeout)) { + listeners.foreach(listener => assert(listener.terminationEvent !== null)) + listeners.foreach(listener => assert(listener.terminationEvent.id === query.id)) + listeners.foreach(listener => assert(listener.terminationEvent.runId === query.runId)) + listeners.foreach(listener => assert(listener.terminationEvent.exception === None)) + } + listeners.foreach(listener => listener.checkAsyncErrors()) + listeners.foreach(listener => listener.reset()) + true + } + ) + } finally { + listeners.foreach(spark.streams.removeListener) + } + } + test("adding and removing listener") { def isListenerActive(listener: EventCollector): Boolean = { listener.reset() From 6e509fde3f056316f46c71b672a7d69adb1b4f8e Mon Sep 17 00:00:00 2001 From: Li Jin Date: Thu, 18 Jan 2018 
07:26:43 +0900 Subject: [PATCH 116/566] [SPARK-23047][PYTHON][SQL] Change MapVector to NullableMapVector in ArrowColumnVector ## What changes were proposed in this pull request? This PR changes usage of `MapVector` in Spark codebase to use `NullableMapVector`. `MapVector` is an internal Arrow class that is not supposed to be used directly. We should use `NullableMapVector` instead. ## How was this patch tested? Existing test. Author: Li Jin Closes #20239 from icexelloss/arrow-map-vector. (cherry picked from commit 4e6f8fb150ae09c7d1de6beecb2b98e5afa5da19) Signed-off-by: hyukjinkwon --- .../sql/vectorized/ArrowColumnVector.java | 13 +++++-- .../vectorized/ArrowColumnVectorSuite.scala | 36 +++++++++++++++++++ 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java index 708333213f3f1..eb69001fe677e 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java @@ -247,8 +247,8 @@ public ArrowColumnVector(ValueVector vector) { childColumns = new ArrowColumnVector[1]; childColumns[0] = new ArrowColumnVector(listVector.getDataVector()); - } else if (vector instanceof MapVector) { - MapVector mapVector = (MapVector) vector; + } else if (vector instanceof NullableMapVector) { + NullableMapVector mapVector = (NullableMapVector) vector; accessor = new StructAccessor(mapVector); childColumns = new ArrowColumnVector[mapVector.size()]; @@ -553,9 +553,16 @@ final int getArrayOffset(int rowId) { } } + /** + * Any call to "get" method will throw UnsupportedOperationException. + * + * Access struct values in a ArrowColumnVector doesn't use this accessor. Instead, it uses getStruct() method defined + * in the parent class. Any call to "get" method in this class is a bug in the code. 
+ * + */ private static class StructAccessor extends ArrowVectorAccessor { - StructAccessor(MapVector vector) { + StructAccessor(NullableMapVector vector) { super(vector); } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala index 7304803a092c0..53432669e215d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala @@ -322,6 +322,42 @@ class ArrowColumnVectorSuite extends SparkFunSuite { allocator.close() } + test("non nullable struct") { + val allocator = ArrowUtils.rootAllocator.newChildAllocator("struct", 0, Long.MaxValue) + val schema = new StructType().add("int", IntegerType).add("long", LongType) + val vector = ArrowUtils.toArrowField("struct", schema, nullable = false, null) + .createVector(allocator).asInstanceOf[NullableMapVector] + + vector.allocateNew() + val intVector = vector.getChildByOrdinal(0).asInstanceOf[IntVector] + val longVector = vector.getChildByOrdinal(1).asInstanceOf[BigIntVector] + + vector.setIndexDefined(0) + intVector.setSafe(0, 1) + longVector.setSafe(0, 1L) + + vector.setIndexDefined(1) + intVector.setSafe(1, 2) + longVector.setNull(1) + + vector.setValueCount(2) + + val columnVector = new ArrowColumnVector(vector) + assert(columnVector.dataType === schema) + assert(columnVector.numNulls === 0) + + val row0 = columnVector.getStruct(0, 2) + assert(row0.getInt(0) === 1) + assert(row0.getLong(1) === 1L) + + val row1 = columnVector.getStruct(1, 2) + assert(row1.getInt(0) === 2) + assert(row1.isNullAt(1)) + + columnVector.close() + allocator.close() + } + test("struct") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("struct", 0, Long.MaxValue) val schema = new StructType().add("int", IntegerType).add("long", LongType) From b84c2a30665ebbd65feb7418826501f6c959eb96 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Thu, 18 Jan 2018 07:30:54 +0900 Subject: [PATCH 117/566] [SPARK-23132][PYTHON][ML] Run doctests in ml.image when testing ## What changes were proposed in this pull request? This PR proposes to actually run the doctests in `ml/image.py`. ## How was this patch tested? doctests in `python/pyspark/ml/image.py`. Author: hyukjinkwon Closes #20294 from HyukjinKwon/trigger-image. (cherry picked from commit 45ad97df87c89cb94ce9564e5773897b6d9326f5) Signed-off-by: hyukjinkwon --- python/pyspark/ml/image.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py index c9b840276f675..2d86c7f03860c 100644 --- a/python/pyspark/ml/image.py +++ b/python/pyspark/ml/image.py @@ -194,9 +194,9 @@ def readImages(self, path, recursive=False, numPartitions=-1, :return: a :class:`DataFrame` with a single column of "images", see ImageSchema for details. - >>> df = ImageSchema.readImages('python/test_support/image/kittens', recursive=True) + >>> df = ImageSchema.readImages('data/mllib/images/kittens', recursive=True) >>> df.count() - 4 + 5 .. 
versionadded:: 2.3.0 """ @@ -216,3 +216,25 @@ def readImages(self, path, recursive=False, numPartitions=-1, def _disallow_instance(_): raise RuntimeError("Creating instance of _ImageSchema class is disallowed.") _ImageSchema.__init__ = _disallow_instance + + +def _test(): + import doctest + import pyspark.ml.image + globs = pyspark.ml.image.__dict__.copy() + spark = SparkSession.builder\ + .master("local[2]")\ + .appName("ml.image tests")\ + .getOrCreate() + globs['spark'] = spark + + (failure_count, test_count) = doctest.testmod( + pyspark.ml.image, globs=globs, + optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) + spark.stop() + if failure_count: + exit(-1) + + +if __name__ == "__main__": + _test() From 9783aea2c75700e7ce9551ccfd33e43765de8981 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Wed, 17 Jan 2018 16:40:02 -0800 Subject: [PATCH 118/566] [SPARK-23119][SS] Minor fixes to V2 streaming APIs ## What changes were proposed in this pull request? - Added `InterfaceStability.Evolving` annotations - Improved docs. ## How was this patch tested? Existing tests. Author: Tathagata Das Closes #20286 from tdas/SPARK-23119. (cherry picked from commit bac0d661af6092dd26638223156827aceb901229) Signed-off-by: Shixiong Zhu --- .../v2/streaming/ContinuousReadSupport.java | 2 ++ .../streaming/reader/ContinuousDataReader.java | 2 ++ .../v2/streaming/reader/ContinuousReader.java | 9 +++++++-- .../v2/streaming/reader/MicroBatchReader.java | 5 +++++ .../sources/v2/streaming/reader/Offset.java | 18 +++++++++++++----- .../v2/streaming/reader/PartitionOffset.java | 3 +++ .../sources/v2/writer/DataSourceV2Writer.java | 5 ++++- 7 files changed, 36 insertions(+), 8 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java index 3136cee1f655f..9a93a806b0efc 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java @@ -19,6 +19,7 @@ import java.util.Optional; +import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.sources.v2.DataSourceV2; import org.apache.spark.sql.sources.v2.DataSourceV2Options; import org.apache.spark.sql.sources.v2.streaming.reader.ContinuousReader; @@ -28,6 +29,7 @@ * A mix-in interface for {@link DataSourceV2}. Data sources can implement this interface to * provide data reading ability for continuous stream processing. */ +@InterfaceStability.Evolving public interface ContinuousReadSupport extends DataSourceV2 { /** * Creates a {@link ContinuousReader} to scan the data from this data source. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousDataReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousDataReader.java index ca9a290e97a02..3f13a4dbf5793 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousDataReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousDataReader.java @@ -17,11 +17,13 @@ package org.apache.spark.sql.sources.v2.streaming.reader; +import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.sources.v2.reader.DataReader; /** * A variation on {@link DataReader} for use with streaming in continuous processing mode. 
*/ +@InterfaceStability.Evolving public interface ContinuousDataReader extends DataReader { /** * Get the offset of the current record, or the start offset if no records have been read. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java index f0b205869ed6c..745f1ce502443 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java @@ -17,6 +17,7 @@ package org.apache.spark.sql.sources.v2.streaming.reader; +import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.execution.streaming.BaseStreamingSource; import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader; @@ -27,11 +28,15 @@ * interface to allow reading in a continuous processing mode stream. * * Implementations must ensure each read task output is a {@link ContinuousDataReader}. + * + * Note: This class currently extends {@link BaseStreamingSource} to maintain compatibility with + * DataSource V1 APIs. This extension will be removed once we get rid of V1 completely. */ +@InterfaceStability.Evolving public interface ContinuousReader extends BaseStreamingSource, DataSourceV2Reader { /** - * Merge offsets coming from {@link ContinuousDataReader} instances in each partition to - * a single global offset. + * Merge partitioned offsets coming from {@link ContinuousDataReader} instances for each + * partition to a single global offset. */ Offset mergeOffsets(PartitionOffset[] offsets); diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java index 70ff756806032..02f37cebc7484 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java @@ -17,6 +17,7 @@ package org.apache.spark.sql.sources.v2.streaming.reader; +import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader; import org.apache.spark.sql.execution.streaming.BaseStreamingSource; @@ -25,7 +26,11 @@ /** * A mix-in interface for {@link DataSourceV2Reader}. Data source readers can implement this * interface to indicate they allow micro-batch streaming reads. + * + * Note: This class currently extends {@link BaseStreamingSource} to maintain compatibility with + * DataSource V1 APIs. This extension will be removed once we get rid of V1 completely. */ +@InterfaceStability.Evolving public interface MicroBatchReader extends DataSourceV2Reader, BaseStreamingSource { /** * Set the desired offset range for read tasks created from this reader. 
Read tasks will diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java index 60b87f2ac0756..abba3e7188b13 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java @@ -17,12 +17,20 @@ package org.apache.spark.sql.sources.v2.streaming.reader; +import org.apache.spark.annotation.InterfaceStability; + /** - * An abstract representation of progress through a [[MicroBatchReader]] or [[ContinuousReader]]. - * During execution, Offsets provided by the data source implementation will be logged and used as - * restart checkpoints. Sources should provide an Offset implementation which they can use to - * reconstruct the stream position where the offset was taken. + * An abstract representation of progress through a {@link MicroBatchReader} or + * {@link ContinuousReader}. + * During execution, offsets provided by the data source implementation will be logged and used as + * restart checkpoints. Each source should provide an offset implementation which the source can use + * to reconstruct a position in the stream up to which data has been seen/processed. + * + * Note: This class currently extends {@link org.apache.spark.sql.execution.streaming.Offset} to + * maintain compatibility with DataSource V1 APIs. This extension will be removed once we + * get rid of V1 completely. */ +@InterfaceStability.Evolving public abstract class Offset extends org.apache.spark.sql.execution.streaming.Offset { /** * A JSON-serialized representation of an Offset that is @@ -37,7 +45,7 @@ public abstract class Offset extends org.apache.spark.sql.execution.streaming.Of /** * Equality based on JSON string representation. We leverage the * JSON representation for normalization between the Offset's - * in memory and on disk representations. + * in deserialized and serialized representations. */ @Override public boolean equals(Object obj) { diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java index eca0085c8a8ce..4688b85f49f5f 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java @@ -19,11 +19,14 @@ import java.io.Serializable; +import org.apache.spark.annotation.InterfaceStability; + /** * Used for per-partition offsets in continuous processing. ContinuousReader implementations will * provide a method to merge these into a global Offset. * * These offsets must be serializable. 
*/ +@InterfaceStability.Evolving public interface PartitionOffset extends Serializable { } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java index fc37b9a516f82..317ac45bcfd74 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java @@ -22,11 +22,14 @@ import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.sources.v2.DataSourceV2Options; import org.apache.spark.sql.sources.v2.WriteSupport; +import org.apache.spark.sql.streaming.OutputMode; import org.apache.spark.sql.types.StructType; /** * A data source writer that is returned by - * {@link WriteSupport#createWriter(String, StructType, SaveMode, DataSourceV2Options)}. + * {@link WriteSupport#createWriter(String, StructType, SaveMode, DataSourceV2Options)}/ + * {@link org.apache.spark.sql.sources.v2.streaming.MicroBatchWriteSupport#createMicroBatchWriter(String, long, StructType, OutputMode, DataSourceV2Options)}/ + * {@link org.apache.spark.sql.sources.v2.streaming.ContinuousWriteSupport#createContinuousWriter(String, StructType, OutputMode, DataSourceV2Options)}. * It can mix in various writing optimization interfaces to speed up the data saving. The actual * writing logic is delegated to {@link DataWriter}. * From 050c1e24e506ff224bcf4e3e458e57fbd216765c Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Wed, 17 Jan 2018 16:41:43 -0800 Subject: [PATCH 119/566] [SPARK-23064][DOCS][SS] Added documentation for stream-stream joins ## What changes were proposed in this pull request? Added documentation for stream-stream joins ![image](https://user-images.githubusercontent.com/663212/35018744-e999895a-fad7-11e7-9d6a-8c7a73e6eb9c.png) ![image](https://user-images.githubusercontent.com/663212/35018775-157eb464-fad8-11e7-879e-47a2fcbd8690.png) ![image](https://user-images.githubusercontent.com/663212/35018784-27791a24-fad8-11e7-98f4-7ff246f62a74.png) ![image](https://user-images.githubusercontent.com/663212/35018791-36a80334-fad8-11e7-9791-f85efa7c6ba2.png) ## How was this patch tested? N/a Author: Tathagata Das Closes #20255 from tdas/join-docs. (cherry picked from commit 1002bd6b23ff78a010ca259ea76988ef4c478c6e) Signed-off-by: Shixiong Zhu --- .../structured-streaming-programming-guide.md | 338 +++++++++++++++++- 1 file changed, 326 insertions(+), 12 deletions(-) diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index de13e281916db..1779a4215e085 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -1051,7 +1051,19 @@ output mode. ### Join Operations -Streaming DataFrames can be joined with static DataFrames to create new streaming DataFrames. Here are a few examples. +Structured Streaming supports joining a streaming Dataset/DataFrame with a static Dataset/DataFrame +as well as another streaming Dataset/DataFrame. The result of the streaming join is generated +incrementally, similar to the results of streaming aggregations in the previous section. In this +section we will explore what type of joins (i.e. inner, outer, etc.) are supported in the above +cases. 
Note that in all the supported join types, the result of the join with a streaming
+Dataset/DataFrame will be exactly the same as if it were with a static Dataset/DataFrame
+containing the same data in the stream.
+
+
+#### Stream-static joins
+
+Since the introduction in Spark 2.0, Structured Streaming has supported joins (inner join and some
+types of outer joins) between a streaming and a static DataFrame/Dataset. Here is a simple example.
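A minimal Scala sketch of such a stream-static join (the source reads, paths, and the shared join column named `type` are illustrative assumptions, not part of the patch):

{% highlight scala %}
// Minimal sketch of joining a streaming DataFrame with a static one.
// Paths, formats and the `type` column are illustrative assumptions.
val staticDf = spark.read.format("parquet").load("/path/to/static-data")

val streamingDf = spark.readStream
  .schema(staticDf.schema)              // streaming file sources need an explicit schema
  .format("parquet")
  .load("/path/to/streaming-data")

streamingDf.join(staticDf, Seq("type"))                 // inner equi-join with a static DataFrame
streamingDf.join(staticDf, Seq("type"), "right_outer")  // right outer join with a static DataFrame
{% endhighlight %}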
    @@ -1089,6 +1101,300 @@ streamingDf.join(staticDf, "type", "right_join") # right outer join with a stat
+Note that stream-static joins are not stateful, so no state management is necessary.
+However, a few types of stream-static outer joins are not yet supported.
+These are listed at the [end of this Join section](#support-matrix-for-joins-in-streaming-queries).
+
+#### Stream-stream Joins
+In Spark 2.3, we have added support for stream-stream joins, that is, you can join two streaming
+Datasets/DataFrames. The challenge of generating join results between two data streams is that,
+at any point of time, the view of the dataset is incomplete for both sides of the join, making
+it much harder to find matches between inputs. Any row received from one input stream can match
+with any future, yet-to-be-received row from the other input stream. Hence, for both the input
+streams, we buffer past input as streaming state, so that we can match every future input with
+past input and accordingly generate joined results. Furthermore, similar to streaming aggregations,
+we automatically handle late, out-of-order data and can limit the state using watermarks.
+Let’s discuss the different types of supported stream-stream joins and how to use them.
+
+##### Inner Joins with optional Watermarking
+Inner joins on any kind of columns along with any kind of join conditions are supported.
+However, as the stream runs, the size of streaming state will keep growing indefinitely as
+*all* past input must be saved, since any new input can match with any input from the past.
+To avoid unbounded state, you have to define additional join conditions such that indefinitely
+old inputs cannot match with future inputs and therefore can be cleared from the state.
+In other words, you will have to do the following additional steps in the join.
+
+1. Define watermark delays on both inputs such that the engine knows how delayed the input can be
+(similar to streaming aggregations)
+
+1. Define a constraint on event-time across the two inputs such that the engine can figure out when
+old rows of one input are not going to be required (i.e. will not satisfy the time constraint) for
+matches with the other input. This constraint can be defined in one of two ways.
+
+    1. Time range join conditions (e.g. `...JOIN ON leftTime BETWEEN rightTime AND rightTime + INTERVAL 1 HOUR`),
+
+    1. Join on event-time windows (e.g. `...JOIN ON leftTimeWindow = rightTimeWindow`); a sketch of
+    this variant appears after the inner join code examples below.
+
+Let’s understand this with an example.
+
+Let’s say we want to join a stream of advertisement impressions (when an ad was shown) with
+another stream of user clicks on advertisements to correlate when impressions led to
+monetizable clicks. To allow the state cleanup in this stream-stream join, you will have to
+specify the watermarking delays and the time constraints as follows.
+
+1. Watermark delays: Say, the impressions and the corresponding clicks can be late/out-of-order
+in event-time by at most 2 and 3 hours, respectively.
+
+1. Event-time range condition: Say, a click can occur within a time range of 0 seconds to 1 hour
+after the corresponding impression.
+
+The code would look like this.
+
+
    +
    + +{% highlight scala %} +import org.apache.spark.sql.functions.expr + +val impressions = spark.readStream. ... +val clicks = spark.readStream. ... + +// Apply watermarks on event-time columns +val impressionsWithWatermark = impressions.withWatermark("impressionTime", "2 hours") +val clicksWithWatermark = clicks.withWatermark("clickTime", "3 hours") + +// Join with event-time constraints +impressionsWithWatermark.join( + clicksWithWatermark, + expr(""" + clickAdId = impressionAdId AND + clickTime >= impressionTime AND + clickTime <= impressionTime + interval 1 hour + """) +) + +{% endhighlight %} + +
    +
    + +{% highlight java %} +import static org.apache.spark.sql.functions.expr + +Dataset impressions = spark.readStream(). ... +Dataset clicks = spark.readStream(). ... + +// Apply watermarks on event-time columns +Dataset impressionsWithWatermark = impressions.withWatermark("impressionTime", "2 hours"); +Dataset clicksWithWatermark = clicks.withWatermark("clickTime", "3 hours"); + +// Join with event-time constraints +impressionsWithWatermark.join( + clicksWithWatermark, + expr( + "clickAdId = impressionAdId AND " + + "clickTime >= impressionTime AND " + + "clickTime <= impressionTime + interval 1 hour ") +); + +{% endhighlight %} + + +
    +
    + +{% highlight python %} +from pyspark.sql.functions import expr + +impressions = spark.readStream. ... +clicks = spark.readStream. ... + +# Apply watermarks on event-time columns +impressionsWithWatermark = impressions.withWatermark("impressionTime", "2 hours") +clicksWithWatermark = clicks.withWatermark("clickTime", "3 hours") + +# Join with event-time constraints +impressionsWithWatermark.join( + clicksWithWatermark, + expr(""" + clickAdId = impressionAdId AND + clickTime >= impressionTime AND + clickTime <= impressionTime + interval 1 hour + """) +) + +{% endhighlight %} + +
    +
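As noted above, the second way to express the constraint is to join on event-time windows. A minimal sketch of that variant, reusing the watermarked DataFrames from the example (the 1-hour tumbling window and the derived column names are illustrative assumptions):

{% highlight scala %}
// Bucket both sides into the same tumbling windows and require the windows to match,
// so that old windows can eventually be dropped from the join state.
import org.apache.spark.sql.functions.{col, expr, window}

val impressionsByWindow = impressionsWithWatermark
  .withColumn("impressionWindow", window(col("impressionTime"), "1 hour"))
val clicksByWindow = clicksWithWatermark
  .withColumn("clickWindow", window(col("clickTime"), "1 hour"))

impressionsByWindow.join(
  clicksByWindow,
  expr("clickAdId = impressionAdId AND clickWindow = impressionWindow"))
{% endhighlight %}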
+
+##### Outer Joins with Watermarking
+While the watermark + event-time constraints are optional for inner joins, for left and right outer
+joins they must be specified. This is because for generating the NULL results in outer joins, the
+engine must know when an input row is not going to match with anything in the future. Hence, the
+watermark + event-time constraints must be specified for generating correct results. Therefore,
+a query with an outer join will look quite like the ad-monetization example earlier, except that
+there will be an additional parameter specifying it to be an outer join.
+
+
    +
    + +{% highlight scala %} + +impressionsWithWatermark.join( + clicksWithWatermark, + expr(""" + clickAdId = impressionAdId AND + clickTime >= impressionTime AND + clickTime <= impressionTime + interval 1 hour + """), + joinType = "leftOuter" // can be "inner", "leftOuter", "rightOuter" + ) + +{% endhighlight %} + +
    +
    + +{% highlight java %} +impressionsWithWatermark.join( + clicksWithWatermark, + expr( + "clickAdId = impressionAdId AND " + + "clickTime >= impressionTime AND " + + "clickTime <= impressionTime + interval 1 hour "), + "leftOuter" // can be "inner", "leftOuter", "rightOuter" +); + +{% endhighlight %} + + +
    +
    + +{% highlight python %} +impressionsWithWatermark.join( + clicksWithWatermark, + expr(""" + clickAdId = impressionAdId AND + clickTime >= impressionTime AND + clickTime <= impressionTime + interval 1 hour + """), + "leftOuter" # can be "inner", "leftOuter", "rightOuter" +) + +{% endhighlight %} + +
    +
+
+However, note that the outer NULL results will be generated with a delay (depending on the specified
+watermark delay and the time range condition) because the engine has to wait for that long to ensure
+there were no matches and there will be no more matches in the future.
+
+##### Support matrix for joins in streaming queries
+
+| Left Input | Right Input | Join Type   | Support |
+|------------|-------------|-------------|---------|
+| Static     | Static      | All types   | Supported, since it is not on streaming data even though it can be present in a streaming query |
+| Stream     | Static      | Inner       | Supported, not stateful |
+| Stream     | Static      | Left Outer  | Supported, not stateful |
+| Stream     | Static      | Right Outer | Not supported |
+| Stream     | Static      | Full Outer  | Not supported |
+| Static     | Stream      | Inner       | Supported, not stateful |
+| Static     | Stream      | Left Outer  | Not supported |
+| Static     | Stream      | Right Outer | Supported, not stateful |
+| Static     | Stream      | Full Outer  | Not supported |
+| Stream     | Stream      | Inner       | Supported, optionally specify watermark on both sides + time constraints for state cleanup |
+| Stream     | Stream      | Left Outer  | Conditionally supported, must specify watermark on right + time constraints for correct results, optionally specify watermark on left for all state cleanup |
+| Stream     | Stream      | Right Outer | Conditionally supported, must specify watermark on left + time constraints for correct results, optionally specify watermark on right for all state cleanup |
+| Stream     | Stream      | Full Outer  | Not supported |
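To connect the matrix to the notes that follow, the conditionally supported stream-stream left outer join from the earlier example could be started as an Append-mode query roughly as sketched below (the sink format, output path and checkpoint location are illustrative assumptions):

{% highlight scala %}
import org.apache.spark.sql.functions.expr

// Sketch: run the earlier left outer stream-stream join as a streaming query.
// Joins currently require the Append output mode, which is the default here.
val joined = impressionsWithWatermark.join(
  clicksWithWatermark,
  expr("""
    clickAdId = impressionAdId AND
    clickTime >= impressionTime AND
    clickTime <= impressionTime + interval 1 hour
  """),
  joinType = "leftOuter")

val query = joined.writeStream
  .format("parquet")
  .option("path", "/tmp/ad-join/output")              // illustrative output location
  .option("checkpointLocation", "/tmp/ad-join/ckpt")  // illustrative checkpoint location
  .start()
{% endhighlight %}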
    + +Additional details on supported joins: + +- Joins can be cascaded, that is, you can do `df1.join(df2, ...).join(df3, ...).join(df4, ....)`. + +- As of Spark 2.3, you can use joins only when the query is in Append output mode. Other output modes are not yet supported. + +- As of Spark 2.3, you cannot use other non-map-like operations before joins. Here are a few examples of + what cannot be used. + + - Cannot use streaming aggregations before joins. + + - Cannot use mapGroupsWithState and flatMapGroupsWithState in Update mode before joins. + + ### Streaming Deduplication You can deduplicate records in data streams using a unique identifier in the events. This is exactly same as deduplication on static using a unique identifier column. The query will store the necessary amount of data from previous records such that it can filter duplicate records. Similar to aggregations, you can use deduplication with or without watermarking. @@ -1160,15 +1466,9 @@ Some of them are as follows. - Sorting operations are supported on streaming Datasets only after an aggregation and in Complete Output Mode. -- Outer joins between a streaming and a static Datasets are conditionally supported. - - + Full outer join with a streaming Dataset is not supported - - + Left outer join with a streaming Dataset on the right is not supported - - + Right outer join with a streaming Dataset on the left is not supported - -- Any kind of joins between two streaming Datasets is not yet supported. +- Few types of outer joins on streaming Datasets are not supported. See the + support matrix in the Join Operations section + for more details. In addition, there are some Dataset methods that will not work on streaming Datasets. They are actions that will immediately run queries and return results, which does not make sense on a streaming Dataset. Rather, those functionalities can be done by explicitly starting a streaming query (see the next section regarding that). @@ -1276,6 +1576,15 @@ Here is the compatibility matrix. Aggregations not allowed after flatMapGroupsWithState. + + Queries with joins + Append + + Update and Complete mode not supported yet. See the + support matrix in the Join Operations section + for more details on what types of joins are supported. 
+ + Other queries Append, Update @@ -2142,6 +2451,11 @@ write.stream(aggDF, "memory", outputMode = "complete", checkpointLocation = "pat **Talks** -- Spark Summit 2017 Talk - [Easy, Scalable, Fault-tolerant Stream Processing with Structured Streaming in Apache Spark](https://spark-summit.org/2017/events/easy-scalable-fault-tolerant-stream-processing-with-structured-streaming-in-apache-spark/) -- Spark Summit 2016 Talk - [A Deep Dive into Structured Streaming](https://spark-summit.org/2016/events/a-deep-dive-into-structured-streaming/) +- Spark Summit Europe 2017 + - Easy, Scalable, Fault-tolerant Stream Processing with Structured Streaming in Apache Spark - + [Part 1 slides/video](https://databricks.com/session/easy-scalable-fault-tolerant-stream-processing-with-structured-streaming-in-apache-spark), [Part 2 slides/video](https://databricks.com/session/easy-scalable-fault-tolerant-stream-processing-with-structured-streaming-in-apache-spark-continues) + - Deep Dive into Stateful Stream Processing in Structured Streaming - [slides/video](https://databricks.com/session/deep-dive-into-stateful-stream-processing-in-structured-streaming) +- Spark Summit 2016 + - A Deep Dive into Structured Streaming - [slides/video](https://spark-summit.org/2016/events/a-deep-dive-into-structured-streaming/) + From f2688ef0fbd9d355d13ce4056d35e99970f4cd47 Mon Sep 17 00:00:00 2001 From: Xiayun Sun Date: Wed, 17 Jan 2018 16:42:38 -0800 Subject: [PATCH 120/566] [SPARK-21996][SQL] read files with space in name for streaming ## What changes were proposed in this pull request? Structured streaming is now able to read files with space in file name (previously it would skip the file and output a warning) ## How was this patch tested? Added new unit test. Author: Xiayun Sun Closes #19247 from xysun/SPARK-21996. 
(cherry picked from commit 02194702068291b3af77486d01029fb848c36d7b) Signed-off-by: Shixiong Zhu --- .../streaming/FileStreamSource.scala | 2 +- .../sql/streaming/FileStreamSourceSuite.scala | 50 ++++++++++++++++++- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala index 0debd7db84757..8c016abc5b643 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala @@ -166,7 +166,7 @@ class FileStreamSource( val newDataSource = DataSource( sparkSession, - paths = files.map(_.path), + paths = files.map(f => new Path(new URI(f.path)).toString), userSpecifiedSchema = Some(schema), partitionColumns = partitionColumns, className = fileFormatClassName, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index 39bb572740617..5bb0f4d643bbe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -74,11 +74,11 @@ abstract class FileStreamSourceTest protected def addData(source: FileStreamSource): Unit } - case class AddTextFileData(content: String, src: File, tmp: File) + case class AddTextFileData(content: String, src: File, tmp: File, tmpFilePrefix: String = "text") extends AddFileData { override def addData(source: FileStreamSource): Unit = { - val tempFile = Utils.tempFileWith(new File(tmp, "text")) + val tempFile = Utils.tempFileWith(new File(tmp, tmpFilePrefix)) val finalFile = new File(src, tempFile.getName) src.mkdirs() require(stringToFile(tempFile, content).renameTo(finalFile)) @@ -408,6 +408,52 @@ class FileStreamSourceSuite extends FileStreamSourceTest { } } + test("SPARK-21996 read from text files -- file name has space") { + withTempDirs { case (src, tmp) => + val textStream = createFileStream("text", src.getCanonicalPath) + val filtered = textStream.filter($"value" contains "keep") + + testStream(filtered)( + AddTextFileData("drop1\nkeep2\nkeep3", src, tmp, "text text"), + CheckAnswer("keep2", "keep3") + ) + } + } + + test("SPARK-21996 read from text files generated by file sink -- file name has space") { + val testTableName = "FileStreamSourceTest" + withTable(testTableName) { + withTempDirs { case (src, checkpoint) => + val output = new File(src, "text text") + val inputData = MemoryStream[String] + val ds = inputData.toDS() + + val query = ds.writeStream + .option("checkpointLocation", checkpoint.getCanonicalPath) + .format("text") + .start(output.getCanonicalPath) + + try { + inputData.addData("foo") + failAfter(streamingTimeout) { + query.processAllAvailable() + } + } finally { + query.stop() + } + + val df2 = spark.readStream.format("text").load(output.getCanonicalPath) + val query2 = df2.writeStream.format("memory").queryName(testTableName).start() + try { + query2.processAllAvailable() + checkDatasetUnorderly(spark.table(testTableName).as[String], "foo") + } finally { + query2.stop() + } + } + } + } + test("read from textfile") { withTempDirs { case (src, tmp) => val textStream = spark.readStream.textFile(src.getCanonicalPath) From 3a80cc59b54bb8e92df507777e836f167a4db14e Mon Sep 17 00:00:00 2001 From: 
hyukjinkwon Date: Thu, 18 Jan 2018 14:51:05 +0900 Subject: [PATCH 121/566] [SPARK-23122][PYTHON][SQL] Deprecate register* for UDFs in SQLContext and Catalog in PySpark ## What changes were proposed in this pull request? This PR proposes to deprecate `register*` for UDFs in `SQLContext` and `Catalog` in Spark 2.3.0. These are inconsistent with Scala / Java APIs and also these basically do the same things with `spark.udf.register*`. Also, this PR moves the logcis from `[sqlContext|spark.catalog].register*` to `spark.udf.register*` and reuse the docstring. This PR also handles minor doc corrections. It also includes https://github.com/apache/spark/pull/20158 ## How was this patch tested? Manually tested, manually checked the API documentation and tests added to check if deprecated APIs call the aliases correctly. Author: hyukjinkwon Closes #20288 from HyukjinKwon/deprecate-udf. (cherry picked from commit 39d244d921d8d2d3ed741e8e8f1175515a74bdbd) Signed-off-by: Takuya UESHIN --- dev/sparktestsupport/modules.py | 1 + python/pyspark/sql/catalog.py | 91 ++-------------- python/pyspark/sql/context.py | 137 ++++-------------------- python/pyspark/sql/functions.py | 4 +- python/pyspark/sql/group.py | 3 +- python/pyspark/sql/session.py | 6 +- python/pyspark/sql/tests.py | 20 ++++ python/pyspark/sql/udf.py | 182 +++++++++++++++++++++++++++++++- 8 files changed, 234 insertions(+), 210 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 7164180a6a7b0..b900f0bd913c3 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -400,6 +400,7 @@ def __hash__(self): "pyspark.sql.functions", "pyspark.sql.readwriter", "pyspark.sql.streaming", + "pyspark.sql.udf", "pyspark.sql.window", "pyspark.sql.tests", ] diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py index 35fbe9e669adb..6aef0f22340be 100644 --- a/python/pyspark/sql/catalog.py +++ b/python/pyspark/sql/catalog.py @@ -224,92 +224,17 @@ def dropGlobalTempView(self, viewName): """ self._jcatalog.dropGlobalTempView(viewName) - @ignore_unicode_prefix @since(2.0) def registerFunction(self, name, f, returnType=None): - """Registers a Python function (including lambda function) or a :class:`UserDefinedFunction` - as a UDF. The registered UDF can be used in SQL statements. - - :func:`spark.udf.register` is an alias for :func:`spark.catalog.registerFunction`. - - In addition to a name and the function itself, `returnType` can be optionally specified. - 1) When f is a Python function, `returnType` defaults to a string. The produced object must - match the specified type. 2) When f is a :class:`UserDefinedFunction`, Spark uses the return - type of the given UDF as the return type of the registered UDF. The input parameter - `returnType` is None by default. If given by users, the value must be None. - - :param name: name of the UDF in SQL statements. - :param f: a Python function, or a wrapped/native UserDefinedFunction. The UDF can be either - row-at-a-time or vectorized. - :param returnType: the return type of the registered UDF. 
- :return: a wrapped/native :class:`UserDefinedFunction` - - >>> strlen = spark.catalog.registerFunction("stringLengthString", len) - >>> spark.sql("SELECT stringLengthString('test')").collect() - [Row(stringLengthString(test)=u'4')] - - >>> spark.sql("SELECT 'foo' AS text").select(strlen("text")).collect() - [Row(stringLengthString(text)=u'3')] - - >>> from pyspark.sql.types import IntegerType - >>> _ = spark.catalog.registerFunction("stringLengthInt", len, IntegerType()) - >>> spark.sql("SELECT stringLengthInt('test')").collect() - [Row(stringLengthInt(test)=4)] - - >>> from pyspark.sql.types import IntegerType - >>> _ = spark.udf.register("stringLengthInt", len, IntegerType()) - >>> spark.sql("SELECT stringLengthInt('test')").collect() - [Row(stringLengthInt(test)=4)] - - >>> from pyspark.sql.types import IntegerType - >>> from pyspark.sql.functions import udf - >>> slen = udf(lambda s: len(s), IntegerType()) - >>> _ = spark.udf.register("slen", slen) - >>> spark.sql("SELECT slen('test')").collect() - [Row(slen(test)=4)] - - >>> import random - >>> from pyspark.sql.functions import udf - >>> from pyspark.sql.types import IntegerType - >>> random_udf = udf(lambda: random.randint(0, 100), IntegerType()).asNondeterministic() - >>> new_random_udf = spark.catalog.registerFunction("random_udf", random_udf) - >>> spark.sql("SELECT random_udf()").collect() # doctest: +SKIP - [Row(random_udf()=82)] - >>> spark.range(1).select(new_random_udf()).collect() # doctest: +SKIP - [Row(()=26)] - - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType - >>> @pandas_udf("integer", PandasUDFType.SCALAR) # doctest: +SKIP - ... def add_one(x): - ... return x + 1 - ... - >>> _ = spark.udf.register("add_one", add_one) # doctest: +SKIP - >>> spark.sql("SELECT add_one(id) FROM range(3)").collect() # doctest: +SKIP - [Row(add_one(id)=1), Row(add_one(id)=2), Row(add_one(id)=3)] - """ + """An alias for :func:`spark.udf.register`. + See :meth:`pyspark.sql.UDFRegistration.register`. - # This is to check whether the input function is a wrapped/native UserDefinedFunction - if hasattr(f, 'asNondeterministic'): - if returnType is not None: - raise TypeError( - "Invalid returnType: None is expected when f is a UserDefinedFunction, " - "but got %s." % returnType) - if f.evalType not in [PythonEvalType.SQL_BATCHED_UDF, - PythonEvalType.SQL_PANDAS_SCALAR_UDF]: - raise ValueError( - "Invalid f: f must be either SQL_BATCHED_UDF or SQL_PANDAS_SCALAR_UDF") - register_udf = UserDefinedFunction(f.func, returnType=f.returnType, name=name, - evalType=f.evalType, - deterministic=f.deterministic) - return_udf = f - else: - if returnType is None: - returnType = StringType() - register_udf = UserDefinedFunction(f, returnType=returnType, name=name, - evalType=PythonEvalType.SQL_BATCHED_UDF) - return_udf = register_udf._wrapped() - self._jsparkSession.udf().registerPython(name, register_udf._judf) - return return_udf + .. note:: Deprecated in 2.3.0. Use :func:`spark.udf.register` instead. + """ + warnings.warn( + "Deprecated in 2.3.0. 
Use spark.udf.register instead.", + DeprecationWarning) + return self._sparkSession.udf.register(name, f, returnType) @since(2.0) def isCached(self, tableName): diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 85479095af594..cc1cd1a5842d9 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -29,9 +29,10 @@ from pyspark.sql.readwriter import DataFrameReader from pyspark.sql.streaming import DataStreamReader from pyspark.sql.types import IntegerType, Row, StringType +from pyspark.sql.udf import UDFRegistration from pyspark.sql.utils import install_exception_handler -__all__ = ["SQLContext", "HiveContext", "UDFRegistration"] +__all__ = ["SQLContext", "HiveContext"] class SQLContext(object): @@ -147,7 +148,7 @@ def udf(self): :return: :class:`UDFRegistration` """ - return UDFRegistration(self) + return self.sparkSession.udf @since(1.4) def range(self, start, end=None, step=1, numPartitions=None): @@ -172,113 +173,29 @@ def range(self, start, end=None, step=1, numPartitions=None): """ return self.sparkSession.range(start, end, step, numPartitions) - @ignore_unicode_prefix @since(1.2) def registerFunction(self, name, f, returnType=None): - """Registers a Python function (including lambda function) or a :class:`UserDefinedFunction` - as a UDF. The registered UDF can be used in SQL statements. - - :func:`spark.udf.register` is an alias for :func:`sqlContext.registerFunction`. - - In addition to a name and the function itself, `returnType` can be optionally specified. - 1) When f is a Python function, `returnType` defaults to a string. The produced object must - match the specified type. 2) When f is a :class:`UserDefinedFunction`, Spark uses the return - type of the given UDF as the return type of the registered UDF. The input parameter - `returnType` is None by default. If given by users, the value must be None. - - :param name: name of the UDF in SQL statements. - :param f: a Python function, or a wrapped/native UserDefinedFunction. The UDF can be either - row-at-a-time or vectorized. - :param returnType: the return type of the registered UDF. 
- :return: a wrapped/native :class:`UserDefinedFunction` - - >>> strlen = sqlContext.registerFunction("stringLengthString", lambda x: len(x)) - >>> sqlContext.sql("SELECT stringLengthString('test')").collect() - [Row(stringLengthString(test)=u'4')] - - >>> sqlContext.sql("SELECT 'foo' AS text").select(strlen("text")).collect() - [Row(stringLengthString(text)=u'3')] - - >>> from pyspark.sql.types import IntegerType - >>> _ = sqlContext.registerFunction("stringLengthInt", lambda x: len(x), IntegerType()) - >>> sqlContext.sql("SELECT stringLengthInt('test')").collect() - [Row(stringLengthInt(test)=4)] - - >>> from pyspark.sql.types import IntegerType - >>> _ = sqlContext.udf.register("stringLengthInt", lambda x: len(x), IntegerType()) - >>> sqlContext.sql("SELECT stringLengthInt('test')").collect() - [Row(stringLengthInt(test)=4)] - - >>> from pyspark.sql.types import IntegerType - >>> from pyspark.sql.functions import udf - >>> slen = udf(lambda s: len(s), IntegerType()) - >>> _ = sqlContext.udf.register("slen", slen) - >>> sqlContext.sql("SELECT slen('test')").collect() - [Row(slen(test)=4)] - - >>> import random - >>> from pyspark.sql.functions import udf - >>> from pyspark.sql.types import IntegerType - >>> random_udf = udf(lambda: random.randint(0, 100), IntegerType()).asNondeterministic() - >>> new_random_udf = sqlContext.registerFunction("random_udf", random_udf) - >>> sqlContext.sql("SELECT random_udf()").collect() # doctest: +SKIP - [Row(random_udf()=82)] - >>> sqlContext.range(1).select(new_random_udf()).collect() # doctest: +SKIP - [Row(()=26)] - - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType - >>> @pandas_udf("integer", PandasUDFType.SCALAR) # doctest: +SKIP - ... def add_one(x): - ... return x + 1 - ... - >>> _ = sqlContext.udf.register("add_one", add_one) # doctest: +SKIP - >>> sqlContext.sql("SELECT add_one(id) FROM range(3)").collect() # doctest: +SKIP - [Row(add_one(id)=1), Row(add_one(id)=2), Row(add_one(id)=3)] + """An alias for :func:`spark.udf.register`. + See :meth:`pyspark.sql.UDFRegistration.register`. + + .. note:: Deprecated in 2.3.0. Use :func:`spark.udf.register` instead. """ - return self.sparkSession.catalog.registerFunction(name, f, returnType) + warnings.warn( + "Deprecated in 2.3.0. Use spark.udf.register instead.", + DeprecationWarning) + return self.sparkSession.udf.register(name, f, returnType) - @ignore_unicode_prefix @since(2.1) def registerJavaFunction(self, name, javaClassName, returnType=None): - """Register a java UDF so it can be used in SQL statements. - - In addition to a name and the function itself, the return type can be optionally specified. - When the return type is not specified we would infer it via reflection. - :param name: name of the UDF - :param javaClassName: fully qualified name of java class - :param returnType: a :class:`pyspark.sql.types.DataType` object - - >>> sqlContext.registerJavaFunction("javaStringLength", - ... "test.org.apache.spark.sql.JavaStringLength", IntegerType()) - >>> sqlContext.sql("SELECT javaStringLength('test')").collect() - [Row(UDF:javaStringLength(test)=4)] - >>> sqlContext.registerJavaFunction("javaStringLength2", - ... "test.org.apache.spark.sql.JavaStringLength") - >>> sqlContext.sql("SELECT javaStringLength2('test')").collect() - [Row(UDF:javaStringLength2(test)=4)] + """An alias for :func:`spark.udf.registerJavaFunction`. + See :meth:`pyspark.sql.UDFRegistration.registerJavaFunction`. + .. note:: Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead. 
""" - jdt = None - if returnType is not None: - jdt = self.sparkSession._jsparkSession.parseDataType(returnType.json()) - self.sparkSession._jsparkSession.udf().registerJava(name, javaClassName, jdt) - - @ignore_unicode_prefix - @since(2.3) - def registerJavaUDAF(self, name, javaClassName): - """Register a java UDAF so it can be used in SQL statements. - - :param name: name of the UDAF - :param javaClassName: fully qualified name of java class - - >>> sqlContext.registerJavaUDAF("javaUDAF", - ... "test.org.apache.spark.sql.MyDoubleAvg") - >>> df = sqlContext.createDataFrame([(1, "a"),(2, "b"), (3, "a")],["id", "name"]) - >>> df.registerTempTable("df") - >>> sqlContext.sql("SELECT name, javaUDAF(id) as avg from df group by name").collect() - [Row(name=u'b', avg=102.0), Row(name=u'a', avg=102.0)] - """ - self.sparkSession._jsparkSession.udf().registerJavaUDAF(name, javaClassName) + warnings.warn( + "Deprecated in 2.3.0. Use spark.udf.registerJavaFunction instead.", + DeprecationWarning) + return self.sparkSession.udf.registerJavaFunction(name, javaClassName, returnType) # TODO(andrew): delete this once we refactor things to take in SparkSession def _inferSchema(self, rdd, samplingRatio=None): @@ -590,24 +507,6 @@ def refreshTable(self, tableName): self._ssql_ctx.refreshTable(tableName) -class UDFRegistration(object): - """Wrapper for user-defined function registration.""" - - def __init__(self, sqlContext): - self.sqlContext = sqlContext - - def register(self, name, f, returnType=None): - return self.sqlContext.registerFunction(name, f, returnType) - - def registerJavaFunction(self, name, javaClassName, returnType=None): - self.sqlContext.registerJavaFunction(name, javaClassName, returnType) - - def registerJavaUDAF(self, name, javaClassName): - self.sqlContext.registerJavaUDAF(name, javaClassName) - - register.__doc__ = SQLContext.registerFunction.__doc__ - - def _test(): import os import doctest diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index f7b3f29764040..988c1d25259bc 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2103,7 +2103,7 @@ def udf(f=None, returnType=StringType()): >>> import random >>> random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic() - .. note:: The user-defined functions do not support conditional expressions or short curcuiting + .. note:: The user-defined functions do not support conditional expressions or short circuiting in boolean expressions and it ends up with being executed all internally. If the functions can fail on special rows, the workaround is to incorporate the condition into the functions. @@ -2231,7 +2231,7 @@ def pandas_udf(f=None, returnType=None, functionType=None): ... return pd.Series(np.random.randn(len(v)) >>> random = random.asNondeterministic() # doctest: +SKIP - .. note:: The user-defined functions do not support conditional expressions or short curcuiting + .. note:: The user-defined functions do not support conditional expressions or short circuiting in boolean expressions and it ends up with being executed all internally. If the functions can fail on special rows, the workaround is to incorporate the condition into the functions. 
""" diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py index 09fae46adf014..22061b83eb78c 100644 --- a/python/pyspark/sql/group.py +++ b/python/pyspark/sql/group.py @@ -212,7 +212,8 @@ def apply(self, udf): This function does not support partial aggregation, and requires shuffling all the data in the :class:`DataFrame`. - :param udf: A function object returned by :meth:`pyspark.sql.functions.pandas_udf` + :param udf: a group map user-defined function returned by + :meth:`pyspark.sql.functions.pandas_udf`. >>> from pyspark.sql.functions import pandas_udf, PandasUDFType >>> df = spark.createDataFrame( diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 604021c1f45cc..6c84023c43fb6 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -29,7 +29,6 @@ from pyspark import since from pyspark.rdd import RDD, ignore_unicode_prefix -from pyspark.sql.catalog import Catalog from pyspark.sql.conf import RuntimeConfig from pyspark.sql.dataframe import DataFrame from pyspark.sql.readwriter import DataFrameReader @@ -280,6 +279,7 @@ def catalog(self): :return: :class:`Catalog` """ + from pyspark.sql.catalog import Catalog if not hasattr(self, "_catalog"): self._catalog = Catalog(self) return self._catalog @@ -291,8 +291,8 @@ def udf(self): :return: :class:`UDFRegistration` """ - from pyspark.sql.context import UDFRegistration - return UDFRegistration(self._wrapped) + from pyspark.sql.udf import UDFRegistration + return UDFRegistration(self) @since(2.0) def range(self, start, end=None, step=1, numPartitions=None): diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 8906618666b14..f84aa3d68b808 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -372,6 +372,12 @@ def test_udf(self): [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect() self.assertEqual(row[0], 5) + # This is to check if a deprecated 'SQLContext.registerFunction' can call its alias. + sqlContext = self.spark._wrapped + sqlContext.registerFunction("oneArg", lambda x: len(x), IntegerType()) + [row] = sqlContext.sql("SELECT oneArg('test')").collect() + self.assertEqual(row[0], 4) + def test_udf2(self): self.spark.catalog.registerFunction("strlen", lambda string: len(string), IntegerType()) self.spark.createDataFrame(self.sc.parallelize([Row(a="test")]))\ @@ -577,11 +583,25 @@ def test_udf_registration_returns_udf(self): df.select(add_three("id").alias("plus_three")).collect() ) + # This is to check if a 'SQLContext.udf' can call its alias. + sqlContext = self.spark._wrapped + add_four = sqlContext.udf.register("add_four", lambda x: x + 4, IntegerType()) + + self.assertListEqual( + df.selectExpr("add_four(id) AS plus_four").collect(), + df.select(add_four("id").alias("plus_four")).collect() + ) + def test_non_existed_udf(self): spark = self.spark self.assertRaisesRegexp(AnalysisException, "Can not load class non_existed_udf", lambda: spark.udf.registerJavaFunction("udf1", "non_existed_udf")) + # This is to check if a deprecated 'SQLContext.registerJavaFunction' can call its alias. 
+ sqlContext = spark._wrapped + self.assertRaisesRegexp(AnalysisException, "Can not load class non_existed_udf", + lambda: sqlContext.registerJavaFunction("udf1", "non_existed_udf")) + def test_non_existed_udaf(self): spark = self.spark self.assertRaisesRegexp(AnalysisException, "Can not load class non_existed_udaf", diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 5e80ab9165867..1943bb73f9ac2 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -19,11 +19,13 @@ """ import functools -from pyspark import SparkContext -from pyspark.rdd import _prepare_for_python_RDD, PythonEvalType +from pyspark import SparkContext, since +from pyspark.rdd import _prepare_for_python_RDD, PythonEvalType, ignore_unicode_prefix from pyspark.sql.column import Column, _to_java_column, _to_seq from pyspark.sql.types import StringType, DataType, StructType, _parse_datatype_string +__all__ = ["UDFRegistration"] + def _wrap_function(sc, func, returnType): command = (func, returnType) @@ -181,3 +183,179 @@ def asNondeterministic(self): """ self.deterministic = False return self + + +class UDFRegistration(object): + """ + Wrapper for user-defined function registration. This instance can be accessed by + :attr:`spark.udf` or :attr:`sqlContext.udf`. + + .. versionadded:: 1.3.1 + """ + + def __init__(self, sparkSession): + self.sparkSession = sparkSession + + @ignore_unicode_prefix + @since("1.3.1") + def register(self, name, f, returnType=None): + """Registers a Python function (including lambda function) or a user-defined function + in SQL statements. + + :param name: name of the user-defined function in SQL statements. + :param f: a Python function, or a user-defined function. The user-defined function can + be either row-at-a-time or vectorized. See :meth:`pyspark.sql.functions.udf` and + :meth:`pyspark.sql.functions.pandas_udf`. + :param returnType: the return type of the registered user-defined function. + :return: a user-defined function. + + `returnType` can be optionally specified when `f` is a Python function but not + when `f` is a user-defined function. Please see below. + + 1. When `f` is a Python function: + + `returnType` defaults to string type and can be optionally specified. The produced + object must match the specified type. In this case, this API works as if + `register(name, f, returnType=StringType())`. + + >>> strlen = spark.udf.register("stringLengthString", lambda x: len(x)) + >>> spark.sql("SELECT stringLengthString('test')").collect() + [Row(stringLengthString(test)=u'4')] + + >>> spark.sql("SELECT 'foo' AS text").select(strlen("text")).collect() + [Row(stringLengthString(text)=u'3')] + + >>> from pyspark.sql.types import IntegerType + >>> _ = spark.udf.register("stringLengthInt", lambda x: len(x), IntegerType()) + >>> spark.sql("SELECT stringLengthInt('test')").collect() + [Row(stringLengthInt(test)=4)] + + >>> from pyspark.sql.types import IntegerType + >>> _ = spark.udf.register("stringLengthInt", lambda x: len(x), IntegerType()) + >>> spark.sql("SELECT stringLengthInt('test')").collect() + [Row(stringLengthInt(test)=4)] + + 2. When `f` is a user-defined function: + + Spark uses the return type of the given user-defined function as the return type of + the registered user-defined function. `returnType` should not be specified. + In this case, this API works as if `register(name, f)`. 
+ + >>> from pyspark.sql.types import IntegerType + >>> from pyspark.sql.functions import udf + >>> slen = udf(lambda s: len(s), IntegerType()) + >>> _ = spark.udf.register("slen", slen) + >>> spark.sql("SELECT slen('test')").collect() + [Row(slen(test)=4)] + + >>> import random + >>> from pyspark.sql.functions import udf + >>> from pyspark.sql.types import IntegerType + >>> random_udf = udf(lambda: random.randint(0, 100), IntegerType()).asNondeterministic() + >>> new_random_udf = spark.udf.register("random_udf", random_udf) + >>> spark.sql("SELECT random_udf()").collect() # doctest: +SKIP + [Row(random_udf()=82)] + + >>> from pyspark.sql.functions import pandas_udf, PandasUDFType + >>> @pandas_udf("integer", PandasUDFType.SCALAR) # doctest: +SKIP + ... def add_one(x): + ... return x + 1 + ... + >>> _ = spark.udf.register("add_one", add_one) # doctest: +SKIP + >>> spark.sql("SELECT add_one(id) FROM range(3)").collect() # doctest: +SKIP + [Row(add_one(id)=1), Row(add_one(id)=2), Row(add_one(id)=3)] + + .. note:: Registration for a user-defined function (case 2.) was added from + Spark 2.3.0. + """ + + # This is to check whether the input function is from a user-defined function or + # Python function. + if hasattr(f, 'asNondeterministic'): + if returnType is not None: + raise TypeError( + "Invalid returnType: data type can not be specified when f is" + "a user-defined function, but got %s." % returnType) + if f.evalType not in [PythonEvalType.SQL_BATCHED_UDF, + PythonEvalType.SQL_PANDAS_SCALAR_UDF]: + raise ValueError( + "Invalid f: f must be either SQL_BATCHED_UDF or SQL_PANDAS_SCALAR_UDF") + register_udf = UserDefinedFunction(f.func, returnType=f.returnType, name=name, + evalType=f.evalType, + deterministic=f.deterministic) + return_udf = f + else: + if returnType is None: + returnType = StringType() + register_udf = UserDefinedFunction(f, returnType=returnType, name=name, + evalType=PythonEvalType.SQL_BATCHED_UDF) + return_udf = register_udf._wrapped() + self.sparkSession._jsparkSession.udf().registerPython(name, register_udf._judf) + return return_udf + + @ignore_unicode_prefix + @since(2.3) + def registerJavaFunction(self, name, javaClassName, returnType=None): + """Register a Java user-defined function so it can be used in SQL statements. + + In addition to a name and the function itself, the return type can be optionally specified. + When the return type is not specified we would infer it via reflection. + + :param name: name of the user-defined function + :param javaClassName: fully qualified name of java class + :param returnType: a :class:`pyspark.sql.types.DataType` object + + >>> from pyspark.sql.types import IntegerType + >>> spark.udf.registerJavaFunction( + ... "javaStringLength", "test.org.apache.spark.sql.JavaStringLength", IntegerType()) + >>> spark.sql("SELECT javaStringLength('test')").collect() + [Row(UDF:javaStringLength(test)=4)] + >>> spark.udf.registerJavaFunction( + ... "javaStringLength2", "test.org.apache.spark.sql.JavaStringLength") + >>> spark.sql("SELECT javaStringLength2('test')").collect() + [Row(UDF:javaStringLength2(test)=4)] + """ + + jdt = None + if returnType is not None: + jdt = self.sparkSession._jsparkSession.parseDataType(returnType.json()) + self.sparkSession._jsparkSession.udf().registerJava(name, javaClassName, jdt) + + @ignore_unicode_prefix + @since(2.3) + def registerJavaUDAF(self, name, javaClassName): + """Register a Java user-defined aggregate function so it can be used in SQL statements. 
+ + :param name: name of the user-defined aggregate function + :param javaClassName: fully qualified name of java class + + >>> spark.udf.registerJavaUDAF("javaUDAF", "test.org.apache.spark.sql.MyDoubleAvg") + >>> df = spark.createDataFrame([(1, "a"),(2, "b"), (3, "a")],["id", "name"]) + >>> df.registerTempTable("df") + >>> spark.sql("SELECT name, javaUDAF(id) as avg from df group by name").collect() + [Row(name=u'b', avg=102.0), Row(name=u'a', avg=102.0)] + """ + + self.sparkSession._jsparkSession.udf().registerJavaUDAF(name, javaClassName) + + +def _test(): + import doctest + from pyspark.sql import SparkSession + import pyspark.sql.udf + globs = pyspark.sql.udf.__dict__.copy() + spark = SparkSession.builder\ + .master("local[4]")\ + .appName("sql.udf tests")\ + .getOrCreate() + globs['spark'] = spark + (failure_count, test_count) = doctest.testmod( + pyspark.sql.udf, globs=globs, + optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) + spark.stop() + if failure_count: + exit(-1) + + +if __name__ == "__main__": + _test() From 2a87c3a77cbe40cbe5a8bdef41e3c37a660e2308 Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Wed, 17 Jan 2018 22:36:29 -0800 Subject: [PATCH 122/566] [SPARK-23052][SS] Migrate ConsoleSink to data source V2 api. ## What changes were proposed in this pull request? Migrate ConsoleSink to data source V2 api. Note that this includes a missing piece in DataStreamWriter required to specify a data source V2 writer. Note also that I've removed the "Rerun batch" part of the sink, because as far as I can tell this would never have actually happened. A MicroBatchExecution object will only commit each batch once for its lifetime, and a new MicroBatchExecution object would have a new ConsoleSink object which doesn't know it's retrying a batch. So I think this represents an anti-feature rather than a weakness in the V2 API. ## How was this patch tested? new unit test Author: Jose Torres Closes #20243 from jose-torres/console-sink. 
(cherry picked from commit 1c76a91e5fae11dcb66c453889e587b48039fdc9) Signed-off-by: Tathagata Das --- .../streaming/MicroBatchExecution.scala | 7 +- .../sql/execution/streaming/console.scala | 62 ++--- .../continuous/ContinuousExecution.scala | 9 +- .../streaming/sources/ConsoleWriter.scala | 64 +++++ .../sources/PackedRowWriterFactory.scala | 60 +++++ .../sql/streaming/DataStreamWriter.scala | 16 +- ...pache.spark.sql.sources.DataSourceRegister | 8 + .../sources/ConsoleWriterSuite.scala | 135 ++++++++++ .../sources/StreamingDataSourceV2Suite.scala | 249 ++++++++++++++++++ .../test/DataStreamReaderWriterSuite.scala | 25 -- 10 files changed, 551 insertions(+), 84 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriterSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index 70407f0580f97..7c3804547b736 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -91,11 +91,14 @@ class MicroBatchExecution( nextSourceId += 1 StreamingExecutionRelation(reader, output)(sparkSession) }) - case s @ StreamingRelationV2(_, _, _, output, v1Relation) => + case s @ StreamingRelationV2(_, sourceName, _, output, v1Relation) => v2ToExecutionRelationMap.getOrElseUpdate(s, { // Materialize source to avoid creating it in every batch val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" - assert(v1Relation.isDefined, "v2 execution didn't match but v1 was unavailable") + if (v1Relation.isEmpty) { + throw new UnsupportedOperationException( + s"Data source $sourceName does not support microbatch processing.") + } val source = v1Relation.get.dataSource.createSource(metadataPath) nextSourceId += 1 StreamingExecutionRelation(source, output)(sparkSession) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala index 71eaabe273fea..94820376ff7e7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala @@ -17,58 +17,36 @@ package org.apache.spark.sql.execution.streaming -import org.apache.spark.internal.Logging -import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext} -import org.apache.spark.sql.execution.SQLExecution -import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, StreamSinkProvider} -import org.apache.spark.sql.streaming.OutputMode -import org.apache.spark.sql.types.StructType - -class ConsoleSink(options: Map[String, String]) extends Sink with Logging { - // Number of rows to display, by default 20 rows - private val numRowsToShow = options.get("numRows").map(_.toInt).getOrElse(20) - - // Truncate the displayed data if it is too long, by default it is true - private val isTruncated = 
options.get("truncate").map(_.toBoolean).getOrElse(true) +import java.util.Optional - // Track the batch id - private var lastBatchId = -1L - - override def addBatch(batchId: Long, data: DataFrame): Unit = synchronized { - val batchIdStr = if (batchId <= lastBatchId) { - s"Rerun batch: $batchId" - } else { - lastBatchId = batchId - s"Batch: $batchId" - } - - // scalastyle:off println - println("-------------------------------------------") - println(batchIdStr) - println("-------------------------------------------") - // scalastyle:off println - data.sparkSession.createDataFrame( - data.sparkSession.sparkContext.parallelize(data.collect()), data.schema) - .show(numRowsToShow, isTruncated) - } +import scala.collection.JavaConverters._ - override def toString(): String = s"ConsoleSink[numRows=$numRowsToShow, truncate=$isTruncated]" -} +import org.apache.spark.sql._ +import org.apache.spark.sql.execution.streaming.sources.ConsoleWriter +import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister} +import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} +import org.apache.spark.sql.sources.v2.streaming.MicroBatchWriteSupport +import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer +import org.apache.spark.sql.streaming.OutputMode +import org.apache.spark.sql.types.StructType case class ConsoleRelation(override val sqlContext: SQLContext, data: DataFrame) extends BaseRelation { override def schema: StructType = data.schema } -class ConsoleSinkProvider extends StreamSinkProvider +class ConsoleSinkProvider extends DataSourceV2 + with MicroBatchWriteSupport with DataSourceRegister with CreatableRelationProvider { - def createSink( - sqlContext: SQLContext, - parameters: Map[String, String], - partitionColumns: Seq[String], - outputMode: OutputMode): Sink = { - new ConsoleSink(parameters) + + override def createMicroBatchWriter( + queryId: String, + epochId: Long, + schema: StructType, + mode: OutputMode, + options: DataSourceV2Options): Optional[DataSourceV2Writer] = { + Optional.of(new ConsoleWriter(epochId, schema, options)) } def createRelation( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index c0507224f9be8..462e7d9721d28 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -54,16 +54,13 @@ class ContinuousExecution( sparkSession, name, checkpointRoot, analyzedPlan, sink, trigger, triggerClock, outputMode, deleteCheckpointOnStop) { - @volatile protected var continuousSources: Seq[ContinuousReader] = _ + @volatile protected var continuousSources: Seq[ContinuousReader] = Seq() override protected def sources: Seq[BaseStreamingSource] = continuousSources // For use only in test harnesses. 
private[sql] var currentEpochCoordinatorId: String = _ - override lazy val logicalPlan: LogicalPlan = { - assert(queryExecutionThread eq Thread.currentThread, - "logicalPlan must be initialized in StreamExecutionThread " + - s"but the current thread was ${Thread.currentThread}") + override val logicalPlan: LogicalPlan = { val toExecutionRelationMap = MutableMap[StreamingRelationV2, ContinuousExecutionRelation]() analyzedPlan.transform { case r @ StreamingRelationV2( @@ -72,7 +69,7 @@ class ContinuousExecution( ContinuousExecutionRelation(source, extraReaderOptions, output)(sparkSession) }) case StreamingRelationV2(_, sourceName, _, _, _) => - throw new AnalysisException( + throw new UnsupportedOperationException( s"Data source $sourceName does not support continuous processing.") } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala new file mode 100644 index 0000000000000..361979984bbec --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming.sources + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.writer.{DataSourceV2Writer, DataWriterFactory, WriterCommitMessage} +import org.apache.spark.sql.types.StructType + +/** + * A [[DataSourceV2Writer]] that collects results to the driver and prints them in the console. + * Generated by [[org.apache.spark.sql.execution.streaming.ConsoleSinkProvider]]. + * + * This sink should not be used for production, as it requires sending all rows to the driver + * and does not support recovery. 
+ */ +class ConsoleWriter(batchId: Long, schema: StructType, options: DataSourceV2Options) + extends DataSourceV2Writer with Logging { + // Number of rows to display, by default 20 rows + private val numRowsToShow = options.getInt("numRows", 20) + + // Truncate the displayed data if it is too long, by default it is true + private val isTruncated = options.getBoolean("truncate", true) + + assert(SparkSession.getActiveSession.isDefined) + private val spark = SparkSession.getActiveSession.get + + override def createWriterFactory(): DataWriterFactory[Row] = PackedRowWriterFactory + + override def commit(messages: Array[WriterCommitMessage]): Unit = synchronized { + val batch = messages.collect { + case PackedRowCommitMessage(rows) => rows + }.flatten + + // scalastyle:off println + println("-------------------------------------------") + println(s"Batch: $batchId") + println("-------------------------------------------") + // scalastyle:off println + spark.createDataFrame( + spark.sparkContext.parallelize(batch), schema) + .show(numRowsToShow, isTruncated) + } + + override def abort(messages: Array[WriterCommitMessage]): Unit = {} + + override def toString(): String = s"ConsoleWriter[numRows=$numRowsToShow, truncate=$isTruncated]" +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala new file mode 100644 index 0000000000000..9282ba05bdb7b --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming.sources + +import scala.collection.mutable + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory, WriterCommitMessage} + +/** + * A simple [[DataWriterFactory]] whose tasks just pack rows into the commit message for delivery + * to a [[org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer]] on the driver. + * + * Note that, because it sends all rows to the driver, this factory will generally be unsuitable + * for production-quality sinks. It's intended for use in tests. + */ +case object PackedRowWriterFactory extends DataWriterFactory[Row] { + def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[Row] = { + new PackedRowDataWriter() + } +} + +/** + * Commit message for a [[PackedRowDataWriter]], containing all the rows written in the most + * recent interval. 
+ */ +case class PackedRowCommitMessage(rows: Array[Row]) extends WriterCommitMessage + +/** + * A simple [[DataWriter]] that just sends all the rows it's received as a commit message. + */ +class PackedRowDataWriter() extends DataWriter[Row] with Logging { + private val data = mutable.Buffer[Row]() + + override def write(row: Row): Unit = data.append(row) + + override def commit(): PackedRowCommitMessage = { + val msg = PackedRowCommitMessage(data.toArray) + data.clear() + msg + } + + override def abort(): Unit = data.clear() +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index b5b4a05ab4973..d24f0ddeab4de 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger import org.apache.spark.sql.execution.streaming.sources.{MemoryPlanV2, MemorySinkV2} -import org.apache.spark.sql.sources.v2.streaming.ContinuousWriteSupport +import org.apache.spark.sql.sources.v2.streaming.{ContinuousWriteSupport, MicroBatchWriteSupport} /** * Interface used to write a streaming `Dataset` to external storage systems (e.g. file systems, @@ -280,14 +280,12 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { useTempCheckpointLocation = true, trigger = trigger) } else { - val sink = trigger match { - case _: ContinuousTrigger => - val ds = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf) - ds.newInstance() match { - case w: ContinuousWriteSupport => w - case _ => throw new AnalysisException( - s"Data source $source does not support continuous writing") - } + val ds = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf) + val sink = (ds.newInstance(), trigger) match { + case (w: ContinuousWriteSupport, _: ContinuousTrigger) => w + case (_, _: ContinuousTrigger) => throw new UnsupportedOperationException( + s"Data source $source does not support continuous writing") + case (w: MicroBatchWriteSupport, _) => w case _ => val ds = DataSource( df.sparkSession, diff --git a/sql/core/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/sql/core/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister index c6973bf41d34b..a0b25b4e82364 100644 --- a/sql/core/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister +++ b/sql/core/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ -5,3 +5,11 @@ org.apache.spark.sql.sources.FakeSourceFour org.apache.fakesource.FakeExternalSourceOne org.apache.fakesource.FakeExternalSourceTwo org.apache.fakesource.FakeExternalSourceThree +org.apache.spark.sql.streaming.sources.FakeReadMicroBatchOnly +org.apache.spark.sql.streaming.sources.FakeReadContinuousOnly +org.apache.spark.sql.streaming.sources.FakeReadBothModes +org.apache.spark.sql.streaming.sources.FakeReadNeitherMode +org.apache.spark.sql.streaming.sources.FakeWriteMicroBatchOnly +org.apache.spark.sql.streaming.sources.FakeWriteContinuousOnly +org.apache.spark.sql.streaming.sources.FakeWriteBothModes +org.apache.spark.sql.streaming.sources.FakeWriteNeitherMode diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriterSuite.scala new file mode 100644 index 0000000000000..60ffee9b9b42c --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriterSuite.scala @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming.sources + +import java.io.ByteArrayOutputStream + +import org.apache.spark.sql.execution.streaming.MemoryStream +import org.apache.spark.sql.streaming.StreamTest + +class ConsoleWriterSuite extends StreamTest { + import testImplicits._ + + test("console") { + val input = MemoryStream[Int] + + val captured = new ByteArrayOutputStream() + Console.withOut(captured) { + val query = input.toDF().writeStream.format("console").start() + try { + input.addData(1, 2, 3) + query.processAllAvailable() + input.addData(4, 5, 6) + query.processAllAvailable() + input.addData() + query.processAllAvailable() + } finally { + query.stop() + } + } + + assert(captured.toString() == + """------------------------------------------- + |Batch: 0 + |------------------------------------------- + |+-----+ + ||value| + |+-----+ + || 1| + || 2| + || 3| + |+-----+ + | + |------------------------------------------- + |Batch: 1 + |------------------------------------------- + |+-----+ + ||value| + |+-----+ + || 4| + || 5| + || 6| + |+-----+ + | + |------------------------------------------- + |Batch: 2 + |------------------------------------------- + |+-----+ + ||value| + |+-----+ + |+-----+ + | + |""".stripMargin) + } + + test("console with numRows") { + val input = MemoryStream[Int] + + val captured = new ByteArrayOutputStream() + Console.withOut(captured) { + val query = input.toDF().writeStream.format("console").option("NUMROWS", 2).start() + try { + input.addData(1, 2, 3) + query.processAllAvailable() + } finally { + query.stop() + } + } + + assert(captured.toString() == + """------------------------------------------- + |Batch: 0 + |------------------------------------------- + |+-----+ + ||value| + |+-----+ + || 1| + || 2| + |+-----+ + |only showing top 2 rows + | + |""".stripMargin) + } + + test("console with truncation") { + val input = MemoryStream[String] + + val captured = new ByteArrayOutputStream() + Console.withOut(captured) { + val query = input.toDF().writeStream.format("console").option("TRUNCATE", true).start() + try { + input.addData("123456789012345678901234567890") + query.processAllAvailable() + } finally { + query.stop() + } + } + + assert(captured.toString() == + """------------------------------------------- + |Batch: 0 + |------------------------------------------- 
+ |+--------------------+ + || value| + |+--------------------+ + ||12345678901234567...| + |+--------------------+ + | + |""".stripMargin) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala new file mode 100644 index 0000000000000..f152174b0a7f0 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala @@ -0,0 +1,249 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.streaming.sources + +import java.util.Optional + +import org.apache.spark.sql.{AnalysisException, Row} +import org.apache.spark.sql.execution.datasources.DataSource +import org.apache.spark.sql.execution.streaming.{LongOffset, RateStreamOffset} +import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger +import org.apache.spark.sql.sources.DataSourceRegister +import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} +import org.apache.spark.sql.sources.v2.reader.ReadTask +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, ContinuousWriteSupport, MicroBatchReadSupport, MicroBatchWriteSupport} +import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, MicroBatchReader, Offset, PartitionOffset} +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter +import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer +import org.apache.spark.sql.streaming.{OutputMode, StreamingQueryException, StreamTest, Trigger} +import org.apache.spark.sql.types.StructType +import org.apache.spark.util.Utils + +case class FakeReader() extends MicroBatchReader with ContinuousReader { + def setOffsetRange(start: Optional[Offset], end: Optional[Offset]): Unit = {} + def getStartOffset: Offset = RateStreamOffset(Map()) + def getEndOffset: Offset = RateStreamOffset(Map()) + def deserializeOffset(json: String): Offset = RateStreamOffset(Map()) + def commit(end: Offset): Unit = {} + def readSchema(): StructType = StructType(Seq()) + def stop(): Unit = {} + def mergeOffsets(offsets: Array[PartitionOffset]): Offset = RateStreamOffset(Map()) + def setOffset(start: Optional[Offset]): Unit = {} + + def createReadTasks(): java.util.ArrayList[ReadTask[Row]] = { + throw new IllegalStateException("fake source - cannot actually read") + } +} + +trait FakeMicroBatchReadSupport extends MicroBatchReadSupport { + override def createMicroBatchReader( + schema: Optional[StructType], + checkpointLocation: String, + options: DataSourceV2Options): MicroBatchReader = FakeReader() +} + +trait FakeContinuousReadSupport extends ContinuousReadSupport { + override def 
createContinuousReader( + schema: Optional[StructType], + checkpointLocation: String, + options: DataSourceV2Options): ContinuousReader = FakeReader() +} + +trait FakeMicroBatchWriteSupport extends MicroBatchWriteSupport { + def createMicroBatchWriter( + queryId: String, + epochId: Long, + schema: StructType, + mode: OutputMode, + options: DataSourceV2Options): Optional[DataSourceV2Writer] = { + throw new IllegalStateException("fake sink - cannot actually write") + } +} + +trait FakeContinuousWriteSupport extends ContinuousWriteSupport { + def createContinuousWriter( + queryId: String, + schema: StructType, + mode: OutputMode, + options: DataSourceV2Options): Optional[ContinuousWriter] = { + throw new IllegalStateException("fake sink - cannot actually write") + } +} + +class FakeReadMicroBatchOnly extends DataSourceRegister with FakeMicroBatchReadSupport { + override def shortName(): String = "fake-read-microbatch-only" +} + +class FakeReadContinuousOnly extends DataSourceRegister with FakeContinuousReadSupport { + override def shortName(): String = "fake-read-continuous-only" +} + +class FakeReadBothModes extends DataSourceRegister + with FakeMicroBatchReadSupport with FakeContinuousReadSupport { + override def shortName(): String = "fake-read-microbatch-continuous" +} + +class FakeReadNeitherMode extends DataSourceRegister { + override def shortName(): String = "fake-read-neither-mode" +} + +class FakeWriteMicroBatchOnly extends DataSourceRegister with FakeMicroBatchWriteSupport { + override def shortName(): String = "fake-write-microbatch-only" +} + +class FakeWriteContinuousOnly extends DataSourceRegister with FakeContinuousWriteSupport { + override def shortName(): String = "fake-write-continuous-only" +} + +class FakeWriteBothModes extends DataSourceRegister + with FakeMicroBatchWriteSupport with FakeContinuousWriteSupport { + override def shortName(): String = "fake-write-microbatch-continuous" +} + +class FakeWriteNeitherMode extends DataSourceRegister { + override def shortName(): String = "fake-write-neither-mode" +} + +class StreamingDataSourceV2Suite extends StreamTest { + + override def beforeAll(): Unit = { + super.beforeAll() + val fakeCheckpoint = Utils.createTempDir() + spark.conf.set("spark.sql.streaming.checkpointLocation", fakeCheckpoint.getCanonicalPath) + } + + val readFormats = Seq( + "fake-read-microbatch-only", + "fake-read-continuous-only", + "fake-read-microbatch-continuous", + "fake-read-neither-mode") + val writeFormats = Seq( + "fake-write-microbatch-only", + "fake-write-continuous-only", + "fake-write-microbatch-continuous", + "fake-write-neither-mode") + val triggers = Seq( + Trigger.Once(), + Trigger.ProcessingTime(1000), + Trigger.Continuous(1000)) + + private def testPositiveCase(readFormat: String, writeFormat: String, trigger: Trigger) = { + val query = spark.readStream + .format(readFormat) + .load() + .writeStream + .format(writeFormat) + .trigger(trigger) + .start() + query.stop() + } + + private def testNegativeCase( + readFormat: String, + writeFormat: String, + trigger: Trigger, + errorMsg: String) = { + val ex = intercept[UnsupportedOperationException] { + testPositiveCase(readFormat, writeFormat, trigger) + } + assert(ex.getMessage.contains(errorMsg)) + } + + private def testPostCreationNegativeCase( + readFormat: String, + writeFormat: String, + trigger: Trigger, + errorMsg: String) = { + val query = spark.readStream + .format(readFormat) + .load() + .writeStream + .format(writeFormat) + .trigger(trigger) + .start() + + 
eventually(timeout(streamingTimeout)) { + assert(query.exception.isDefined) + assert(query.exception.get.cause != null) + assert(query.exception.get.cause.getMessage.contains(errorMsg)) + } + } + + // Get a list of (read, write, trigger) tuples for test cases. + val cases = readFormats.flatMap { read => + writeFormats.flatMap { write => + triggers.map(t => (write, t)) + }.map { + case (write, t) => (read, write, t) + } + } + + for ((read, write, trigger) <- cases) { + testQuietly(s"stream with read format $read, write format $write, trigger $trigger") { + val readSource = DataSource.lookupDataSource(read, spark.sqlContext.conf).newInstance() + val writeSource = DataSource.lookupDataSource(write, spark.sqlContext.conf).newInstance() + (readSource, writeSource, trigger) match { + // Valid microbatch queries. + case (_: MicroBatchReadSupport, _: MicroBatchWriteSupport, t) + if !t.isInstanceOf[ContinuousTrigger] => + testPositiveCase(read, write, trigger) + + // Valid continuous queries. + case (_: ContinuousReadSupport, _: ContinuousWriteSupport, _: ContinuousTrigger) => + testPositiveCase(read, write, trigger) + + // Invalid - can't read at all + case (r, _, _) + if !r.isInstanceOf[MicroBatchReadSupport] + && !r.isInstanceOf[ContinuousReadSupport] => + testNegativeCase(read, write, trigger, + s"Data source $read does not support streamed reading") + + // Invalid - trigger is continuous but writer is not + case (_, w, _: ContinuousTrigger) if !w.isInstanceOf[ContinuousWriteSupport] => + testNegativeCase(read, write, trigger, + s"Data source $write does not support continuous writing") + + // Invalid - can't write at all + case (_, w, _) + if !w.isInstanceOf[MicroBatchWriteSupport] + && !w.isInstanceOf[ContinuousWriteSupport] => + testNegativeCase(read, write, trigger, + s"Data source $write does not support streamed writing") + + // Invalid - trigger and writer are continuous but reader is not + case (r, _: ContinuousWriteSupport, _: ContinuousTrigger) + if !r.isInstanceOf[ContinuousReadSupport] => + testNegativeCase(read, write, trigger, + s"Data source $read does not support continuous processing") + + // Invalid - trigger is microbatch but writer is not + case (_, w, t) + if !w.isInstanceOf[MicroBatchWriteSupport] && !t.isInstanceOf[ContinuousTrigger] => + testNegativeCase(read, write, trigger, + s"Data source $write does not support streamed writing") + + // Invalid - trigger and writer are microbatch but reader is not + case (r, _, t) + if !r.isInstanceOf[MicroBatchReadSupport] && !t.isInstanceOf[ContinuousTrigger] => + testPostCreationNegativeCase(read, write, trigger, + s"Data source $read does not support microbatch processing") + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala index aa163d2211c38..8212fb912ec57 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala @@ -422,21 +422,6 @@ class DataStreamReaderWriterSuite extends StreamTest with BeforeAndAfter { } } - test("ConsoleSink can be correctly loaded") { - LastOptions.clear() - val df = spark.readStream - .format("org.apache.spark.sql.streaming.test") - .load() - - val sq = df.writeStream - .format("console") - .option("checkpointLocation", newMetadataDir) - .trigger(ProcessingTime(2.seconds)) - .start() - - 
sq.awaitTermination(2000L) - } - test("prevent all column partitioning") { withTempDir { dir => val path = dir.getCanonicalPath @@ -450,16 +435,6 @@ class DataStreamReaderWriterSuite extends StreamTest with BeforeAndAfter { } } - test("ConsoleSink should not require checkpointLocation") { - LastOptions.clear() - val df = spark.readStream - .format("org.apache.spark.sql.streaming.test") - .load() - - val sq = df.writeStream.format("console").start() - sq.stop() - } - private def testMemorySinkCheckpointRecovery(chkLoc: String, provideInWriter: Boolean): Unit = { import testImplicits._ val ms = new MemoryStream[Int](0, sqlContext) From f801ac417ba13a975887ba83904ee771bc3a003e Mon Sep 17 00:00:00 2001 From: jerryshao Date: Thu, 18 Jan 2018 19:18:55 +0800 Subject: [PATCH 123/566] [SPARK-23140][SQL] Add DataSourceV2Strategy to Hive Session state's planner ## What changes were proposed in this pull request? `DataSourceV2Strategy` is missing in `HiveSessionStateBuilder`'s planner, which will throw exception as described in [SPARK-23140](https://issues.apache.org/jira/browse/SPARK-23140). ## How was this patch tested? Manual test. Author: jerryshao Closes #20305 from jerryshao/SPARK-23140. (cherry picked from commit 7a2248341396840628eef398aa512cac3e3bd55f) Signed-off-by: Wenchen Fan --- .../sql/hive/HiveSessionStateBuilder.scala | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala index dc92ad3b0c1ac..12c74368dd184 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala @@ -96,22 +96,7 @@ class HiveSessionStateBuilder(session: SparkSession, parentState: Option[Session override val sparkSession: SparkSession = session override def extraPlanningStrategies: Seq[Strategy] = - super.extraPlanningStrategies ++ customPlanningStrategies - - override def strategies: Seq[Strategy] = { - experimentalMethods.extraStrategies ++ - extraPlanningStrategies ++ Seq( - FileSourceStrategy, - DataSourceStrategy(conf), - SpecialLimits, - InMemoryScans, - HiveTableScans, - Scripts, - Aggregation, - JoinSelection, - BasicOperators - ) - } + super.extraPlanningStrategies ++ customPlanningStrategies ++ Seq(HiveTableScans, Scripts) } } From 8a98274823a4671cee85081dd19f40146e736325 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Thu, 18 Jan 2018 21:24:39 +0800 Subject: [PATCH 124/566] [SPARK-22036][SQL] Decimal multiplication with high precision/scale often returns NULL ## What changes were proposed in this pull request? When there is an operation between Decimals and the result is a number which is not representable exactly with the result's precision and scale, Spark is returning `NULL`. This was done to reflect Hive's behavior, but it is against SQL ANSI 2011, which states that "If the result cannot be represented exactly in the result type, then whether it is rounded or truncated is implementation-defined". Moreover, Hive now changed its behavior in order to respect the standard, thanks to HIVE-15331. 
Therefore, the PR propose to: - update the rules to determine the result precision and scale according to the new Hive's ones introduces in HIVE-15331; - round the result of the operations, when it is not representable exactly with the result's precision and scale, instead of returning `NULL` - introduce a new config `spark.sql.decimalOperations.allowPrecisionLoss` which default to `true` (ie. the new behavior) in order to allow users to switch back to the previous one. Hive behavior reflects SQLServer's one. The only difference is that the precision and scale are adjusted for all the arithmetic operations in Hive, while SQL Server is said to do so only for multiplications and divisions in the documentation. This PR follows Hive's behavior. A more detailed explanation is available here: https://mail-archives.apache.org/mod_mbox/spark-dev/201712.mbox/%3CCAEorWNAJ4TxJR9NBcgSFMD_VxTg8qVxusjP%2BAJP-x%2BJV9zH-yA%40mail.gmail.com%3E. ## How was this patch tested? modified and added UTs. Comparisons with results of Hive and SQLServer. Author: Marco Gaido Closes #20023 from mgaido91/SPARK-22036. (cherry picked from commit e28eb431146bcdcaf02a6f6c406ca30920592a6a) Signed-off-by: Wenchen Fan --- docs/sql-programming-guide.md | 5 + .../catalyst/analysis/DecimalPrecision.scala | 114 +++++--- .../sql/catalyst/expressions/literals.scala | 2 +- .../apache/spark/sql/internal/SQLConf.scala | 12 + .../apache/spark/sql/types/DecimalType.scala | 45 +++- .../sql/catalyst/analysis/AnalysisSuite.scala | 4 +- .../analysis/DecimalPrecisionSuite.scala | 20 +- .../native/decimalArithmeticOperations.sql | 47 ++++ .../decimalArithmeticOperations.sql.out | 245 ++++++++++++++++-- .../native/decimalPrecision.sql.out | 4 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 18 -- 11 files changed, 434 insertions(+), 82 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 258c769ff593b..3e2e48a0ef249 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1793,6 +1793,11 @@ options. - Since Spark 2.3, when all inputs are binary, `functions.concat()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.concatBinaryAsString` to `true`. - Since Spark 2.3, when all inputs are binary, SQL `elt()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.eltOutputAsString` to `true`. + - Since Spark 2.3, by default arithmetic operations between decimals return a rounded value if an exact representation is not possible (instead of returning NULL). This is compliant to SQL ANSI 2011 specification and Hive's new behavior introduced in Hive 2.2 (HIVE-15331). This involves the following changes + - The rules to determine the result type of an arithmetic operation have been updated. In particular, if the precision / scale needed are out of the range of available values, the scale is reduced up to 6, in order to prevent the truncation of the integer part of the decimals. All the arithmetic operations are affected by the change, ie. addition (`+`), subtraction (`-`), multiplication (`*`), division (`/`), remainder (`%`) and positive module (`pmod`). + - Literal values used in SQL operations are converted to DECIMAL with the exact precision and scale needed by them. 
+ - The configuration `spark.sql.decimalOperations.allowPrecisionLoss` has been introduced. It defaults to `true`, which means the new behavior described here; if set to `false`, Spark uses previous rules, ie. it doesn't adjust the needed scale to represent the values and it returns NULL if an exact representation of the value is not possible. + ## Upgrading From Spark SQL 2.1 to 2.2 - Spark 2.1.1 introduced a new configuration key: `spark.sql.hive.caseSensitiveInferenceMode`. It had a default setting of `NEVER_INFER`, which kept behavior identical to 2.1.0. However, Spark 2.2.0 changes this setting's default value to `INFER_AND_SAVE` to restore compatibility with reading Hive metastore tables whose underlying file schema have mixed-case column names. With the `INFER_AND_SAVE` configuration value, on first access Spark will perform schema inference on any Hive metastore table for which it has not already saved an inferred schema. Note that schema inference can be a very time consuming operation for tables with thousands of partitions. If compatibility with mixed-case column names is not a concern, you can safely set `spark.sql.hive.caseSensitiveInferenceMode` to `NEVER_INFER` to avoid the initial overhead of schema inference. Note that with the new default `INFER_AND_SAVE` setting, the results of the schema inference are saved as a metastore key for future use. Therefore, the initial schema inference occurs only at a table's first access. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala index a8100b9b24aac..ab63131b07573 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.Literal._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -42,8 +43,10 @@ import org.apache.spark.sql.types._ * e1 / e2 p1 - s1 + s2 + max(6, s1 + p2 + 1) max(6, s1 + p2 + 1) * e1 % e2 min(p1-s1, p2-s2) + max(s1, s2) max(s1, s2) * e1 union e2 max(s1, s2) + max(p1-s1, p2-s2) max(s1, s2) - * sum(e1) p1 + 10 s1 - * avg(e1) p1 + 4 s1 + 4 + * + * When `spark.sql.decimalOperations.allowPrecisionLoss` is set to true, if the precision / scale + * needed are out of the range of available values, the scale is reduced up to 6, in order to + * prevent the truncation of the integer part of the decimals. 
* * To implement the rules for fixed-precision types, we introduce casts to turn them to unlimited * precision, do the math on unlimited-precision numbers, then introduce casts back to the @@ -56,6 +59,7 @@ import org.apache.spark.sql.types._ * - INT gets turned into DECIMAL(10, 0) * - LONG gets turned into DECIMAL(20, 0) * - FLOAT and DOUBLE cause fixed-length decimals to turn into DOUBLE + * - Literals INT and LONG get turned into DECIMAL with the precision strictly needed by the value */ // scalastyle:on object DecimalPrecision extends TypeCoercionRule { @@ -93,41 +97,76 @@ object DecimalPrecision extends TypeCoercionRule { case e: BinaryArithmetic if e.left.isInstanceOf[PromotePrecision] => e case Add(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val dt = DecimalType.bounded(max(s1, s2) + max(p1 - s1, p2 - s2) + 1, max(s1, s2)) - CheckOverflow(Add(promotePrecision(e1, dt), promotePrecision(e2, dt)), dt) + val resultScale = max(s1, s2) + val resultType = if (SQLConf.get.decimalOperationsAllowPrecisionLoss) { + DecimalType.adjustPrecisionScale(max(p1 - s1, p2 - s2) + resultScale + 1, + resultScale) + } else { + DecimalType.bounded(max(p1 - s1, p2 - s2) + resultScale + 1, resultScale) + } + CheckOverflow(Add(promotePrecision(e1, resultType), promotePrecision(e2, resultType)), + resultType) case Subtract(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val dt = DecimalType.bounded(max(s1, s2) + max(p1 - s1, p2 - s2) + 1, max(s1, s2)) - CheckOverflow(Subtract(promotePrecision(e1, dt), promotePrecision(e2, dt)), dt) + val resultScale = max(s1, s2) + val resultType = if (SQLConf.get.decimalOperationsAllowPrecisionLoss) { + DecimalType.adjustPrecisionScale(max(p1 - s1, p2 - s2) + resultScale + 1, + resultScale) + } else { + DecimalType.bounded(max(p1 - s1, p2 - s2) + resultScale + 1, resultScale) + } + CheckOverflow(Subtract(promotePrecision(e1, resultType), promotePrecision(e2, resultType)), + resultType) case Multiply(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val resultType = DecimalType.bounded(p1 + p2 + 1, s1 + s2) + val resultType = if (SQLConf.get.decimalOperationsAllowPrecisionLoss) { + DecimalType.adjustPrecisionScale(p1 + p2 + 1, s1 + s2) + } else { + DecimalType.bounded(p1 + p2 + 1, s1 + s2) + } val widerType = widerDecimalType(p1, s1, p2, s2) CheckOverflow(Multiply(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), resultType) case Divide(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - var intDig = min(DecimalType.MAX_SCALE, p1 - s1 + s2) - var decDig = min(DecimalType.MAX_SCALE, max(6, s1 + p2 + 1)) - val diff = (intDig + decDig) - DecimalType.MAX_SCALE - if (diff > 0) { - decDig -= diff / 2 + 1 - intDig = DecimalType.MAX_SCALE - decDig + val resultType = if (SQLConf.get.decimalOperationsAllowPrecisionLoss) { + // Precision: p1 - s1 + s2 + max(6, s1 + p2 + 1) + // Scale: max(6, s1 + p2 + 1) + val intDig = p1 - s1 + s2 + val scale = max(DecimalType.MINIMUM_ADJUSTED_SCALE, s1 + p2 + 1) + val prec = intDig + scale + DecimalType.adjustPrecisionScale(prec, scale) + } else { + var intDig = min(DecimalType.MAX_SCALE, p1 - s1 + s2) + var decDig = min(DecimalType.MAX_SCALE, max(6, s1 + p2 + 1)) + val diff = (intDig + decDig) - DecimalType.MAX_SCALE + if (diff > 0) { + decDig -= diff / 2 + 1 + intDig = DecimalType.MAX_SCALE - decDig + } + DecimalType.bounded(intDig + decDig, decDig) } - val resultType = DecimalType.bounded(intDig + decDig, decDig) 
val widerType = widerDecimalType(p1, s1, p2, s2) CheckOverflow(Divide(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), resultType) case Remainder(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val resultType = DecimalType.bounded(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2)) + val resultType = if (SQLConf.get.decimalOperationsAllowPrecisionLoss) { + DecimalType.adjustPrecisionScale(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2)) + } else { + DecimalType.bounded(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2)) + } // resultType may have lower precision, so we cast them into wider type first. val widerType = widerDecimalType(p1, s1, p2, s2) CheckOverflow(Remainder(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), resultType) case Pmod(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val resultType = DecimalType.bounded(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2)) + val resultType = if (SQLConf.get.decimalOperationsAllowPrecisionLoss) { + DecimalType.adjustPrecisionScale(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2)) + } else { + DecimalType.bounded(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2)) + } // resultType may have lower precision, so we cast them into wider type first. val widerType = widerDecimalType(p1, s1, p2, s2) CheckOverflow(Pmod(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), @@ -137,9 +176,6 @@ object DecimalPrecision extends TypeCoercionRule { e2 @ DecimalType.Expression(p2, s2)) if p1 != p2 || s1 != s2 => val resultType = widerDecimalType(p1, s1, p2, s2) b.makeCopy(Array(Cast(e1, resultType), Cast(e2, resultType))) - - // TODO: MaxOf, MinOf, etc might want other rules - // SUM and AVERAGE are handled by the implementations of those expressions } /** @@ -243,17 +279,35 @@ object DecimalPrecision extends TypeCoercionRule { // Promote integers inside a binary expression with fixed-precision decimals to decimals, // and fixed-precision decimals in an expression with floats / doubles to doubles case b @ BinaryOperator(left, right) if left.dataType != right.dataType => - (left.dataType, right.dataType) match { - case (t: IntegralType, DecimalType.Fixed(p, s)) => - b.makeCopy(Array(Cast(left, DecimalType.forType(t)), right)) - case (DecimalType.Fixed(p, s), t: IntegralType) => - b.makeCopy(Array(left, Cast(right, DecimalType.forType(t)))) - case (t, DecimalType.Fixed(p, s)) if isFloat(t) => - b.makeCopy(Array(left, Cast(right, DoubleType))) - case (DecimalType.Fixed(p, s), t) if isFloat(t) => - b.makeCopy(Array(Cast(left, DoubleType), right)) - case _ => - b + (left, right) match { + // Promote literal integers inside a binary expression with fixed-precision decimals to + // decimals. The precision and scale are the ones strictly needed by the integer value. + // Requiring more precision than necessary may lead to a useless loss of precision. + // Consider the following example: multiplying a column which is DECIMAL(38, 18) by 2. + // If we use the default precision and scale for the integer type, 2 is considered a + // DECIMAL(10, 0). According to the rules, the result would be DECIMAL(38 + 10 + 1, 18), + // which is out of range and therefore it will becomes DECIMAL(38, 7), leading to + // potentially loosing 11 digits of the fractional part. Using only the precision needed + // by the Literal, instead, the result would be DECIMAL(38 + 1 + 1, 18), which would + // become DECIMAL(38, 16), safely having a much lower precision loss. 
+ case (l: Literal, r) if r.dataType.isInstanceOf[DecimalType] + && l.dataType.isInstanceOf[IntegralType] => + b.makeCopy(Array(Cast(l, DecimalType.fromLiteral(l)), r)) + case (l, r: Literal) if l.dataType.isInstanceOf[DecimalType] + && r.dataType.isInstanceOf[IntegralType] => + b.makeCopy(Array(l, Cast(r, DecimalType.fromLiteral(r)))) + // Promote integers inside a binary expression with fixed-precision decimals to decimals, + // and fixed-precision decimals in an expression with floats / doubles to doubles + case (l @ IntegralType(), r @ DecimalType.Expression(_, _)) => + b.makeCopy(Array(Cast(l, DecimalType.forType(l.dataType)), r)) + case (l @ DecimalType.Expression(_, _), r @ IntegralType()) => + b.makeCopy(Array(l, Cast(r, DecimalType.forType(r.dataType)))) + case (l, r @ DecimalType.Expression(_, _)) if isFloat(l.dataType) => + b.makeCopy(Array(l, Cast(r, DoubleType))) + case (l @ DecimalType.Expression(_, _), r) if isFloat(r.dataType) => + b.makeCopy(Array(Cast(l, DoubleType), r)) + case _ => b } } + } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index 383203a209833..cd176d941819f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -58,7 +58,7 @@ object Literal { case s: Short => Literal(s, ShortType) case s: String => Literal(UTF8String.fromString(s), StringType) case b: Boolean => Literal(b, BooleanType) - case d: BigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale)) + case d: BigDecimal => Literal(Decimal(d), DecimalType.fromBigDecimal(d)) case d: JavaBigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale())) case d: Decimal => Literal(d, DecimalType(Math.max(d.precision, d.scale), d.scale)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 16fbb0c3e9e21..cc4f4bf332459 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1064,6 +1064,16 @@ object SQLConf { .booleanConf .createWithDefault(true) + val DECIMAL_OPERATIONS_ALLOW_PREC_LOSS = + buildConf("spark.sql.decimalOperations.allowPrecisionLoss") + .internal() + .doc("When true (default), establishing the result type of an arithmetic operation " + + "happens according to Hive behavior and SQL ANSI 2011 specification, ie. rounding the " + + "decimal part of the result if an exact representation is not possible. 
Otherwise, NULL " + + "is returned in those cases, as previously.") + .booleanConf + .createWithDefault(true) + val SQL_STRING_REDACTION_PATTERN = ConfigBuilder("spark.sql.redaction.string.regex") .doc("Regex to decide which parts of strings produced by Spark contain sensitive " + @@ -1441,6 +1451,8 @@ class SQLConf extends Serializable with Logging { def replaceExceptWithFilter: Boolean = getConf(REPLACE_EXCEPT_WITH_FILTER) + def decimalOperationsAllowPrecisionLoss: Boolean = getConf(DECIMAL_OPERATIONS_ALLOW_PREC_LOSS) + def continuousStreamingExecutorQueueSize: Int = getConf(CONTINUOUS_STREAMING_EXECUTOR_QUEUE_SIZE) def continuousStreamingExecutorPollIntervalMs: Long = diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala index 6e050c18b8acb..ef3b67c0d48d0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala @@ -23,7 +23,7 @@ import scala.reflect.runtime.universe.typeTag import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.expressions.{Expression, Literal} /** @@ -117,6 +117,7 @@ object DecimalType extends AbstractDataType { val MAX_SCALE = 38 val SYSTEM_DEFAULT: DecimalType = DecimalType(MAX_PRECISION, 18) val USER_DEFAULT: DecimalType = DecimalType(10, 0) + val MINIMUM_ADJUSTED_SCALE = 6 // The decimal types compatible with other numeric types private[sql] val ByteDecimal = DecimalType(3, 0) @@ -136,10 +137,52 @@ object DecimalType extends AbstractDataType { case DoubleType => DoubleDecimal } + private[sql] def fromLiteral(literal: Literal): DecimalType = literal.value match { + case v: Short => fromBigDecimal(BigDecimal(v)) + case v: Int => fromBigDecimal(BigDecimal(v)) + case v: Long => fromBigDecimal(BigDecimal(v)) + case _ => forType(literal.dataType) + } + + private[sql] def fromBigDecimal(d: BigDecimal): DecimalType = { + DecimalType(Math.max(d.precision, d.scale), d.scale) + } + private[sql] def bounded(precision: Int, scale: Int): DecimalType = { DecimalType(min(precision, MAX_PRECISION), min(scale, MAX_SCALE)) } + /** + * Scale adjustment implementation is based on Hive's one, which is itself inspired to + * SQLServer's one. In particular, when a result precision is greater than + * {@link #MAX_PRECISION}, the corresponding scale is reduced to prevent the integral part of a + * result from being truncated. + * + * This method is used only when `spark.sql.decimalOperations.allowPrecisionLoss` is set to true. + */ + private[sql] def adjustPrecisionScale(precision: Int, scale: Int): DecimalType = { + // Assumptions: + assert(precision >= scale) + assert(scale >= 0) + + if (precision <= MAX_PRECISION) { + // Adjustment only needed when we exceed max precision + DecimalType(precision, scale) + } else { + // Precision/scale exceed maximum precision. Result must be adjusted to MAX_PRECISION. 
+ val intDigits = precision - scale + // If original scale is less than MINIMUM_ADJUSTED_SCALE, use original scale value; otherwise + // preserve at least MINIMUM_ADJUSTED_SCALE fractional digits + val minScaleValue = Math.min(scale, MINIMUM_ADJUSTED_SCALE) + // The resulting scale is the maximum between what is available without causing a loss of + // digits for the integer part of the decimal and the minimum guaranteed scale, which is + // computed above + val adjustedScale = Math.max(MAX_PRECISION - intDigits, minScaleValue) + + DecimalType(MAX_PRECISION, adjustedScale) + } + } + override private[sql] def defaultConcreteType: DataType = SYSTEM_DEFAULT override private[sql] def acceptsType(other: DataType): Boolean = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index f4514205d3ae0..cd8579584eada 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -408,8 +408,8 @@ class AnalysisSuite extends AnalysisTest with Matchers { assertExpressionType(sum(Divide(1.0, 2.0)), DoubleType) assertExpressionType(sum(Divide(1, 2.0f)), DoubleType) assertExpressionType(sum(Divide(1.0f, 2)), DoubleType) - assertExpressionType(sum(Divide(1, Decimal(2))), DecimalType(31, 11)) - assertExpressionType(sum(Divide(Decimal(1), 2)), DecimalType(31, 11)) + assertExpressionType(sum(Divide(1, Decimal(2))), DecimalType(22, 11)) + assertExpressionType(sum(Divide(Decimal(1), 2)), DecimalType(26, 6)) assertExpressionType(sum(Divide(Decimal(1), 2.0)), DoubleType) assertExpressionType(sum(Divide(1.0, Decimal(2.0))), DoubleType) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala index 60e46a9910a8b..c86dc18dfa680 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala @@ -136,19 +136,19 @@ class DecimalPrecisionSuite extends AnalysisTest with BeforeAndAfter { test("maximum decimals") { for (expr <- Seq(d1, d2, i, u)) { - checkType(Add(expr, u), DecimalType.SYSTEM_DEFAULT) - checkType(Subtract(expr, u), DecimalType.SYSTEM_DEFAULT) + checkType(Add(expr, u), DecimalType(38, 17)) + checkType(Subtract(expr, u), DecimalType(38, 17)) } - checkType(Multiply(d1, u), DecimalType(38, 19)) - checkType(Multiply(d2, u), DecimalType(38, 20)) - checkType(Multiply(i, u), DecimalType(38, 18)) - checkType(Multiply(u, u), DecimalType(38, 36)) + checkType(Multiply(d1, u), DecimalType(38, 16)) + checkType(Multiply(d2, u), DecimalType(38, 14)) + checkType(Multiply(i, u), DecimalType(38, 7)) + checkType(Multiply(u, u), DecimalType(38, 6)) - checkType(Divide(u, d1), DecimalType(38, 18)) - checkType(Divide(u, d2), DecimalType(38, 19)) - checkType(Divide(u, i), DecimalType(38, 23)) - checkType(Divide(u, u), DecimalType(38, 18)) + checkType(Divide(u, d1), DecimalType(38, 17)) + checkType(Divide(u, d2), DecimalType(38, 16)) + checkType(Divide(u, i), DecimalType(38, 18)) + checkType(Divide(u, u), DecimalType(38, 6)) checkType(Remainder(d1, u), DecimalType(19, 18)) checkType(Remainder(d2, u), DecimalType(21, 18)) diff --git 
a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql index c8e108ac2c45e..c6d8a49d4b93a 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql @@ -22,6 +22,51 @@ select a / b from t; select a % b from t; select pmod(a, b) from t; +-- tests for decimals handling in operations +create table decimals_test(id int, a decimal(38,18), b decimal(38,18)) using parquet; + +insert into decimals_test values(1, 100.0, 999.0), (2, 12345.123, 12345.123), + (3, 0.1234567891011, 1234.1), (4, 123456789123456789.0, 1.123456789123456789); + +-- test decimal operations +select id, a+b, a-b, a*b, a/b from decimals_test order by id; + +-- test operations between decimals and constants +select id, a*10, b/10 from decimals_test order by id; + +-- test operations on constants +select 10.3 * 3.0; +select 10.3000 * 3.0; +select 10.30000 * 30.0; +select 10.300000000000000000 * 3.000000000000000000; +select 10.300000000000000000 * 3.0000000000000000000; + +-- arithmetic operations causing an overflow return NULL +select (5e36 + 0.1) + 5e36; +select (-4e36 - 0.1) - 7e36; +select 12345678901234567890.0 * 12345678901234567890.0; +select 1e35 / 0.1; + +-- arithmetic operations causing a precision loss are truncated +select 123456789123456789.1234567890 * 1.123456789123456789; +select 0.001 / 9876543210987654321098765432109876543.2 + +-- return NULL instead of rounding, according to old Spark versions' behavior +set spark.sql.decimalOperations.allowPrecisionLoss=false; + +-- test decimal operations +select id, a+b, a-b, a*b, a/b from decimals_test order by id; + +-- test operations between decimals and constants +select id, a*10, b/10 from decimals_test order by id; + +-- test operations on constants +select 10.3 * 3.0; +select 10.3000 * 3.0; +select 10.30000 * 30.0; +select 10.300000000000000000 * 3.000000000000000000; +select 10.300000000000000000 * 3.0000000000000000000; + -- arithmetic operations causing an overflow return NULL select (5e36 + 0.1) + 5e36; select (-4e36 - 0.1) - 7e36; @@ -31,3 +76,5 @@ select 1e35 / 0.1; -- arithmetic operations causing a precision loss return NULL select 123456789123456789.1234567890 * 1.123456789123456789; select 0.001 / 9876543210987654321098765432109876543.2 + +drop table decimals_test; diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out index ce02f6adc456c..4d70fe19d539f 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 10 +-- Number of queries: 32 -- !query 0 @@ -35,48 +35,257 @@ NULL -- !query 4 -select (5e36 + 0.1) + 5e36 +create table decimals_test(id int, a decimal(38,18), b decimal(38,18)) using parquet -- !query 4 schema -struct<(CAST((CAST(5E+36 AS DECIMAL(38,1)) + CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) + CAST(5E+36 AS DECIMAL(38,1))):decimal(38,1)> +struct<> -- !query 4 output -NULL + -- !query 5 -select (-4e36 - 0.1) - 7e36 +insert into decimals_test 
values(1, 100.0, 999.0), (2, 12345.123, 12345.123), + (3, 0.1234567891011, 1234.1), (4, 123456789123456789.0, 1.123456789123456789) -- !query 5 schema -struct<(CAST((CAST(-4E+36 AS DECIMAL(38,1)) - CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) - CAST(7E+36 AS DECIMAL(38,1))):decimal(38,1)> +struct<> -- !query 5 output -NULL + -- !query 6 -select 12345678901234567890.0 * 12345678901234567890.0 +select id, a+b, a-b, a*b, a/b from decimals_test order by id -- !query 6 schema -struct<(12345678901234567890.0 * 12345678901234567890.0):decimal(38,2)> +struct -- !query 6 output -NULL +1 1099 -899 99900 0.1001 +2 24690.246 0 152402061.885129 1 +3 1234.2234567891011 -1233.9765432108989 152.358023 0.0001 +4 123456789123456790.12345678912345679 123456789123456787.87654321087654321 138698367904130467.515623 109890109097814272.043109 -- !query 7 -select 1e35 / 0.1 +select id, a*10, b/10 from decimals_test order by id -- !query 7 schema -struct<(CAST(1E+35 AS DECIMAL(37,1)) / CAST(0.1 AS DECIMAL(37,1))):decimal(38,3)> +struct -- !query 7 output -NULL +1 1000 99.9 +2 123451.23 1234.5123 +3 1.234567891011 123.41 +4 1234567891234567890 0.112345678912345679 -- !query 8 -select 123456789123456789.1234567890 * 1.123456789123456789 +select 10.3 * 3.0 -- !query 8 schema -struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,28)> +struct<(CAST(10.3 AS DECIMAL(3,1)) * CAST(3.0 AS DECIMAL(3,1))):decimal(6,2)> -- !query 8 output -NULL +30.9 -- !query 9 -select 0.001 / 9876543210987654321098765432109876543.2 +select 10.3000 * 3.0 -- !query 9 schema -struct<(CAST(0.001 AS DECIMAL(38,3)) / CAST(9876543210987654321098765432109876543.2 AS DECIMAL(38,3))):decimal(38,37)> +struct<(CAST(10.3000 AS DECIMAL(6,4)) * CAST(3.0 AS DECIMAL(6,4))):decimal(9,5)> -- !query 9 output +30.9 + + +-- !query 10 +select 10.30000 * 30.0 +-- !query 10 schema +struct<(CAST(10.30000 AS DECIMAL(7,5)) * CAST(30.0 AS DECIMAL(7,5))):decimal(11,6)> +-- !query 10 output +309 + + +-- !query 11 +select 10.300000000000000000 * 3.000000000000000000 +-- !query 11 schema +struct<(CAST(10.300000000000000000 AS DECIMAL(20,18)) * CAST(3.000000000000000000 AS DECIMAL(20,18))):decimal(38,34)> +-- !query 11 output +30.9 + + +-- !query 12 +select 10.300000000000000000 * 3.0000000000000000000 +-- !query 12 schema +struct<(CAST(10.300000000000000000 AS DECIMAL(21,19)) * CAST(3.0000000000000000000 AS DECIMAL(21,19))):decimal(38,34)> +-- !query 12 output +30.9 + + +-- !query 13 +select (5e36 + 0.1) + 5e36 +-- !query 13 schema +struct<(CAST((CAST(5E+36 AS DECIMAL(38,1)) + CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) + CAST(5E+36 AS DECIMAL(38,1))):decimal(38,1)> +-- !query 13 output +NULL + + +-- !query 14 +select (-4e36 - 0.1) - 7e36 +-- !query 14 schema +struct<(CAST((CAST(-4E+36 AS DECIMAL(38,1)) - CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) - CAST(7E+36 AS DECIMAL(38,1))):decimal(38,1)> +-- !query 14 output +NULL + + +-- !query 15 +select 12345678901234567890.0 * 12345678901234567890.0 +-- !query 15 schema +struct<(12345678901234567890.0 * 12345678901234567890.0):decimal(38,2)> +-- !query 15 output NULL + + +-- !query 16 +select 1e35 / 0.1 +-- !query 16 schema +struct<(CAST(1E+35 AS DECIMAL(37,1)) / CAST(0.1 AS DECIMAL(37,1))):decimal(38,6)> +-- !query 16 output +NULL + + +-- !query 17 +select 123456789123456789.1234567890 * 1.123456789123456789 +-- !query 17 schema +struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,18)> +-- 
!query 17 output +138698367904130467.654320988515622621 + + +-- !query 18 +select 0.001 / 9876543210987654321098765432109876543.2 + +set spark.sql.decimalOperations.allowPrecisionLoss=false +-- !query 18 schema +struct<> +-- !query 18 output +org.apache.spark.sql.catalyst.parser.ParseException + +mismatched input 'spark' expecting (line 3, pos 4) + +== SQL == +select 0.001 / 9876543210987654321098765432109876543.2 + +set spark.sql.decimalOperations.allowPrecisionLoss=false +----^^^ + + +-- !query 19 +select id, a+b, a-b, a*b, a/b from decimals_test order by id +-- !query 19 schema +struct +-- !query 19 output +1 1099 -899 99900 0.1001 +2 24690.246 0 152402061.885129 1 +3 1234.2234567891011 -1233.9765432108989 152.358023 0.0001 +4 123456789123456790.12345678912345679 123456789123456787.87654321087654321 138698367904130467.515623 109890109097814272.043109 + + +-- !query 20 +select id, a*10, b/10 from decimals_test order by id +-- !query 20 schema +struct +-- !query 20 output +1 1000 99.9 +2 123451.23 1234.5123 +3 1.234567891011 123.41 +4 1234567891234567890 0.112345678912345679 + + +-- !query 21 +select 10.3 * 3.0 +-- !query 21 schema +struct<(CAST(10.3 AS DECIMAL(3,1)) * CAST(3.0 AS DECIMAL(3,1))):decimal(6,2)> +-- !query 21 output +30.9 + + +-- !query 22 +select 10.3000 * 3.0 +-- !query 22 schema +struct<(CAST(10.3000 AS DECIMAL(6,4)) * CAST(3.0 AS DECIMAL(6,4))):decimal(9,5)> +-- !query 22 output +30.9 + + +-- !query 23 +select 10.30000 * 30.0 +-- !query 23 schema +struct<(CAST(10.30000 AS DECIMAL(7,5)) * CAST(30.0 AS DECIMAL(7,5))):decimal(11,6)> +-- !query 23 output +309 + + +-- !query 24 +select 10.300000000000000000 * 3.000000000000000000 +-- !query 24 schema +struct<(CAST(10.300000000000000000 AS DECIMAL(20,18)) * CAST(3.000000000000000000 AS DECIMAL(20,18))):decimal(38,34)> +-- !query 24 output +30.9 + + +-- !query 25 +select 10.300000000000000000 * 3.0000000000000000000 +-- !query 25 schema +struct<(CAST(10.300000000000000000 AS DECIMAL(21,19)) * CAST(3.0000000000000000000 AS DECIMAL(21,19))):decimal(38,34)> +-- !query 25 output +30.9 + + +-- !query 26 +select (5e36 + 0.1) + 5e36 +-- !query 26 schema +struct<(CAST((CAST(5E+36 AS DECIMAL(38,1)) + CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) + CAST(5E+36 AS DECIMAL(38,1))):decimal(38,1)> +-- !query 26 output +NULL + + +-- !query 27 +select (-4e36 - 0.1) - 7e36 +-- !query 27 schema +struct<(CAST((CAST(-4E+36 AS DECIMAL(38,1)) - CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) - CAST(7E+36 AS DECIMAL(38,1))):decimal(38,1)> +-- !query 27 output +NULL + + +-- !query 28 +select 12345678901234567890.0 * 12345678901234567890.0 +-- !query 28 schema +struct<(12345678901234567890.0 * 12345678901234567890.0):decimal(38,2)> +-- !query 28 output +NULL + + +-- !query 29 +select 1e35 / 0.1 +-- !query 29 schema +struct<(CAST(1E+35 AS DECIMAL(37,1)) / CAST(0.1 AS DECIMAL(37,1))):decimal(38,6)> +-- !query 29 output +NULL + + +-- !query 30 +select 123456789123456789.1234567890 * 1.123456789123456789 +-- !query 30 schema +struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,18)> +-- !query 30 output +138698367904130467.654320988515622621 + + +-- !query 31 +select 0.001 / 9876543210987654321098765432109876543.2 + +drop table decimals_test +-- !query 31 schema +struct<> +-- !query 31 output +org.apache.spark.sql.catalyst.parser.ParseException + +mismatched input 'table' expecting (line 3, pos 5) + +== SQL == +select 0.001 / 9876543210987654321098765432109876543.2 + +drop table decimals_test 
+-----^^^ diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalPrecision.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalPrecision.sql.out index ebc8201ed5a1d..6ee7f59d69877 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalPrecision.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalPrecision.sql.out @@ -2329,7 +2329,7 @@ struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(C -- !query 280 SELECT cast(1 as bigint) / cast(1 as decimal(20, 0)) FROM t -- !query 280 schema -struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) / CAST(1 AS DECIMAL(20,0))):decimal(38,19)> +struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) / CAST(1 AS DECIMAL(20,0))):decimal(38,18)> -- !query 280 output 1 @@ -2661,7 +2661,7 @@ struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) / CAST(CAST(CAST(1 AS BI -- !query 320 SELECT cast(1 as decimal(20, 0)) / cast(1 as bigint) FROM t -- !query 320 schema -struct<(CAST(1 AS DECIMAL(20,0)) / CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):decimal(38,19)> +struct<(CAST(1 AS DECIMAL(20,0)) / CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):decimal(38,18)> -- !query 320 output 1 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index d4d0aa4f5f5eb..083a0c0b1b9a0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1517,24 +1517,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } - test("decimal precision with multiply/division") { - checkAnswer(sql("select 10.3 * 3.0"), Row(BigDecimal("30.90"))) - checkAnswer(sql("select 10.3000 * 3.0"), Row(BigDecimal("30.90000"))) - checkAnswer(sql("select 10.30000 * 30.0"), Row(BigDecimal("309.000000"))) - checkAnswer(sql("select 10.300000000000000000 * 3.000000000000000000"), - Row(BigDecimal("30.900000000000000000000000000000000000", new MathContext(38)))) - checkAnswer(sql("select 10.300000000000000000 * 3.0000000000000000000"), - Row(null)) - - checkAnswer(sql("select 10.3 / 3.0"), Row(BigDecimal("3.433333"))) - checkAnswer(sql("select 10.3000 / 3.0"), Row(BigDecimal("3.4333333"))) - checkAnswer(sql("select 10.30000 / 30.0"), Row(BigDecimal("0.343333333"))) - checkAnswer(sql("select 10.300000000000000000 / 3.00000000000000000"), - Row(BigDecimal("3.433333333333333333333333333", new MathContext(38)))) - checkAnswer(sql("select 10.3000000000000000000 / 3.00000000000000000"), - Row(BigDecimal("3.4333333333333333333333333333", new MathContext(38)))) - } - test("SPARK-10215 Div of Decimal returns null") { val d = Decimal(1.12321).toBigDecimal val df = Seq((d, 1)).toDF("a", "b") From e0421c65093f66b365539358dd9be38d2006fa47 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Thu, 18 Jan 2018 22:33:04 +0900 Subject: [PATCH 125/566] [SPARK-23141][SQL][PYSPARK] Support data type string as a returnType for registerJavaFunction. ## What changes were proposed in this pull request? Currently `UDFRegistration.registerJavaFunction` doesn't support data type string as a `returnType` whereas `UDFRegistration.register`, `udf`, or `pandas_udf` does. We can support it for `UDFRegistration.registerJavaFunction` as well. ## How was this patch tested? Added a doctest and existing tests. Author: Takuya UESHIN Closes #20307 from ueshin/issues/SPARK-23141. 
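For illustration, a minimal PySpark sketch of the usage this change enables, mirroring the doctests added in the diff below. It assumes the test helper class `test.org.apache.spark.sql.JavaStringLength` (the same class used in the doctests) is available on the classpath; any Java UDF class registered this way behaves the same.

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()

# Previously the return type had to be a DataType instance:
spark.udf.registerJavaFunction(
    "javaStringLength", "test.org.apache.spark.sql.JavaStringLength", IntegerType())

# With this change, a DDL-formatted type string is accepted and parsed on the fly:
spark.udf.registerJavaFunction(
    "javaStringLength2", "test.org.apache.spark.sql.JavaStringLength", "integer")

spark.sql("SELECT javaStringLength2('test')").show()
```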
(cherry picked from commit 5063b7481173ad72bd0dc941b5cf3c9b26a591e4) Signed-off-by: hyukjinkwon --- python/pyspark/sql/functions.py | 6 ++++-- python/pyspark/sql/udf.py | 14 ++++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 988c1d25259bc..961b3267b44cf 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2108,7 +2108,8 @@ def udf(f=None, returnType=StringType()): can fail on special rows, the workaround is to incorporate the condition into the functions. :param f: python function if used as a standalone function - :param returnType: a :class:`pyspark.sql.types.DataType` object + :param returnType: the return type of the user-defined function. The value can be either a + :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. >>> from pyspark.sql.types import IntegerType >>> slen = udf(lambda s: len(s), IntegerType()) @@ -2148,7 +2149,8 @@ def pandas_udf(f=None, returnType=None, functionType=None): Creates a vectorized user defined function (UDF). :param f: user-defined function. A python function if used as a standalone function - :param returnType: a :class:`pyspark.sql.types.DataType` object + :param returnType: the return type of the user-defined function. The value can be either a + :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. :param functionType: an enum value in :class:`pyspark.sql.functions.PandasUDFType`. Default: SCALAR. diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 1943bb73f9ac2..c77f19f89a442 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -206,7 +206,8 @@ def register(self, name, f, returnType=None): :param f: a Python function, or a user-defined function. The user-defined function can be either row-at-a-time or vectorized. See :meth:`pyspark.sql.functions.udf` and :meth:`pyspark.sql.functions.pandas_udf`. - :param returnType: the return type of the registered user-defined function. + :param returnType: the return type of the registered user-defined function. The value can + be either a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. :return: a user-defined function. `returnType` can be optionally specified when `f` is a Python function but not @@ -303,21 +304,30 @@ def registerJavaFunction(self, name, javaClassName, returnType=None): :param name: name of the user-defined function :param javaClassName: fully qualified name of java class - :param returnType: a :class:`pyspark.sql.types.DataType` object + :param returnType: the return type of the registered Java function. The value can be either + a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. >>> from pyspark.sql.types import IntegerType >>> spark.udf.registerJavaFunction( ... "javaStringLength", "test.org.apache.spark.sql.JavaStringLength", IntegerType()) >>> spark.sql("SELECT javaStringLength('test')").collect() [Row(UDF:javaStringLength(test)=4)] + >>> spark.udf.registerJavaFunction( ... "javaStringLength2", "test.org.apache.spark.sql.JavaStringLength") >>> spark.sql("SELECT javaStringLength2('test')").collect() [Row(UDF:javaStringLength2(test)=4)] + + >>> spark.udf.registerJavaFunction( + ... 
"javaStringLength3", "test.org.apache.spark.sql.JavaStringLength", "integer") + >>> spark.sql("SELECT javaStringLength3('test')").collect() + [Row(UDF:javaStringLength3(test)=4)] """ jdt = None if returnType is not None: + if not isinstance(returnType, DataType): + returnType = _parse_datatype_string(returnType) jdt = self.sparkSession._jsparkSession.parseDataType(returnType.json()) self.sparkSession._jsparkSession.udf().registerJava(name, javaClassName, jdt) From bd0a1627b9396c69dbe3554e6ca6c700eeb08f74 Mon Sep 17 00:00:00 2001 From: jerryshao Date: Thu, 18 Jan 2018 10:19:36 -0800 Subject: [PATCH 126/566] [SPARK-23147][UI] Fix task page table IndexOutOfBound Exception ## What changes were proposed in this pull request? Stage's task page table will throw an exception when there's no complete tasks. Furthermore, because the `dataSize` doesn't take running tasks into account, so sometimes UI cannot show the running tasks. Besides table will only be displayed when first task is finished according to the default sortColumn("index"). ![screen shot 2018-01-18 at 8 50 08 pm](https://user-images.githubusercontent.com/850797/35100052-470b4cae-fc95-11e7-96a2-ad9636e732b3.png) To reproduce this issue, user could try `sc.parallelize(1 to 20, 20).map { i => Thread.sleep(10000); i }.collect()` or `sc.parallelize(1 to 20, 20).map { i => Thread.sleep((20 - i) * 1000); i }.collect` to reproduce the above issue. Here propose a solution to fix it. Not sure if it is a right fix, please help to review. ## How was this patch tested? Manual test. Author: jerryshao Closes #20315 from jerryshao/SPARK-23147. (cherry picked from commit cf7ee1767ddadce08dce050fc3b40c77cdd187da) Signed-off-by: Marcelo Vanzin --- core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index 7c6e06cf183ba..af78373ddb4b2 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -676,7 +676,7 @@ private[ui] class TaskDataSource( private var _tasksToShow: Seq[TaskData] = null - override def dataSize: Int = stage.numCompleteTasks + stage.numFailedTasks + stage.numKilledTasks + override def dataSize: Int = stage.numTasks override def sliceData(from: Int, to: Int): Seq[TaskData] = { if (_tasksToShow == null) { From bfdbdd37951a872676a22b0524cbde12a1df418d Mon Sep 17 00:00:00 2001 From: Fernando Pereira Date: Thu, 18 Jan 2018 13:02:03 -0600 Subject: [PATCH 127/566] [SPARK-23029][DOCS] Specifying default units of configuration entries ## What changes were proposed in this pull request? This PR completes the docs, specifying the default units assumed in configuration entries of type size. This is crucial since unit-less values are accepted and the user might assume the base unit is bytes, which in most cases it is not, leading to hard-to-debug problems. ## How was this patch tested? This patch updates only documentation only. Author: Fernando Pereira Closes #20269 from ferdonline/docs_units. 
(cherry picked from commit 9678941f54ebc5db935ed8d694e502086e2a31c0) Signed-off-by: Sean Owen --- .../scala/org/apache/spark/SparkConf.scala | 6 +- .../spark/internal/config/package.scala | 47 ++++---- docs/configuration.md | 100 ++++++++++-------- 3 files changed, 85 insertions(+), 68 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index d77303e6fdf8b..f53b2bed74c6e 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -640,9 +640,9 @@ private[spark] object SparkConf extends Logging { translation = s => s"${s.toLong * 10}s")), "spark.reducer.maxSizeInFlight" -> Seq( AlternateConfig("spark.reducer.maxMbInFlight", "1.4")), - "spark.kryoserializer.buffer" -> - Seq(AlternateConfig("spark.kryoserializer.buffer.mb", "1.4", - translation = s => s"${(s.toDouble * 1000).toInt}k")), + "spark.kryoserializer.buffer" -> Seq( + AlternateConfig("spark.kryoserializer.buffer.mb", "1.4", + translation = s => s"${(s.toDouble * 1000).toInt}k")), "spark.kryoserializer.buffer.max" -> Seq( AlternateConfig("spark.kryoserializer.buffer.max.mb", "1.4")), "spark.shuffle.file.buffer" -> Seq( diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index eb12ddf961314..bbfcfbaa7363c 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -38,10 +38,13 @@ package object config { ConfigBuilder("spark.driver.userClassPathFirst").booleanConf.createWithDefault(false) private[spark] val DRIVER_MEMORY = ConfigBuilder("spark.driver.memory") + .doc("Amount of memory to use for the driver process, in MiB unless otherwise specified.") .bytesConf(ByteUnit.MiB) .createWithDefaultString("1g") private[spark] val DRIVER_MEMORY_OVERHEAD = ConfigBuilder("spark.driver.memoryOverhead") + .doc("The amount of off-heap memory to be allocated per driver in cluster mode, " + + "in MiB unless otherwise specified.") .bytesConf(ByteUnit.MiB) .createOptional @@ -62,6 +65,7 @@ package object config { .createWithDefault(false) private[spark] val EVENT_LOG_OUTPUT_BUFFER_SIZE = ConfigBuilder("spark.eventLog.buffer.kb") + .doc("Buffer size to use when writing to output streams, in KiB unless otherwise specified.") .bytesConf(ByteUnit.KiB) .createWithDefaultString("100k") @@ -81,10 +85,13 @@ package object config { ConfigBuilder("spark.executor.userClassPathFirst").booleanConf.createWithDefault(false) private[spark] val EXECUTOR_MEMORY = ConfigBuilder("spark.executor.memory") + .doc("Amount of memory to use per executor process, in MiB unless otherwise specified.") .bytesConf(ByteUnit.MiB) .createWithDefaultString("1g") private[spark] val EXECUTOR_MEMORY_OVERHEAD = ConfigBuilder("spark.executor.memoryOverhead") + .doc("The amount of off-heap memory to be allocated per executor in cluster mode, " + + "in MiB unless otherwise specified.") .bytesConf(ByteUnit.MiB) .createOptional @@ -353,7 +360,7 @@ package object config { private[spark] val BUFFER_WRITE_CHUNK_SIZE = ConfigBuilder("spark.buffer.write.chunkSize") .internal() - .doc("The chunk size during writing out the bytes of ChunkedByteBuffer.") + .doc("The chunk size in bytes during writing out the bytes of ChunkedByteBuffer.") .bytesConf(ByteUnit.BYTE) .checkValue(_ <= Int.MaxValue, "The chunk size during writing out the bytes of" + " ChunkedByteBuffer 
should not larger than Int.MaxValue.") @@ -368,9 +375,9 @@ package object config { private[spark] val SHUFFLE_ACCURATE_BLOCK_THRESHOLD = ConfigBuilder("spark.shuffle.accurateBlockThreshold") - .doc("When we compress the size of shuffle blocks in HighlyCompressedMapStatus, we will " + - "record the size accurately if it's above this config. This helps to prevent OOM by " + - "avoiding underestimating shuffle block size when fetch shuffle blocks.") + .doc("Threshold in bytes above which the size of shuffle blocks in " + + "HighlyCompressedMapStatus is accurately recorded. This helps to prevent OOM " + + "by avoiding underestimating shuffle block size when fetch shuffle blocks.") .bytesConf(ByteUnit.BYTE) .createWithDefault(100 * 1024 * 1024) @@ -389,23 +396,23 @@ package object config { private[spark] val REDUCER_MAX_BLOCKS_IN_FLIGHT_PER_ADDRESS = ConfigBuilder("spark.reducer.maxBlocksInFlightPerAddress") - .doc("This configuration limits the number of remote blocks being fetched per reduce task" + - " from a given host port. When a large number of blocks are being requested from a given" + - " address in a single fetch or simultaneously, this could crash the serving executor or" + - " Node Manager. This is especially useful to reduce the load on the Node Manager when" + - " external shuffle is enabled. You can mitigate the issue by setting it to a lower value.") + .doc("This configuration limits the number of remote blocks being fetched per reduce task " + + "from a given host port. When a large number of blocks are being requested from a given " + + "address in a single fetch or simultaneously, this could crash the serving executor or " + + "Node Manager. This is especially useful to reduce the load on the Node Manager when " + + "external shuffle is enabled. You can mitigate the issue by setting it to a lower value.") .intConf .checkValue(_ > 0, "The max no. of blocks in flight cannot be non-positive.") .createWithDefault(Int.MaxValue) private[spark] val MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM = ConfigBuilder("spark.maxRemoteBlockSizeFetchToMem") - .doc("Remote block will be fetched to disk when size of the block is " + - "above this threshold. This is to avoid a giant request takes too much memory. We can " + - "enable this config by setting a specific value(e.g. 200m). Note this configuration will " + - "affect both shuffle fetch and block manager remote block fetch. For users who " + - "enabled external shuffle service, this feature can only be worked when external shuffle" + - " service is newer than Spark 2.2.") + .doc("Remote block will be fetched to disk when size of the block is above this threshold " + + "in bytes. This is to avoid a giant request takes too much memory. We can enable this " + + "config by setting a specific value(e.g. 200m). Note this configuration will affect " + + "both shuffle fetch and block manager remote block fetch. For users who enabled " + + "external shuffle service, this feature can only be worked when external shuffle" + + "service is newer than Spark 2.2.") .bytesConf(ByteUnit.BYTE) .createWithDefault(Long.MaxValue) @@ -419,9 +426,9 @@ package object config { private[spark] val SHUFFLE_FILE_BUFFER_SIZE = ConfigBuilder("spark.shuffle.file.buffer") - .doc("Size of the in-memory buffer for each shuffle file output stream. " + - "These buffers reduce the number of disk seeks and system calls made " + - "in creating intermediate shuffle files.") + .doc("Size of the in-memory buffer for each shuffle file output stream, in KiB unless " + + "otherwise specified. 
These buffers reduce the number of disk seeks and system calls " + + "made in creating intermediate shuffle files.") .bytesConf(ByteUnit.KiB) .checkValue(v => v > 0 && v <= Int.MaxValue / 1024, s"The file buffer size must be greater than 0 and less than ${Int.MaxValue / 1024}.") @@ -430,7 +437,7 @@ package object config { private[spark] val SHUFFLE_UNSAFE_FILE_OUTPUT_BUFFER_SIZE = ConfigBuilder("spark.shuffle.unsafe.file.output.buffer") .doc("The file system for this buffer size after each partition " + - "is written in unsafe shuffle writer.") + "is written in unsafe shuffle writer. In KiB unless otherwise specified.") .bytesConf(ByteUnit.KiB) .checkValue(v => v > 0 && v <= Int.MaxValue / 1024, s"The buffer size must be greater than 0 and less than ${Int.MaxValue / 1024}.") @@ -438,7 +445,7 @@ package object config { private[spark] val SHUFFLE_DISK_WRITE_BUFFER_SIZE = ConfigBuilder("spark.shuffle.spill.diskWriteBufferSize") - .doc("The buffer size to use when writing the sorted records to an on-disk file.") + .doc("The buffer size, in bytes, to use when writing the sorted records to an on-disk file.") .bytesConf(ByteUnit.BYTE) .checkValue(v => v > 0 && v <= Int.MaxValue, s"The buffer size must be greater than 0 and less than ${Int.MaxValue}.") diff --git a/docs/configuration.md b/docs/configuration.md index 1189aea2aa71f..eecb39dcafc9e 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -58,6 +58,10 @@ The following format is accepted: 1t or 1tb (tebibytes = 1024 gibibytes) 1p or 1pb (pebibytes = 1024 tebibytes) +While numbers without units are generally interpreted as bytes, a few are interpreted as KiB or MiB. +See documentation of individual configuration properties. Specifying units is desirable where +possible. + ## Dynamically Loading Spark Properties In some cases, you may want to avoid hard-coding certain configurations in a `SparkConf`. For @@ -136,9 +140,9 @@ of the most common options to set are: spark.driver.maxResultSize 1g - Limit of total size of serialized results of all partitions for each Spark action (e.g. collect). - Should be at least 1M, or 0 for unlimited. Jobs will be aborted if the total size - is above this limit. + Limit of total size of serialized results of all partitions for each Spark action (e.g. + collect) in bytes. Should be at least 1M, or 0 for unlimited. Jobs will be aborted if the total + size is above this limit. Having a high limit may cause out-of-memory errors in driver (depends on spark.driver.memory and memory overhead of objects in JVM). Setting a proper limit can protect the driver from out-of-memory errors. @@ -148,10 +152,10 @@ of the most common options to set are: spark.driver.memory 1g - Amount of memory to use for the driver process, i.e. where SparkContext is initialized. - (e.g. 1g, 2g). - -
    Note: In client mode, this config must not be set through the SparkConf + Amount of memory to use for the driver process, i.e. where SparkContext is initialized, in MiB + unless otherwise specified (e.g. 1g, 2g). +
    + Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-memory command line option or in your default properties file. @@ -161,27 +165,28 @@ of the most common options to set are: spark.driver.memoryOverhead driverMemory * 0.10, with minimum of 384 - The amount of off-heap memory (in megabytes) to be allocated per driver in cluster mode. This is - memory that accounts for things like VM overheads, interned strings, other native overheads, etc. - This tends to grow with the container size (typically 6-10%). This option is currently supported - on YARN and Kubernetes. + The amount of off-heap memory to be allocated per driver in cluster mode, in MiB unless + otherwise specified. This is memory that accounts for things like VM overheads, interned strings, + other native overheads, etc. This tends to grow with the container size (typically 6-10%). + This option is currently supported on YARN and Kubernetes. spark.executor.memory 1g - Amount of memory to use per executor process (e.g. 2g, 8g). + Amount of memory to use per executor process, in MiB unless otherwise specified. + (e.g. 2g, 8g). spark.executor.memoryOverhead executorMemory * 0.10, with minimum of 384 - The amount of off-heap memory (in megabytes) to be allocated per executor. This is memory that - accounts for things like VM overheads, interned strings, other native overheads, etc. This tends - to grow with the executor size (typically 6-10%). This option is currently supported on YARN and - Kubernetes. + The amount of off-heap memory to be allocated per executor, in MiB unless otherwise specified. + This is memory that accounts for things like VM overheads, interned strings, other native + overheads, etc. This tends to grow with the executor size (typically 6-10%). + This option is currently supported on YARN and Kubernetes. @@ -431,8 +436,9 @@ Apart from these, the following properties are also available, and may be useful 512m Amount of memory to use per python worker process during aggregation, in the same - format as JVM memory strings (e.g. 512m, 2g). If the memory - used during aggregation goes above this amount, it will spill the data into disks. + format as JVM memory strings with a size unit suffix ("k", "m", "g" or "t") + (e.g. 512m, 2g). + If the memory used during aggregation goes above this amount, it will spill the data into disks. @@ -540,9 +546,10 @@ Apart from these, the following properties are also available, and may be useful spark.reducer.maxSizeInFlight 48m - Maximum size of map outputs to fetch simultaneously from each reduce task. Since - each output requires us to create a buffer to receive it, this represents a fixed memory - overhead per reduce task, so keep it small unless you have a large amount of memory. + Maximum size of map outputs to fetch simultaneously from each reduce task, in MiB unless + otherwise specified. Since each output requires us to create a buffer to receive it, this + represents a fixed memory overhead per reduce task, so keep it small unless you have a + large amount of memory. @@ -570,9 +577,9 @@ Apart from these, the following properties are also available, and may be useful spark.maxRemoteBlockSizeFetchToMem Long.MaxValue - The remote block will be fetched to disk when size of the block is above this threshold. + The remote block will be fetched to disk when size of the block is above this threshold in bytes. 
This is to avoid a giant request takes too much memory. We can enable this config by setting - a specific value(e.g. 200m). Note this configuration will affect both shuffle fetch + a specific value(e.g. 200m). Note this configuration will affect both shuffle fetch and block manager remote block fetch. For users who enabled external shuffle service, this feature can only be worked when external shuffle service is newer than Spark 2.2. @@ -589,8 +596,9 @@ Apart from these, the following properties are also available, and may be useful spark.shuffle.file.buffer 32k - Size of the in-memory buffer for each shuffle file output stream. These buffers - reduce the number of disk seeks and system calls made in creating intermediate shuffle files. + Size of the in-memory buffer for each shuffle file output stream, in KiB unless otherwise + specified. These buffers reduce the number of disk seeks and system calls made in creating + intermediate shuffle files. @@ -651,7 +659,7 @@ Apart from these, the following properties are also available, and may be useful spark.shuffle.service.index.cache.size 100m - Cache entries limited to the specified memory footprint. + Cache entries limited to the specified memory footprint in bytes. @@ -685,9 +693,9 @@ Apart from these, the following properties are also available, and may be useful spark.shuffle.accurateBlockThreshold 100 * 1024 * 1024 - When we compress the size of shuffle blocks in HighlyCompressedMapStatus, we will record the - size accurately if it's above this config. This helps to prevent OOM by avoiding - underestimating shuffle block size when fetch shuffle blocks. + Threshold in bytes above which the size of shuffle blocks in HighlyCompressedMapStatus is + accurately recorded. This helps to prevent OOM by avoiding underestimating shuffle + block size when fetch shuffle blocks. @@ -779,7 +787,7 @@ Apart from these, the following properties are also available, and may be useful spark.eventLog.buffer.kb 100k - Buffer size in KB to use when writing to output streams. + Buffer size to use when writing to output streams, in KiB unless otherwise specified. @@ -917,7 +925,7 @@ Apart from these, the following properties are also available, and may be useful spark.io.compression.lz4.blockSize 32k - Block size used in LZ4 compression, in the case when LZ4 compression codec + Block size in bytes used in LZ4 compression, in the case when LZ4 compression codec is used. Lowering this block size will also lower shuffle memory usage when LZ4 is used. @@ -925,7 +933,7 @@ Apart from these, the following properties are also available, and may be useful spark.io.compression.snappy.blockSize 32k - Block size used in Snappy compression, in the case when Snappy compression codec + Block size in bytes used in Snappy compression, in the case when Snappy compression codec is used. Lowering this block size will also lower shuffle memory usage when Snappy is used. @@ -941,7 +949,7 @@ Apart from these, the following properties are also available, and may be useful spark.io.compression.zstd.bufferSize 32k - Buffer size used in Zstd compression, in the case when Zstd compression codec + Buffer size in bytes used in Zstd compression, in the case when Zstd compression codec is used. Lowering this size will lower the shuffle memory usage when Zstd is used, but it might increase the compression cost because of excessive JNI call overhead. 
@@ -1001,8 +1009,8 @@ Apart from these, the following properties are also available, and may be useful spark.kryoserializer.buffer.max 64m - Maximum allowable size of Kryo serialization buffer. This must be larger than any - object you attempt to serialize and must be less than 2048m. + Maximum allowable size of Kryo serialization buffer, in MiB unless otherwise specified. + This must be larger than any object you attempt to serialize and must be less than 2048m. Increase this if you get a "buffer limit exceeded" exception inside Kryo. @@ -1010,9 +1018,9 @@ Apart from these, the following properties are also available, and may be useful spark.kryoserializer.buffer 64k - Initial size of Kryo's serialization buffer. Note that there will be one buffer - per core on each worker. This buffer will grow up to - spark.kryoserializer.buffer.max if needed. + Initial size of Kryo's serialization buffer, in KiB unless otherwise specified. + Note that there will be one buffer per core on each worker. This buffer will grow up to + spark.kryoserializer.buffer.max if needed. @@ -1086,7 +1094,8 @@ Apart from these, the following properties are also available, and may be useful spark.memory.offHeap.enabled false - If true, Spark will attempt to use off-heap memory for certain operations. If off-heap memory use is enabled, then spark.memory.offHeap.size must be positive. + If true, Spark will attempt to use off-heap memory for certain operations. If off-heap memory + use is enabled, then spark.memory.offHeap.size must be positive. @@ -1094,7 +1103,8 @@ Apart from these, the following properties are also available, and may be useful 0 The absolute amount of memory in bytes which can be used for off-heap allocation. - This setting has no impact on heap memory usage, so if your executors' total memory consumption must fit within some hard limit then be sure to shrink your JVM heap size accordingly. + This setting has no impact on heap memory usage, so if your executors' total memory consumption + must fit within some hard limit then be sure to shrink your JVM heap size accordingly. This must be set to a positive value when spark.memory.offHeap.enabled=true. @@ -1202,9 +1212,9 @@ Apart from these, the following properties are also available, and may be useful spark.broadcast.blockSize 4m - Size of each piece of a block for TorrentBroadcastFactory. - Too large a value decreases parallelism during broadcast (makes it slower); however, if it is - too small, BlockManager might take a performance hit. + Size of each piece of a block for TorrentBroadcastFactory, in KiB unless otherwise + specified. Too large a value decreases parallelism during broadcast (makes it slower); however, + if it is too small, BlockManager might take a performance hit. @@ -1312,7 +1322,7 @@ Apart from these, the following properties are also available, and may be useful spark.storage.memoryMapThreshold 2m - Size of a block above which Spark memory maps when reading a block from disk. + Size in bytes of a block above which Spark memory maps when reading a block from disk. This prevents Spark from memory mapping very small blocks. In general, memory mapping has high overhead for blocks close to or below the page size of the operating system. 
@@ -2490,4 +2500,4 @@ Also, you can modify or add configurations at runtime: --conf "spark.executor.extraJavaOptions=-XX:+PrintGCDetails -XX:+PrintGCTimeStamps" \ --conf spark.hadoop.abc.def=xyz \ myApp.jar -{% endhighlight %} \ No newline at end of file +{% endhighlight %} From e6e8bbe84625861f3a4834a2d71cb2f0fe7f6b5a Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Thu, 18 Jan 2018 12:25:52 -0800 Subject: [PATCH 128/566] [SPARK-23143][SS][PYTHON] Added python API for setting continuous trigger ## What changes were proposed in this pull request? Self-explanatory. ## How was this patch tested? New python tests. Author: Tathagata Das Closes #20309 from tdas/SPARK-23143. (cherry picked from commit 2d41f040a34d6483919fd5d491cf90eee5429290) Signed-off-by: Tathagata Das --- python/pyspark/sql/streaming.py | 23 +++++++++++++++++++---- python/pyspark/sql/tests.py | 6 ++++++ 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 24ae3776a217b..e2a97acb5e2a7 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -786,7 +786,7 @@ def queryName(self, queryName): @keyword_only @since(2.0) - def trigger(self, processingTime=None, once=None): + def trigger(self, processingTime=None, once=None, continuous=None): """Set the trigger for the stream query. If this is not set it will run the query as fast as possible, which is equivalent to setting the trigger to ``processingTime='0 seconds'``. @@ -802,23 +802,38 @@ def trigger(self, processingTime=None, once=None): >>> writer = sdf.writeStream.trigger(processingTime='5 seconds') >>> # trigger the query for just once batch of data >>> writer = sdf.writeStream.trigger(once=True) + >>> # trigger the query for execution every 5 seconds + >>> writer = sdf.writeStream.trigger(continuous='5 seconds') """ + params = [processingTime, once, continuous] + + if params.count(None) == 3: + raise ValueError('No trigger provided') + elif params.count(None) < 2: + raise ValueError('Multiple triggers not allowed.') + jTrigger = None if processingTime is not None: - if once is not None: - raise ValueError('Multiple triggers not allowed.') if type(processingTime) != str or len(processingTime.strip()) == 0: raise ValueError('Value for processingTime must be a non empty string. Got: %s' % processingTime) interval = processingTime.strip() jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.ProcessingTime( interval) + elif once is not None: if once is not True: raise ValueError('Value for once must be True. Got: %s' % once) jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.Once() + else: - raise ValueError('No trigger provided') + if type(continuous) != str or len(continuous.strip()) == 0: + raise ValueError('Value for continuous must be a non empty string. 
Got: %s' % + continuous) + interval = continuous.strip() + jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.Continuous( + interval) + self._jwrite = self._jwrite.trigger(jTrigger) return self diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index f84aa3d68b808..25483594f2725 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -1538,6 +1538,12 @@ def test_stream_trigger(self): except ValueError: pass + # Should not take multiple args + try: + df.writeStream.trigger(processingTime='5 seconds', continuous='1 second') + except ValueError: + pass + # Should take only keyword args try: df.writeStream.trigger('5 seconds') From 1f88fcd41c6c5521d732b25e83d6c9d150d7f24a Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Thu, 18 Jan 2018 12:33:39 -0800 Subject: [PATCH 129/566] [SPARK-23144][SS] Added console sink for continuous processing ## What changes were proposed in this pull request? Refactored ConsoleWriter into ConsoleMicrobatchWriter and ConsoleContinuousWriter. ## How was this patch tested? new unit test Author: Tathagata Das Closes #20311 from tdas/SPARK-23144. (cherry picked from commit bf34d665b9c865e00fac7001500bf6d521c2dff9) Signed-off-by: Tathagata Das --- .../sql/execution/streaming/console.scala | 20 +++-- .../streaming/sources/ConsoleWriter.scala | 80 ++++++++++++++----- .../sources/ConsoleWriterSuite.scala | 26 +++++- 3 files changed, 96 insertions(+), 30 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala index 94820376ff7e7..f2aa3259731d1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala @@ -19,13 +19,12 @@ package org.apache.spark.sql.execution.streaming import java.util.Optional -import scala.collection.JavaConverters._ - import org.apache.spark.sql._ -import org.apache.spark.sql.execution.streaming.sources.ConsoleWriter +import org.apache.spark.sql.execution.streaming.sources.{ConsoleContinuousWriter, ConsoleMicroBatchWriter, ConsoleWriter} import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister} import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} -import org.apache.spark.sql.sources.v2.streaming.MicroBatchWriteSupport +import org.apache.spark.sql.sources.v2.streaming.{ContinuousWriteSupport, MicroBatchWriteSupport} +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType @@ -37,16 +36,25 @@ case class ConsoleRelation(override val sqlContext: SQLContext, data: DataFrame) class ConsoleSinkProvider extends DataSourceV2 with MicroBatchWriteSupport + with ContinuousWriteSupport with DataSourceRegister with CreatableRelationProvider { override def createMicroBatchWriter( queryId: String, - epochId: Long, + batchId: Long, schema: StructType, mode: OutputMode, options: DataSourceV2Options): Optional[DataSourceV2Writer] = { - Optional.of(new ConsoleWriter(epochId, schema, options)) + Optional.of(new ConsoleMicroBatchWriter(batchId, schema, options)) + } + + override def createContinuousWriter( + queryId: String, + schema: StructType, + mode: OutputMode, + options: DataSourceV2Options): Optional[ContinuousWriter] = { + 
Optional.of(new ConsoleContinuousWriter(schema, options)) } def createRelation( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala index 361979984bbec..6fb61dff60045 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala @@ -20,45 +20,85 @@ package org.apache.spark.sql.execution.streaming.sources import org.apache.spark.internal.Logging import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter import org.apache.spark.sql.sources.v2.writer.{DataSourceV2Writer, DataWriterFactory, WriterCommitMessage} import org.apache.spark.sql.types.StructType -/** - * A [[DataSourceV2Writer]] that collects results to the driver and prints them in the console. - * Generated by [[org.apache.spark.sql.execution.streaming.ConsoleSinkProvider]]. - * - * This sink should not be used for production, as it requires sending all rows to the driver - * and does not support recovery. - */ -class ConsoleWriter(batchId: Long, schema: StructType, options: DataSourceV2Options) - extends DataSourceV2Writer with Logging { +/** Common methods used to create writes for the the console sink */ +trait ConsoleWriter extends Logging { + + def options: DataSourceV2Options + // Number of rows to display, by default 20 rows - private val numRowsToShow = options.getInt("numRows", 20) + protected val numRowsToShow = options.getInt("numRows", 20) // Truncate the displayed data if it is too long, by default it is true - private val isTruncated = options.getBoolean("truncate", true) + protected val isTruncated = options.getBoolean("truncate", true) assert(SparkSession.getActiveSession.isDefined) - private val spark = SparkSession.getActiveSession.get + protected val spark = SparkSession.getActiveSession.get + + def createWriterFactory(): DataWriterFactory[Row] = PackedRowWriterFactory - override def createWriterFactory(): DataWriterFactory[Row] = PackedRowWriterFactory + def abort(messages: Array[WriterCommitMessage]): Unit = {} - override def commit(messages: Array[WriterCommitMessage]): Unit = synchronized { - val batch = messages.collect { + protected def printRows( + commitMessages: Array[WriterCommitMessage], + schema: StructType, + printMessage: String): Unit = { + val rows = commitMessages.collect { case PackedRowCommitMessage(rows) => rows }.flatten // scalastyle:off println println("-------------------------------------------") - println(s"Batch: $batchId") + println(printMessage) println("-------------------------------------------") // scalastyle:off println - spark.createDataFrame( - spark.sparkContext.parallelize(batch), schema) + spark + .createDataFrame(spark.sparkContext.parallelize(rows), schema) .show(numRowsToShow, isTruncated) } +} + + +/** + * A [[DataSourceV2Writer]] that collects results from a micro-batch query to the driver and + * prints them in the console. Created by + * [[org.apache.spark.sql.execution.streaming.ConsoleSinkProvider]]. + * + * This sink should not be used for production, as it requires sending all rows to the driver + * and does not support recovery. 
+ */ +class ConsoleMicroBatchWriter(batchId: Long, schema: StructType, val options: DataSourceV2Options) + extends DataSourceV2Writer with ConsoleWriter { + + override def commit(messages: Array[WriterCommitMessage]): Unit = { + printRows(messages, schema, s"Batch: $batchId") + } + + override def toString(): String = { + s"ConsoleMicroBatchWriter[numRows=$numRowsToShow, truncate=$isTruncated]" + } +} - override def abort(messages: Array[WriterCommitMessage]): Unit = {} - override def toString(): String = s"ConsoleWriter[numRows=$numRowsToShow, truncate=$isTruncated]" +/** + * A [[DataSourceV2Writer]] that collects results from a continuous query to the driver and + * prints them in the console. Created by + * [[org.apache.spark.sql.execution.streaming.ConsoleSinkProvider]]. + * + * This sink should not be used for production, as it requires sending all rows to the driver + * and does not support recovery. + */ +class ConsoleContinuousWriter(schema: StructType, val options: DataSourceV2Options) + extends ContinuousWriter with ConsoleWriter { + + override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = { + printRows(messages, schema, s"Continuous processing epoch $epochId") + } + + override def toString(): String = { + s"ConsoleContinuousWriter[numRows=$numRowsToShow, truncate=$isTruncated]" + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriterSuite.scala index 60ffee9b9b42c..55acf2ba28d2f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriterSuite.scala @@ -19,13 +19,15 @@ package org.apache.spark.sql.execution.streaming.sources import java.io.ByteArrayOutputStream +import org.scalatest.time.SpanSugar._ + import org.apache.spark.sql.execution.streaming.MemoryStream -import org.apache.spark.sql.streaming.StreamTest +import org.apache.spark.sql.streaming.{StreamTest, Trigger} class ConsoleWriterSuite extends StreamTest { import testImplicits._ - test("console") { + test("microbatch - default") { val input = MemoryStream[Int] val captured = new ByteArrayOutputStream() @@ -77,7 +79,7 @@ class ConsoleWriterSuite extends StreamTest { |""".stripMargin) } - test("console with numRows") { + test("microbatch - with numRows") { val input = MemoryStream[Int] val captured = new ByteArrayOutputStream() @@ -106,7 +108,7 @@ class ConsoleWriterSuite extends StreamTest { |""".stripMargin) } - test("console with truncation") { + test("microbatch - truncation") { val input = MemoryStream[String] val captured = new ByteArrayOutputStream() @@ -132,4 +134,20 @@ class ConsoleWriterSuite extends StreamTest { | |""".stripMargin) } + + test("continuous - default") { + val captured = new ByteArrayOutputStream() + Console.withOut(captured) { + val input = spark.readStream + .format("rate") + .option("numPartitions", "1") + .option("rowsPerSecond", "5") + .load() + .select('value) + + val query = input.writeStream.format("console").trigger(Trigger.Continuous(200)).start() + assert(query.isActive) + query.stop() + } + } } From b8c6d9303d029f6bf8ee43bae3f159112eb0fb79 Mon Sep 17 00:00:00 2001 From: Andrew Korzhuev Date: Thu, 18 Jan 2018 14:00:12 -0800 Subject: [PATCH 130/566] [SPARK-23133][K8S] Fix passing java options to Executor Pass through spark java options to the executor in context of docker 
image. Closes #20296 andrusha: Deployed two version of containers to local k8s, checked that java options were present in the updated image on the running executor. Manual test Author: Andrew Korzhuev Closes #20322 from foxish/patch-1. (cherry picked from commit f568e9cf76f657d094f1d036ab5a95f2531f5761) Signed-off-by: Marcelo Vanzin --- .../docker/src/main/dockerfiles/spark/entrypoint.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh index 0c28c75857871..b9090dc2852a5 100755 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh @@ -42,7 +42,7 @@ shift 1 SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" env | grep SPARK_JAVA_OPT_ | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt -readarray -t SPARK_DRIVER_JAVA_OPTS < /tmp/java_opts.txt +readarray -t SPARK_JAVA_OPTS < /tmp/java_opts.txt if [ -n "$SPARK_MOUNTED_CLASSPATH" ]; then SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_MOUNTED_CLASSPATH" fi @@ -54,7 +54,7 @@ case "$SPARK_K8S_CMD" in driver) CMD=( ${JAVA_HOME}/bin/java - "${SPARK_DRIVER_JAVA_OPTS[@]}" + "${SPARK_JAVA_OPTS[@]}" -cp "$SPARK_CLASSPATH" -Xms$SPARK_DRIVER_MEMORY -Xmx$SPARK_DRIVER_MEMORY @@ -67,7 +67,7 @@ case "$SPARK_K8S_CMD" in executor) CMD=( ${JAVA_HOME}/bin/java - "${SPARK_EXECUTOR_JAVA_OPTS[@]}" + "${SPARK_JAVA_OPTS[@]}" -Xms$SPARK_EXECUTOR_MEMORY -Xmx$SPARK_EXECUTOR_MEMORY -cp "$SPARK_CLASSPATH" From a295034da6178f8654c3977903435384b3765b5e Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Fri, 19 Jan 2018 07:36:06 +0900 Subject: [PATCH 131/566] [SPARK-23094] Fix invalid character handling in JsonDataSource ## What changes were proposed in this pull request? There were two related fixes regarding `from_json`, `get_json_object` and `json_tuple` ([Fix #1](https://github.com/apache/spark/commit/c8803c06854683c8761fdb3c0e4c55d5a9e22a95), [Fix #2](https://github.com/apache/spark/commit/86174ea89b39a300caaba6baffac70f3dc702788)), but they weren't comprehensive it seems. I wanted to extend those fixes to all the parsers, and add tests for each case. ## How was this patch tested? Regression tests Author: Burak Yavuz Closes #20302 from brkyvz/json-invfix. 
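A hedged PySpark sketch of the behavior the new tests exercise: a record that begins with NUL/control bytes no longer breaks parsing and, under the default PERMISSIVE mode, simply lands in `_corrupt_record`. The literal below mimics the `badJson` fixture added to `JsonHadoopFsRelationSuite`.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Similar to the badJson fixture in the new tests: leading NUL and control bytes.
bad_json = "\x00\x00\x00A\x01AAA"
good_json = '{"a": 1}'

rdd = spark.sparkContext.parallelize([bad_json, good_json])
df = spark.read.json(rdd)

# The well-formed record is parsed; the malformed one is captured in
# _corrupt_record instead of failing the whole read.
df.show(truncate=False)
```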
(cherry picked from commit e01919e834d301e13adc8919932796ebae900576) Signed-off-by: hyukjinkwon --- .../catalyst/json/CreateJacksonParser.scala | 5 +-- .../sources/JsonHadoopFsRelationSuite.scala | 34 +++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala index 025a388aacaa5..b1672e7e2fca2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala @@ -40,10 +40,11 @@ private[sql] object CreateJacksonParser extends Serializable { } def text(jsonFactory: JsonFactory, record: Text): JsonParser = { - jsonFactory.createParser(record.getBytes, 0, record.getLength) + val bain = new ByteArrayInputStream(record.getBytes, 0, record.getLength) + jsonFactory.createParser(new InputStreamReader(bain, "UTF-8")) } def inputStream(jsonFactory: JsonFactory, record: InputStream): JsonParser = { - jsonFactory.createParser(record) + jsonFactory.createParser(new InputStreamReader(record, "UTF-8")) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala index 49be30435ad2f..27f398ebf301a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala @@ -28,6 +28,8 @@ import org.apache.spark.sql.types._ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = "json" + private val badJson = "\u0000\u0000\u0000A\u0001AAA" + // JSON does not write data of NullType and does not play well with BinaryType. 
override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false @@ -105,4 +107,36 @@ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { ) } } + + test("invalid json with leading nulls - from file (multiLine=true)") { + import testImplicits._ + withTempDir { tempDir => + val path = tempDir.getAbsolutePath + Seq(badJson, """{"a":1}""").toDS().write.mode("overwrite").text(path) + val expected = s"""$badJson\n{"a":1}\n""" + val schema = new StructType().add("a", IntegerType).add("_corrupt_record", StringType) + val df = + spark.read.format(dataSourceName).option("multiLine", true).schema(schema).load(path) + checkAnswer(df, Row(null, expected)) + } + } + + test("invalid json with leading nulls - from file (multiLine=false)") { + import testImplicits._ + withTempDir { tempDir => + val path = tempDir.getAbsolutePath + Seq(badJson, """{"a":1}""").toDS().write.mode("overwrite").text(path) + val schema = new StructType().add("a", IntegerType).add("_corrupt_record", StringType) + val df = + spark.read.format(dataSourceName).option("multiLine", false).schema(schema).load(path) + checkAnswer(df, Seq(Row(1, null), Row(null, badJson))) + } + } + + test("invalid json with leading nulls - from dataset") { + import testImplicits._ + checkAnswer( + spark.read.json(Seq(badJson).toDS()), + Row(badJson)) + } } From 7057e310ab3756c83c13586137e8390fe9ef7e9a Mon Sep 17 00:00:00 2001 From: Yinan Li Date: Thu, 18 Jan 2018 14:44:22 -0800 Subject: [PATCH 132/566] [SPARK-22962][K8S] Fail fast if submission client local files are used ## What changes were proposed in this pull request? In the Kubernetes mode, fails fast in the submission process if any submission client local dependencies are used as the use case is not supported yet. ## How was this patch tested? Unit tests, integration tests, and manual tests. vanzin foxish Author: Yinan Li Closes #20320 from liyinan926/master. (cherry picked from commit 5d7c4ba4d73a72f26d591108db3c20b4a6c84f3f) Signed-off-by: Marcelo Vanzin --- docs/running-on-kubernetes.md | 5 ++- .../k8s/submit/DriverConfigOrchestrator.scala | 14 ++++++++- .../DriverConfigOrchestratorSuite.scala | 31 ++++++++++++++++++- 3 files changed, 47 insertions(+), 3 deletions(-) diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index 08ec34c63ba3f..d6b1735ce5550 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -117,7 +117,10 @@ This URI is the location of the example jar that is already in the Docker image. If your application's dependencies are all hosted in remote locations like HDFS or HTTP servers, they may be referred to by their appropriate remote URIs. Also, application dependencies can be pre-mounted into custom-built Docker images. Those dependencies can be added to the classpath by referencing them with `local://` URIs and/or setting the -`SPARK_EXTRA_CLASSPATH` environment variable in your Dockerfiles. +`SPARK_EXTRA_CLASSPATH` environment variable in your Dockerfiles. The `local://` scheme is also required when referring to +dependencies in custom-built Docker images in `spark-submit`. Note that using application dependencies from the submission +client's local file system is currently not yet supported. 
+ ### Using Remote Dependencies When there are application dependencies hosted in remote locations like HDFS or HTTP servers, the driver and executor pods diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala index c9cc300d65569..ae70904621184 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala @@ -20,7 +20,7 @@ import java.util.UUID import com.google.common.primitives.Longs -import org.apache.spark.SparkConf +import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.k8s.{KubernetesUtils, MountSecretsBootstrap} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ @@ -117,6 +117,12 @@ private[spark] class DriverConfigOrchestrator( .map(_.split(",")) .getOrElse(Array.empty[String]) + // TODO(SPARK-23153): remove once submission client local dependencies are supported. + if (existSubmissionLocalFiles(sparkJars) || existSubmissionLocalFiles(sparkFiles)) { + throw new SparkException("The Kubernetes mode does not yet support referencing application " + + "dependencies in the local file system.") + } + val dependencyResolutionStep = if (sparkJars.nonEmpty || sparkFiles.nonEmpty) { Seq(new DependencyResolutionStep( sparkJars, @@ -162,6 +168,12 @@ private[spark] class DriverConfigOrchestrator( initContainerBootstrapStep } + private def existSubmissionLocalFiles(files: Seq[String]): Boolean = { + files.exists { uri => + Utils.resolveURI(uri).getScheme == "file" + } + } + private def existNonContainerLocalFiles(files: Seq[String]): Boolean = { files.exists { uri => Utils.resolveURI(uri).getScheme != "local" diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala index 65274c6f50e01..033d303e946fd 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala @@ -16,7 +16,7 @@ */ package org.apache.spark.deploy.k8s.submit -import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.submit.steps._ @@ -117,6 +117,35 @@ class DriverConfigOrchestratorSuite extends SparkFunSuite { classOf[DriverMountSecretsStep]) } + test("Submission using client local dependencies") { + val sparkConf = new SparkConf(false) + .set(CONTAINER_IMAGE, DRIVER_IMAGE) + var orchestrator = new DriverConfigOrchestrator( + APP_ID, + LAUNCH_TIME, + Some(JavaMainAppResource("file:///var/apps/jars/main.jar")), + APP_NAME, + MAIN_CLASS, + APP_ARGS, + sparkConf) + assertThrows[SparkException] { + orchestrator.getAllConfigurationSteps + } + + sparkConf.set("spark.files", "/path/to/file1,/path/to/file2") + orchestrator = new DriverConfigOrchestrator( + APP_ID, + LAUNCH_TIME, + Some(JavaMainAppResource("local:///var/apps/jars/main.jar")), + APP_NAME, + MAIN_CLASS, + APP_ARGS, 
+ sparkConf) + assertThrows[SparkException] { + orchestrator.getAllConfigurationSteps + } + } + private def validateStepTypes( orchestrator: DriverConfigOrchestrator, types: Class[_ <: DriverConfigurationStep]*): Unit = { From acf3b70d16cc4d2416b4ce3f42b3cf95836170ed Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Thu, 18 Jan 2018 16:29:45 -0800 Subject: [PATCH 133/566] [SPARK-23142][SS][DOCS] Added docs for continuous processing ## What changes were proposed in this pull request? Added documentation for continuous processing. Modified two locations. - Modified the overview to have a mention of Continuous Processing. - Added a new section on Continuous Processing at the end. ![image](https://user-images.githubusercontent.com/663212/35083551-a3dd23f6-fbd4-11e7-9e7e-90866f131ca9.png) ![image](https://user-images.githubusercontent.com/663212/35083618-d844027c-fbd4-11e7-9fde-75992cc517bd.png) ## How was this patch tested? N/A Author: Tathagata Das Closes #20308 from tdas/SPARK-23142. (cherry picked from commit 4cd2ecc0c7222fef1337e04f1948333296c3be86) Signed-off-by: Tathagata Das --- .../structured-streaming-programming-guide.md | 98 ++++++++++++++++++- 1 file changed, 97 insertions(+), 1 deletion(-) diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 1779a4215e085..2ddba2f0d942e 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -10,7 +10,9 @@ title: Structured Streaming Programming Guide # Overview Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. You can express your streaming computation the same way you would express a batch computation on static data. The Spark SQL engine will take care of running it incrementally and continuously and updating the final result as streaming data continues to arrive. You can use the [Dataset/DataFrame API](sql-programming-guide.html) in Scala, Java, Python or R to express streaming aggregations, event-time windows, stream-to-batch joins, etc. The computation is executed on the same optimized Spark SQL engine. Finally, the system ensures end-to-end exactly-once fault-tolerance guarantees through checkpointing and Write Ahead Logs. In short, *Structured Streaming provides fast, scalable, fault-tolerant, end-to-end exactly-once stream processing without the user having to reason about streaming.* -In this guide, we are going to walk you through the programming model and the APIs. First, let's start with a simple example - a streaming word count. +Internally, by default, Structured Streaming queries are processed using a *micro-batch processing* engine, which processes data streams as a series of small batch jobs thereby achieving end-to-end latencies as low as 100 milliseconds and exactly-once fault-tolerance guarantees. However, since Spark 2.3, we have introduced a new low-latency processing mode called **Continuous Processing**, which can achieve end-to-end latencies as low as 1 millisecond with at-least-once guarantees. Without changing the Dataset/DataFrame operations in your queries, you will be able choose the mode based on your application requirements. + +In this guide, we are going to walk you through the programming model and the APIs. We are going to explain the concepts mostly using the default micro-batch processing model, and then [later](#continuous-processing-experimental) discuss Continuous Processing model. 
First, let's start with a simple example of a Structured Streaming query - a streaming word count. # Quick Example Let’s say you want to maintain a running word count of text data received from a data server listening on a TCP socket. Let’s see how you can express this using Structured Streaming. You can see the full code in @@ -2434,6 +2436,100 @@ write.stream(aggDF, "memory", outputMode = "complete", checkpointLocation = "pat
+# Continuous Processing [Experimental]
+**Continuous processing** is a new, experimental streaming execution mode introduced in Spark 2.3 that enables low (~1 ms) end-to-end latency with at-least-once fault-tolerance guarantees. Compare this with the default *micro-batch processing* engine, which can achieve exactly-once guarantees but can only achieve latencies of ~100ms at best. For some types of queries (discussed below), you can choose which mode to execute them in without modifying the application logic (i.e. without changing the DataFrame/Dataset operations).
+
+To run a supported query in continuous processing mode, all you need to do is specify a **continuous trigger** with the desired checkpoint interval as a parameter. For example,
+
+
+{% highlight scala %}
+import org.apache.spark.sql.streaming.Trigger
+
+spark
+  .readStream
+  .format("kafka")
+  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
+  .option("subscribe", "topic1")
+  .load()
+  .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
+  .writeStream
+  .format("kafka")
+  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
+  .option("topic", "topic1")
+  .trigger(Trigger.Continuous("1 second"))  // only change in query
+  .start()
+{% endhighlight %}
+
+{% highlight java %} +import org.apache.spark.sql.streaming.Trigger; + +spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", "host1:port1,host2:port2") + .option("subscribe", "topic1") + .load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .writeStream + .format("kafka") + .option("kafka.bootstrap.servers", "host1:port1,host2:port2") + .option("topic", "topic1") + .trigger(Trigger.Continuous("1 second")) // only change in query + .start(); +{% endhighlight %} +
+
+{% highlight python %} +spark \ + .readStream \ + .format("kafka") \ + .option("kafka.bootstrap.servers", "host1:port1,host2:port2") \ + .option("subscribe", "topic1") \ + .load() \ + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \ + .writeStream \ + .format("kafka") \ + .option("kafka.bootstrap.servers", "host1:port1,host2:port2") \ + .option("topic", "topic1") \ + .trigger(continuous="1 second") \ # only change in query + .start() + +{% endhighlight %} +
+
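+For quick local experimentation without a Kafka cluster, a minimal sketch along the same lines can use the built-in `rate` source and the `console` sink (the option values below are only illustrative):
+
+{% highlight scala %}
+import org.apache.spark.sql.streaming.Trigger
+
+spark
+  .readStream
+  .format("rate")                  // test source that generates (timestamp, value) rows
+  .option("rowsPerSecond", "10")   // only rowsPerSecond and numPartitions are supported in continuous mode
+  .option("numPartitions", "2")
+  .load()
+  .writeStream
+  .format("console")
+  .trigger(Trigger.Continuous("1 second"))
+  .start()
+{% endhighlight %}
+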
+
+A checkpoint interval of 1 second means that the continuous processing engine will record the progress of the query every second. The resulting checkpoints are in a format compatible with the micro-batch engine, hence any query can be restarted with any trigger. For example, a supported query started with the micro-batch mode can be restarted in continuous mode, and vice versa. Note that any time you switch to continuous mode, you will get at-least-once fault-tolerance guarantees.
+
+## Supported Queries
+As of Spark 2.3, only the following types of queries are supported in the continuous processing mode.
+
+- *Operations*: Only map-like Dataset/DataFrame operations are supported in continuous mode, that is, only projections (`select`, `map`, `flatMap`, `mapPartitions`, etc.) and selections (`where`, `filter`, etc.).
+  + All SQL functions are supported except aggregation functions (since aggregations are not yet supported), `current_timestamp()` and `current_date()` (deterministic computations using time are challenging).
+
+- *Sources*:
+  + Kafka source: All options are supported.
+  + Rate source: Good for testing. The only options supported in continuous mode are `numPartitions` and `rowsPerSecond`.
+
+- *Sinks*:
+  + Kafka sink: All options are supported.
+  + Memory sink: Good for debugging.
+  + Console sink: Good for debugging. All options are supported. Note that the console will print output at every checkpoint interval that you have specified in the continuous trigger.
+
+See the [Input Sources](#input-sources) and [Output Sinks](#output-sinks) sections for more details on them. While the console sink is good for testing, the end-to-end low-latency processing can be best observed with Kafka as the source and sink, as this allows the engine to process the data and make the results available in the output topic within milliseconds of the input data being available in the input topic.
+
+## Caveats
+- The continuous processing engine launches multiple long-running tasks that continuously read data from sources, process it and continuously write to sinks. The number of tasks required by the query depends on how many partitions the query can read from the sources in parallel. Therefore, before starting a continuous processing query, you must ensure there are enough cores in the cluster to run all the tasks in parallel. For example, if you are reading from a Kafka topic that has 10 partitions, then the cluster must have at least 10 cores for the query to make progress.
+- Stopping a continuous processing stream may produce spurious task termination warnings. These can be safely ignored.
+- There are currently no automatic retries of failed tasks. Any failure will lead to the query being stopped, and it needs to be manually restarted from the checkpoint.
+
 # Additional Information

 **Further Reading**

From 225b1afdd1582cd4087e7cb98834505eaf16743e Mon Sep 17 00:00:00 2001
From: brandonJY
Date: Thu, 18 Jan 2018 18:57:49 -0600
Subject: [PATCH 134/566] [DOCS] change to dataset for java code in structured-streaming-kafka-integration document

## What changes were proposed in this pull request?

In the latest structured-streaming-kafka-integration document, the Java code examples for Kafka integration use `DataFrame`; shouldn't they be changed to `Dataset`?

## How was this patch tested?

Manual testing has been performed on the updated example Java code in Spark 2.2.1 with Kafka 1.0.

Author: brandonJY

Closes #20312 from brandonJY/patch-2.
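For background, the Scala API sidesteps this question because `DataFrame` there is only a type alias for `Dataset[Row]`; the Java API has no such alias, which is why its examples need the explicit `Dataset` type. A minimal Scala sketch of the equivalence (suitable for pasting into spark-shell; the method names are only illustrative):

```scala
import org.apache.spark.sql.{DataFrame, Dataset, Row}

// `DataFrame` is declared as `type DataFrame = Dataset[Row]` in the
// org.apache.spark.sql package object, so the two forms below name the same type.
def asDataset(df: DataFrame): Dataset[Row] = df
def asDataFrame(ds: Dataset[Row]): DataFrame = ds
```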
(cherry picked from commit 6121e91b7f5c9513d68674e4d5edbc3a4a5fd5fd)
Signed-off-by: Sean Owen
---
 docs/structured-streaming-kafka-integration.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/structured-streaming-kafka-integration.md b/docs/structured-streaming-kafka-integration.md
index bab0be8ddeb9f..461c29ce1ba89 100644
--- a/docs/structured-streaming-kafka-integration.md
+++ b/docs/structured-streaming-kafka-integration.md
@@ -61,7 +61,7 @@ df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
 {% highlight java %}

 // Subscribe to 1 topic
-DataFrame df = spark
+Dataset<Row> df = spark
   .readStream()
   .format("kafka")
   .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
@@ -70,7 +70,7 @@ DataFrame df = spark
 df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

 // Subscribe to multiple topics
-DataFrame df = spark
+Dataset<Row> df = spark
   .readStream()
   .format("kafka")
   .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
@@ -79,7 +79,7 @@ DataFrame df = spark
 df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

 // Subscribe to a pattern
-DataFrame df = spark
+Dataset<Row> df = spark
   .readStream()
   .format("kafka")
   .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
@@ -171,7 +171,7 @@ df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
 {% highlight java %}

 // Subscribe to 1 topic defaults to the earliest and latest offsets
-DataFrame df = spark
+Dataset<Row> df = spark
   .read()
   .format("kafka")
   .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
@@ -180,7 +180,7 @@ DataFrame df = spark
 df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)");

 // Subscribe to multiple topics, specifying explicit Kafka offsets
-DataFrame df = spark
+Dataset<Row> df = spark
   .read()
   .format("kafka")
   .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
@@ -191,7 +191,7 @@ DataFrame df = spark
 df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)");

 // Subscribe to a pattern, at the earliest and latest offsets
-DataFrame df = spark
+Dataset<Row> df = spark
   .read()
   .format("kafka")
   .option("kafka.bootstrap.servers", "host1:port1,host2:port2")

From 541dbc00b24f17d83ea2531970f2e9fe57fe3718 Mon Sep 17 00:00:00 2001
From: Takuya UESHIN
Date: Fri, 19 Jan 2018 11:37:08 +0800
Subject: [PATCH 135/566] [SPARK-23054][SQL][PYSPARK][FOLLOWUP] Use sqlType casting when casting PythonUserDefinedType to String.

## What changes were proposed in this pull request?

This is a follow-up of #20246.

If a UDT in Python doesn't have its corresponding Scala UDT, cast to string will be the raw string of the internal value, e.g. `"org.apache.spark.sql.catalyst.expressions.UnsafeArrayDataxxxxxxxx"` if the internal type is `ArrayType`.

This PR fixes it by using its `sqlType` casting.

## How was this patch tested?

Added a test and existing tests.

Author: Takuya UESHIN

Closes #20306 from ueshin/issues/SPARK-23054/fup1.
(cherry picked from commit 568055da93049c207bb830f244ff9b60c638837c) Signed-off-by: Wenchen Fan --- python/pyspark/sql/tests.py | 11 +++++++++++ .../apache/spark/sql/catalyst/expressions/Cast.scala | 2 ++ .../org/apache/spark/sql/test/ExamplePointUDT.scala | 2 ++ 3 files changed, 15 insertions(+) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 25483594f2725..4fee2ecde391b 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -1189,6 +1189,17 @@ def test_union_with_udt(self): ] ) + def test_cast_to_string_with_udt(self): + from pyspark.sql.tests import ExamplePointUDT, ExamplePoint + from pyspark.sql.functions import col + row = (ExamplePoint(1.0, 2.0), PythonOnlyPoint(3.0, 4.0)) + schema = StructType([StructField("point", ExamplePointUDT(), False), + StructField("pypoint", PythonOnlyUDT(), False)]) + df = self.spark.createDataFrame([row], schema) + + result = df.select(col('point').cast('string'), col('pypoint').cast('string')).head() + self.assertEqual(result, Row(point=u'(1.0, 2.0)', pypoint=u'[3.0, 4.0]')) + def test_column_operators(self): ci = self.df.key cs = self.df.value diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index a95ebe301b9d1..79b051670e9e4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -282,6 +282,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String builder.append("]") builder.build() }) + case pudt: PythonUserDefinedType => castToString(pudt.sqlType) case udt: UserDefinedType[_] => buildCast[Any](_, o => UTF8String.fromString(udt.deserialize(o).toString)) case _ => buildCast[Any](_, o => UTF8String.fromString(o.toString)) @@ -838,6 +839,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String |$evPrim = $buffer.build(); """.stripMargin } + case pudt: PythonUserDefinedType => castToStringCode(pudt.sqlType, ctx) case udt: UserDefinedType[_] => val udtRef = ctx.addReferenceObj("udt", udt) (c, evPrim, evNull) => { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala index a73e4272950a4..8bab7e1c58762 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala @@ -34,6 +34,8 @@ private[sql] class ExamplePoint(val x: Double, val y: Double) extends Serializab case that: ExamplePoint => this.x == that.x && this.y == that.y case _ => false } + + override def toString(): String = s"($x, $y)" } /** From 54c1fae12df654c7713ac5e7eb4da7bb2f785401 Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Fri, 19 Jan 2018 01:38:08 -0800 Subject: [PATCH 136/566] [BUILD][MINOR] Fix java style check issues ## What changes were proposed in this pull request? This patch fixes a few recently introduced java style check errors in master and release branch. As an aside, given that [java linting currently fails](https://github.com/apache/spark/pull/10763 ) on machines with a clean maven cache, it'd be great to find another workaround to [re-enable the java style checks](https://github.com/apache/spark/blob/3a07eff5af601511e97a05e6fea0e3d48f74c4f0/dev/run-tests.py#L577) as part of Spark PRB. 
/cc zsxwing JoshRosen srowen for any suggestions ## How was this patch tested? Manual Check Author: Sameer Agarwal Closes #20323 from sameeragarwal/java. (cherry picked from commit 9c4b99861cda3f9ec44ca8c1adc81a293508190c) Signed-off-by: Sameer Agarwal --- .../spark/sql/sources/v2/writer/DataSourceV2Writer.java | 6 ++++-- .../org/apache/spark/sql/vectorized/ArrowColumnVector.java | 5 +++-- .../apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java | 3 ++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java index 317ac45bcfd74..f1ef411423162 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java @@ -28,8 +28,10 @@ /** * A data source writer that is returned by * {@link WriteSupport#createWriter(String, StructType, SaveMode, DataSourceV2Options)}/ - * {@link org.apache.spark.sql.sources.v2.streaming.MicroBatchWriteSupport#createMicroBatchWriter(String, long, StructType, OutputMode, DataSourceV2Options)}/ - * {@link org.apache.spark.sql.sources.v2.streaming.ContinuousWriteSupport#createContinuousWriter(String, StructType, OutputMode, DataSourceV2Options)}. + * {@link org.apache.spark.sql.sources.v2.streaming.MicroBatchWriteSupport#createMicroBatchWriter( + * String, long, StructType, OutputMode, DataSourceV2Options)}/ + * {@link org.apache.spark.sql.sources.v2.streaming.ContinuousWriteSupport#createContinuousWriter( + * String, StructType, OutputMode, DataSourceV2Options)}. * It can mix in various writing optimization interfaces to speed up the data saving. The actual * writing logic is delegated to {@link DataWriter}. * diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java index eb69001fe677e..bfd1b4cb0ef12 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java @@ -556,8 +556,9 @@ final int getArrayOffset(int rowId) { /** * Any call to "get" method will throw UnsupportedOperationException. * - * Access struct values in a ArrowColumnVector doesn't use this accessor. Instead, it uses getStruct() method defined - * in the parent class. Any call to "get" method in this class is a bug in the code. + * Access struct values in a ArrowColumnVector doesn't use this accessor. Instead, it uses + * getStruct() method defined in the parent class. Any call to "get" method in this class is a + * bug in the code. 
* */ private static class StructAccessor extends ArrowVectorAccessor { diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java index 44e5146d7c553..98d6a53b54d28 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java @@ -69,7 +69,8 @@ public DataReader createDataReader() { ColumnVector[] vectors = new ColumnVector[2]; vectors[0] = i; vectors[1] = j; - this.batch = new ColumnarBatch(new StructType().add("i", "int").add("j", "int"), vectors, BATCH_SIZE); + this.batch = + new ColumnarBatch(new StructType().add("i", "int").add("j", "int"), vectors, BATCH_SIZE); return this; } From e58223171ecae6450482aadf4e7994c3b8d8a58d Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Fri, 19 Jan 2018 12:43:23 +0200 Subject: [PATCH 137/566] [SPARK-23127][DOC] Update FeatureHasher guide for categoricalCols parameter Update user guide entry for `FeatureHasher` to match the Scala / Python doc, to describe the `categoricalCols` parameter. ## How was this patch tested? Doc only Author: Nick Pentreath Closes #20293 from MLnick/SPARK-23127-catCol-userguide. (cherry picked from commit 60203fca6a605ad158184e1e0ce5187e144a3ea7) Signed-off-by: Nick Pentreath --- docs/ml-features.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index 72643137d96b1..10183c3e78c76 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -222,9 +222,9 @@ The `FeatureHasher` transformer operates on multiple columns. Each column may co numeric or categorical features. Behavior and handling of column data types is as follows: - Numeric columns: For numeric features, the hash value of the column name is used to map the -feature value to its index in the feature vector. Numeric features are never treated as -categorical, even when they are integers. You must explicitly convert numeric columns containing -categorical features to strings first. +feature value to its index in the feature vector. By default, numeric features are not treated +as categorical (even when they are integers). To treat them as categorical, specify the relevant +columns using the `categoricalCols` parameter. - String columns: For categorical features, the hash value of the string "column_name=value" is used to map to the vector index, with an indicator value of `1.0`. Thus, categorical features are "one-hot" encoded (similarly to using [OneHotEncoder](ml-features.html#onehotencoder) with From ef7989d55b65f386ed1ab87535a44e9367029a52 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 19 Jan 2018 12:48:42 +0200 Subject: [PATCH 138/566] [SPARK-23048][ML] Add OneHotEncoderEstimator document and examples ## What changes were proposed in this pull request? We have `OneHotEncoderEstimator` now and `OneHotEncoder` will be deprecated since 2.3.0. We should add `OneHotEncoderEstimator` into mllib document. We also need to provide corresponding examples for `OneHotEncoderEstimator` which are used in the document too. ## How was this patch tested? Existing tests. Author: Liang-Chi Hsieh Closes #20257 from viirya/SPARK-23048. 
(cherry picked from commit b74366481cc87490adf4e69d26389ec737548c15) Signed-off-by: Nick Pentreath --- docs/ml-features.md | 28 ++++++++----- ...=> JavaOneHotEncoderEstimatorExample.java} | 41 ++++++++----------- ...py => onehot_encoder_estimator_example.py} | 29 +++++++------ ...la => OneHotEncoderEstimatorExample.scala} | 40 ++++++++---------- 4 files changed, 68 insertions(+), 70 deletions(-) rename examples/src/main/java/org/apache/spark/examples/ml/{JavaOneHotEncoderExample.java => JavaOneHotEncoderEstimatorExample.java} (62%) rename examples/src/main/python/ml/{onehot_encoder_example.py => onehot_encoder_estimator_example.py} (65%) rename examples/src/main/scala/org/apache/spark/examples/ml/{OneHotEncoderExample.scala => OneHotEncoderEstimatorExample.scala} (65%) diff --git a/docs/ml-features.md b/docs/ml-features.md index 10183c3e78c76..466a8fbe99cf6 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -775,35 +775,43 @@ for more details on the API.
-## OneHotEncoder
+## OneHotEncoder (Deprecated since 2.3.0)

-[One-hot encoding](http://en.wikipedia.org/wiki/One-hot) maps a column of label indices to a column of binary vectors, with at most a single one-value. This encoding allows algorithms which expect continuous features, such as Logistic Regression, to use categorical features.
+Because this existing `OneHotEncoder` is a stateless transformer, it is not usable on new data where the number of categories may differ from the training data. In order to fix this, a new `OneHotEncoderEstimator` was created that produces an `OneHotEncoderModel` when fitting. For more detail, please see [SPARK-13030](https://issues.apache.org/jira/browse/SPARK-13030).
+
+`OneHotEncoder` has been deprecated in 2.3.0 and will be removed in 3.0.0. Please use [OneHotEncoderEstimator](ml-features.html#onehotencoderestimator) instead.
+
+## OneHotEncoderEstimator
+
+[One-hot encoding](http://en.wikipedia.org/wiki/One-hot) maps a categorical feature, represented as a label index, to a binary vector with at most a single one-value indicating the presence of a specific feature value from among the set of all feature values. This encoding allows algorithms which expect continuous features, such as Logistic Regression, to use categorical features. For string type input data, it is common to encode categorical features using [StringIndexer](ml-features.html#stringindexer) first.
+
+`OneHotEncoderEstimator` can transform multiple columns, returning a one-hot-encoded output vector column for each input column. It is common to merge these vectors into a single feature vector using [VectorAssembler](ml-features.html#vectorassembler).
+
+`OneHotEncoderEstimator` supports the `handleInvalid` parameter to choose how to handle invalid input when transforming data. Available options include 'keep' (any invalid inputs are assigned to an extra categorical index) and 'error' (throw an error).

 **Examples**
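+As a minimal sketch of the `handleInvalid` behaviour described above (`training` and `newData` stand for any DataFrames with a numeric `categoryIndex` column; the column names are only illustrative):
+
+{% highlight scala %}
+import org.apache.spark.ml.feature.OneHotEncoderEstimator
+
+val encoder = new OneHotEncoderEstimator()
+  .setInputCols(Array("categoryIndex"))
+  .setOutputCols(Array("categoryVec"))
+  .setHandleInvalid("keep")   // category indices unseen during fitting go to an extra bucket
+
+// The model is fitted on training data and can then be applied to new data
+// that may contain category indices never seen during fitting.
+val model = encoder.fit(training)
+val encoded = model.transform(newData)
+{% endhighlight %}
+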
-Refer to the [OneHotEncoder Scala docs](api/scala/index.html#org.apache.spark.ml.feature.OneHotEncoder) -for more details on the API. +Refer to the [OneHotEncoderEstimator Scala docs](api/scala/index.html#org.apache.spark.ml.feature.OneHotEncoderEstimator) for more details on the API. -{% include_example scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala %} +{% include_example scala/org/apache/spark/examples/ml/OneHotEncoderEstimatorExample.scala %}
-Refer to the [OneHotEncoder Java docs](api/java/org/apache/spark/ml/feature/OneHotEncoder.html) +Refer to the [OneHotEncoderEstimator Java docs](api/java/org/apache/spark/ml/feature/OneHotEncoderEstimator.html) for more details on the API. -{% include_example java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java %} +{% include_example java/org/apache/spark/examples/ml/JavaOneHotEncoderEstimatorExample.java %}
-Refer to the [OneHotEncoder Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.OneHotEncoder) -for more details on the API. +Refer to the [OneHotEncoderEstimator Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.OneHotEncoderEstimator) for more details on the API. -{% include_example python/ml/onehot_encoder_example.py %} +{% include_example python/ml/onehot_encoder_estimator_example.py %}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderEstimatorExample.java similarity index 62% rename from examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java rename to examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderEstimatorExample.java index 99af37676ba98..6f93cff94b725 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderEstimatorExample.java @@ -23,9 +23,8 @@ import java.util.Arrays; import java.util.List; -import org.apache.spark.ml.feature.OneHotEncoder; -import org.apache.spark.ml.feature.StringIndexer; -import org.apache.spark.ml.feature.StringIndexerModel; +import org.apache.spark.ml.feature.OneHotEncoderEstimator; +import org.apache.spark.ml.feature.OneHotEncoderModel; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; @@ -35,41 +34,37 @@ import org.apache.spark.sql.types.StructType; // $example off$ -public class JavaOneHotEncoderExample { +public class JavaOneHotEncoderEstimatorExample { public static void main(String[] args) { SparkSession spark = SparkSession .builder() - .appName("JavaOneHotEncoderExample") + .appName("JavaOneHotEncoderEstimatorExample") .getOrCreate(); + // Note: categorical features are usually first encoded with StringIndexer // $example on$ List data = Arrays.asList( - RowFactory.create(0, "a"), - RowFactory.create(1, "b"), - RowFactory.create(2, "c"), - RowFactory.create(3, "a"), - RowFactory.create(4, "a"), - RowFactory.create(5, "c") + RowFactory.create(0.0, 1.0), + RowFactory.create(1.0, 0.0), + RowFactory.create(2.0, 1.0), + RowFactory.create(0.0, 2.0), + RowFactory.create(0.0, 1.0), + RowFactory.create(2.0, 0.0) ); StructType schema = new StructType(new StructField[]{ - new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("category", DataTypes.StringType, false, Metadata.empty()) + new StructField("categoryIndex1", DataTypes.DoubleType, false, Metadata.empty()), + new StructField("categoryIndex2", DataTypes.DoubleType, false, Metadata.empty()) }); Dataset df = spark.createDataFrame(data, schema); - StringIndexerModel indexer = new StringIndexer() - .setInputCol("category") - .setOutputCol("categoryIndex") - .fit(df); - Dataset indexed = indexer.transform(df); + OneHotEncoderEstimator encoder = new OneHotEncoderEstimator() + .setInputCols(new String[] {"categoryIndex1", "categoryIndex2"}) + .setOutputCols(new String[] {"categoryVec1", "categoryVec2"}); - OneHotEncoder encoder = new OneHotEncoder() - .setInputCol("categoryIndex") - .setOutputCol("categoryVec"); - - Dataset encoded = encoder.transform(indexed); + OneHotEncoderModel model = encoder.fit(df); + Dataset encoded = model.transform(df); encoded.show(); // $example off$ diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_estimator_example.py similarity index 65% rename from examples/src/main/python/ml/onehot_encoder_example.py rename to examples/src/main/python/ml/onehot_encoder_estimator_example.py index e1996c7f0a55b..2723e681cea7c 100644 --- a/examples/src/main/python/ml/onehot_encoder_example.py +++ b/examples/src/main/python/ml/onehot_encoder_estimator_example.py @@ -18,32 +18,31 @@ from __future__ import print_function # $example on$ -from 
pyspark.ml.feature import OneHotEncoder, StringIndexer +from pyspark.ml.feature import OneHotEncoderEstimator # $example off$ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("OneHotEncoderExample")\ + .appName("OneHotEncoderEstimatorExample")\ .getOrCreate() + # Note: categorical features are usually first encoded with StringIndexer # $example on$ df = spark.createDataFrame([ - (0, "a"), - (1, "b"), - (2, "c"), - (3, "a"), - (4, "a"), - (5, "c") - ], ["id", "category"]) + (0.0, 1.0), + (1.0, 0.0), + (2.0, 1.0), + (0.0, 2.0), + (0.0, 1.0), + (2.0, 0.0) + ], ["categoryIndex1", "categoryIndex2"]) - stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex") - model = stringIndexer.fit(df) - indexed = model.transform(df) - - encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec") - encoded = encoder.transform(indexed) + encoder = OneHotEncoderEstimator(inputCols=["categoryIndex1", "categoryIndex2"], + outputCols=["categoryVec1", "categoryVec2"]) + model = encoder.fit(df) + encoded = model.transform(df) encoded.show() # $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderEstimatorExample.scala similarity index 65% rename from examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala rename to examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderEstimatorExample.scala index 274cc1268f4d1..45d816808ed8e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderEstimatorExample.scala @@ -19,38 +19,34 @@ package org.apache.spark.examples.ml // $example on$ -import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer} +import org.apache.spark.ml.feature.OneHotEncoderEstimator // $example off$ import org.apache.spark.sql.SparkSession -object OneHotEncoderExample { +object OneHotEncoderEstimatorExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder - .appName("OneHotEncoderExample") + .appName("OneHotEncoderEstimatorExample") .getOrCreate() + // Note: categorical features are usually first encoded with StringIndexer // $example on$ val df = spark.createDataFrame(Seq( - (0, "a"), - (1, "b"), - (2, "c"), - (3, "a"), - (4, "a"), - (5, "c") - )).toDF("id", "category") - - val indexer = new StringIndexer() - .setInputCol("category") - .setOutputCol("categoryIndex") - .fit(df) - val indexed = indexer.transform(df) - - val encoder = new OneHotEncoder() - .setInputCol("categoryIndex") - .setOutputCol("categoryVec") - - val encoded = encoder.transform(indexed) + (0.0, 1.0), + (1.0, 0.0), + (2.0, 1.0), + (0.0, 2.0), + (0.0, 1.0), + (2.0, 0.0) + )).toDF("categoryIndex1", "categoryIndex2") + + val encoder = new OneHotEncoderEstimator() + .setInputCols(Array("categoryIndex1", "categoryIndex2")) + .setOutputCols(Array("categoryVec1", "categoryVec2")) + val model = encoder.fit(df) + + val encoded = model.transform(df) encoded.show() // $example off$ From b7a81999df8f43223403c77db9c1aedddb58370d Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 19 Jan 2018 19:46:48 +0800 Subject: [PATCH 139/566] [SPARK-23089][STS] Recreate session log directory if it doesn't exist ## What changes were proposed in this pull request? When creating a session directory, Thrift should create the parent directory (i.e. 
/tmp/base_session_log_dir) if it is not present. It is common that many tools delete empty directories, so the directory may be deleted. This can cause the session log to be disabled. This was fixed in HIVE-12262: this PR brings it in Spark too. ## How was this patch tested? manual tests Author: Marco Gaido Closes #20281 from mgaido91/SPARK-23089. (cherry picked from commit e41400c3c8aace9eb72e6134173f222627fb0faf) Signed-off-by: Wenchen Fan --- .../hive/service/cli/session/HiveSessionImpl.java | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java index 47bfaa86021d6..108074cce3d6d 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java @@ -223,6 +223,18 @@ private void configureSession(Map sessionConfMap) throws HiveSQL @Override public void setOperationLogSessionDir(File operationLogRootDir) { + if (!operationLogRootDir.exists()) { + LOG.warn("The operation log root directory is removed, recreating: " + + operationLogRootDir.getAbsolutePath()); + if (!operationLogRootDir.mkdirs()) { + LOG.warn("Unable to create operation log root directory: " + + operationLogRootDir.getAbsolutePath()); + } + } + if (!operationLogRootDir.canWrite()) { + LOG.warn("The operation log root directory is not writable: " + + operationLogRootDir.getAbsolutePath()); + } sessionLogDir = new File(operationLogRootDir, sessionHandle.getHandleIdentifier().toString()); isOperationLogEnabled = true; if (!sessionLogDir.exists()) { From 8d6845cf926a14e21ca29a43f2cc9a3a9475afd5 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Fri, 19 Jan 2018 22:47:18 +0800 Subject: [PATCH 140/566] [SPARK-23000][TEST] Keep Derby DB Location Unchanged After Session Cloning ## What changes were proposed in this pull request? After session cloning in `TestHive`, the conf of the singleton SparkContext for derby DB location is changed to a new directory. The new directory is created in `HiveUtils.newTemporaryConfiguration(useInMemoryDerby = false)`. This PR is to keep the conf value of `ConfVars.METASTORECONNECTURLKEY.varname` unchanged during the session clone. ## How was this patch tested? The issue can be reproduced by the command: > build/sbt -Phive "hive/test-only org.apache.spark.sql.hive.HiveSessionStateSuite org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite" Also added a test case. Author: gatorsmile Closes #20328 from gatorsmile/fixTestFailure. 
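The invariant being protected can be sketched as follows (assuming a Hive-enabled `SparkSession` named `spark`; the `ConfVars` constant is the same one used in the fix below):

```scala
import org.apache.hadoop.hive.conf.HiveConf.ConfVars

// The Derby metastore location is carried in this Hadoop conf key.
val key = ConfVars.METASTORECONNECTURLKEY.varname   // "javax.jdo.option.ConnectionURL"
val before = spark.sparkContext.hadoopConfiguration.get(key)

// cloneSession() is private[sql], so a check like this lives in Spark's own
// test sources (see the new test case in the diff below).
val cloned = spark.cloneSession()

val after = spark.sparkContext.hadoopConfiguration.get(key)
assert(before == after, "session cloning must not repoint the Derby metastore directory")
```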
(cherry picked from commit 6c39654efcb2aa8cb4d082ab7277a6fa38fb48e4) Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/SessionStateSuite.scala | 5 +---- .../apache/spark/sql/hive/test/TestHive.scala | 8 +++++++- .../spark/sql/hive/HiveSessionStateSuite.scala | 16 +++++++++++++--- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala index c01666770720c..8386f324d8fed 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql -import org.scalatest.BeforeAndAfterAll -import org.scalatest.BeforeAndAfterEach import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkFunSuite @@ -28,8 +26,7 @@ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.util.QueryExecutionListener -class SessionStateSuite extends SparkFunSuite - with BeforeAndAfterEach with BeforeAndAfterAll { +class SessionStateSuite extends SparkFunSuite { /** * A shared SparkSession for all tests in this suite. Make sure you reset any changes to this diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index b6be00dbb3a73..c84131fc3212a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -180,7 +180,13 @@ private[hive] class TestHiveSparkSession( ConfVars.METASTORE_INTEGER_JDO_PUSHDOWN.varname -> "true", // scratch directory used by Hive's metastore client ConfVars.SCRATCHDIR.varname -> TestHiveContext.makeScratchDir().toURI.toString, - ConfVars.METASTORE_CLIENT_CONNECT_RETRY_DELAY.varname -> "1") + ConfVars.METASTORE_CLIENT_CONNECT_RETRY_DELAY.varname -> "1") ++ + // After session cloning, the JDBC connect string for a JDBC metastore should not be changed. + existingSharedState.map { state => + val connKey = + state.sparkContext.hadoopConfiguration.get(ConfVars.METASTORECONNECTURLKEY.varname) + ConfVars.METASTORECONNECTURLKEY.varname -> connKey + } metastoreTempConf.foreach { case (k, v) => sc.hadoopConfiguration.set(k, v) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSessionStateSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSessionStateSuite.scala index 958ad3e1c3ce8..3d1a0b054dc31 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSessionStateSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSessionStateSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.hive -import org.scalatest.BeforeAndAfterEach +import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.spark.sql._ import org.apache.spark.sql.hive.test.TestHiveSingleton @@ -25,8 +25,7 @@ import org.apache.spark.sql.hive.test.TestHiveSingleton /** * Run all tests from `SessionStateSuite` with a Hive based `SessionState`. 
*/ -class HiveSessionStateSuite extends SessionStateSuite - with TestHiveSingleton with BeforeAndAfterEach { +class HiveSessionStateSuite extends SessionStateSuite with TestHiveSingleton { override def beforeAll(): Unit = { // Reuse the singleton session @@ -38,4 +37,15 @@ class HiveSessionStateSuite extends SessionStateSuite activeSession = null super.afterAll() } + + test("Clone then newSession") { + val sparkSession = hiveContext.sparkSession + val conf = sparkSession.sparkContext.hadoopConfiguration + val oldValue = conf.get(ConfVars.METASTORECONNECTURLKEY.varname) + sparkSession.cloneSession() + sparkSession.sharedState.externalCatalog.client.newSession() + val newValue = conf.get(ConfVars.METASTORECONNECTURLKEY.varname) + assert(oldValue == newValue, + "cloneSession and then newSession should not affect the Derby directory") + } } From 55efeffd774a776806f379df5b2209af05270cc4 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 19 Jan 2018 08:58:21 -0800 Subject: [PATCH 141/566] [SPARK-23149][SQL] polish ColumnarBatch ## What changes were proposed in this pull request? Several cleanups in `ColumnarBatch` * remove `schema`. The `ColumnVector`s inside `ColumnarBatch` already have the data type information, we don't need this `schema`. * remove `capacity`. `ColumnarBatch` is just a wrapper of `ColumnVector`s, not builders, it doesn't need a capacity property. * remove `DEFAULT_BATCH_SIZE`. As a wrapper, `ColumnarBatch` can't decide the batch size, it should be decided by the reader, e.g. parquet reader, orc reader, cached table reader. The default batch size should also be defined by the reader. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #20316 from cloud-fan/columnar-batch. (cherry picked from commit d8aaa771e249b3f54b57ce24763e53fd65a0dbf7) Signed-off-by: gatorsmile --- .../orc/OrcColumnarBatchReader.java | 49 +++++++------------ .../SpecificParquetRecordReaderBase.java | 12 ++--- .../VectorizedParquetRecordReader.java | 24 ++++----- .../vectorized/ColumnVectorUtils.java | 18 +++---- .../spark/sql/vectorized/ColumnarBatch.java | 20 +------- .../VectorizedHashMapGenerator.scala | 2 +- .../sql/execution/arrow/ArrowConverters.scala | 2 +- .../columnar/InMemoryTableScanExec.scala | 5 +- .../python/ArrowEvalPythonExec.scala | 8 +-- .../execution/python/ArrowPythonRunner.scala | 2 +- .../sql/sources/v2/JavaBatchDataSourceV2.java | 3 +- .../vectorized/ColumnarBatchSuite.scala | 7 ++- .../sql/sources/v2/DataSourceV2Suite.scala | 3 +- 13 files changed, 61 insertions(+), 94 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java index 36fdf2bdf84d2..89bae4326e93b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java @@ -49,18 +49,8 @@ * After creating, `initialize` and `initBatch` should be called sequentially. */ public class OrcColumnarBatchReader extends RecordReader { - - /** - * The default size of batch. 
We use this value for ORC reader to make it consistent with Spark's - * columnar batch, because their default batch sizes are different like the following: - * - * - ORC's VectorizedRowBatch.DEFAULT_SIZE = 1024 - * - Spark's ColumnarBatch.DEFAULT_BATCH_SIZE = 4 * 1024 - */ - private static final int DEFAULT_SIZE = 4 * 1024; - - // ORC File Reader - private Reader reader; + // TODO: make this configurable. + private static final int CAPACITY = 4 * 1024; // Vectorized ORC Row Batch private VectorizedRowBatch batch; @@ -98,22 +88,22 @@ public OrcColumnarBatchReader(boolean useOffHeap, boolean copyToSpark) { @Override - public Void getCurrentKey() throws IOException, InterruptedException { + public Void getCurrentKey() { return null; } @Override - public ColumnarBatch getCurrentValue() throws IOException, InterruptedException { + public ColumnarBatch getCurrentValue() { return columnarBatch; } @Override - public float getProgress() throws IOException, InterruptedException { + public float getProgress() throws IOException { return recordReader.getProgress(); } @Override - public boolean nextKeyValue() throws IOException, InterruptedException { + public boolean nextKeyValue() throws IOException { return nextBatch(); } @@ -134,16 +124,15 @@ public void close() throws IOException { * Please note that `initBatch` is needed to be called after this. */ @Override - public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) - throws IOException, InterruptedException { + public void initialize( + InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException { FileSplit fileSplit = (FileSplit)inputSplit; Configuration conf = taskAttemptContext.getConfiguration(); - reader = OrcFile.createReader( + Reader reader = OrcFile.createReader( fileSplit.getPath(), OrcFile.readerOptions(conf) .maxLength(OrcConf.MAX_FILE_LENGTH.getLong(conf)) .filesystem(fileSplit.getPath().getFileSystem(conf))); - Reader.Options options = OrcInputFormat.buildOptions(conf, reader, fileSplit.getStart(), fileSplit.getLength()); recordReader = reader.rows(options); @@ -159,7 +148,7 @@ public void initBatch( StructField[] requiredFields, StructType partitionSchema, InternalRow partitionValues) { - batch = orcSchema.createRowBatch(DEFAULT_SIZE); + batch = orcSchema.createRowBatch(CAPACITY); assert(!batch.selectedInUse); // `selectedInUse` should be initialized with `false`. this.requiredFields = requiredFields; @@ -171,19 +160,17 @@ public void initBatch( resultSchema = resultSchema.add(f); } - int capacity = DEFAULT_SIZE; - if (copyToSpark) { if (MEMORY_MODE == MemoryMode.OFF_HEAP) { - columnVectors = OffHeapColumnVector.allocateColumns(capacity, resultSchema); + columnVectors = OffHeapColumnVector.allocateColumns(CAPACITY, resultSchema); } else { - columnVectors = OnHeapColumnVector.allocateColumns(capacity, resultSchema); + columnVectors = OnHeapColumnVector.allocateColumns(CAPACITY, resultSchema); } // Initialize the missing columns once. for (int i = 0; i < requiredFields.length; i++) { if (requestedColIds[i] == -1) { - columnVectors[i].putNulls(0, capacity); + columnVectors[i].putNulls(0, CAPACITY); columnVectors[i].setIsConstant(); } } @@ -196,7 +183,7 @@ public void initBatch( } } - columnarBatch = new ColumnarBatch(resultSchema, columnVectors, capacity); + columnarBatch = new ColumnarBatch(columnVectors); } else { // Just wrap the ORC column vector instead of copying it to Spark column vector. 
orcVectorWrappers = new org.apache.spark.sql.vectorized.ColumnVector[resultSchema.length()]; @@ -206,8 +193,8 @@ public void initBatch( int colId = requestedColIds[i]; // Initialize the missing columns once. if (colId == -1) { - OnHeapColumnVector missingCol = new OnHeapColumnVector(capacity, dt); - missingCol.putNulls(0, capacity); + OnHeapColumnVector missingCol = new OnHeapColumnVector(CAPACITY, dt); + missingCol.putNulls(0, CAPACITY); missingCol.setIsConstant(); orcVectorWrappers[i] = missingCol; } else { @@ -219,14 +206,14 @@ public void initBatch( int partitionIdx = requiredFields.length; for (int i = 0; i < partitionValues.numFields(); i++) { DataType dt = partitionSchema.fields()[i].dataType(); - OnHeapColumnVector partitionCol = new OnHeapColumnVector(capacity, dt); + OnHeapColumnVector partitionCol = new OnHeapColumnVector(CAPACITY, dt); ColumnVectorUtils.populate(partitionCol, partitionValues, i); partitionCol.setIsConstant(); orcVectorWrappers[partitionIdx + i] = partitionCol; } } - columnarBatch = new ColumnarBatch(resultSchema, orcVectorWrappers, capacity); + columnarBatch = new ColumnarBatch(orcVectorWrappers); } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java index 80c2f491b48ce..e65cd252c3ddf 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java @@ -170,7 +170,7 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptCont * Returns the list of files at 'path' recursively. This skips files that are ignored normally * by MapReduce. */ - public static List listDirectory(File path) throws IOException { + public static List listDirectory(File path) { List result = new ArrayList<>(); if (path.isDirectory()) { for (File f: path.listFiles()) { @@ -231,7 +231,7 @@ protected void initialize(String path, List columns) throws IOException } @Override - public Void getCurrentKey() throws IOException, InterruptedException { + public Void getCurrentKey() { return null; } @@ -259,7 +259,7 @@ public ValuesReaderIntIterator(ValuesReader delegate) { } @Override - int nextInt() throws IOException { + int nextInt() { return delegate.readInteger(); } } @@ -279,15 +279,15 @@ int nextInt() throws IOException { protected static final class NullIntIterator extends IntIterator { @Override - int nextInt() throws IOException { return 0; } + int nextInt() { return 0; } } /** * Creates a reader for definition and repetition levels, returning an optimized one if * the levels are not needed. 
*/ - protected static IntIterator createRLEIterator(int maxLevel, BytesInput bytes, - ColumnDescriptor descriptor) throws IOException { + protected static IntIterator createRLEIterator( + int maxLevel, BytesInput bytes, ColumnDescriptor descriptor) throws IOException { try { if (maxLevel == 0) return new NullIntIterator(); return new RLEIntIterator( diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java index cd745b1f0e4e3..bb1b23611a7d7 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java @@ -50,6 +50,9 @@ * TODO: make this always return ColumnarBatches. */ public class VectorizedParquetRecordReader extends SpecificParquetRecordReaderBase { + // TODO: make this configurable. + private static final int CAPACITY = 4 * 1024; + /** * Batch of rows that we assemble and the current index we've returned. Every time this * batch is used up (batchIdx == numBatched), we populated the batch. @@ -152,7 +155,7 @@ public void close() throws IOException { } @Override - public boolean nextKeyValue() throws IOException, InterruptedException { + public boolean nextKeyValue() throws IOException { resultBatch(); if (returnColumnarBatch) return nextBatch(); @@ -165,13 +168,13 @@ public boolean nextKeyValue() throws IOException, InterruptedException { } @Override - public Object getCurrentValue() throws IOException, InterruptedException { + public Object getCurrentValue() { if (returnColumnarBatch) return columnarBatch; return columnarBatch.getRow(batchIdx - 1); } @Override - public float getProgress() throws IOException, InterruptedException { + public float getProgress() { return (float) rowsReturned / totalRowCount; } @@ -181,7 +184,7 @@ public float getProgress() throws IOException, InterruptedException { // Columns 0,1: data columns // Column 2: partitionValues[0] // Column 3: partitionValues[1] - public void initBatch( + private void initBatch( MemoryMode memMode, StructType partitionColumns, InternalRow partitionValues) { @@ -195,13 +198,12 @@ public void initBatch( } } - int capacity = ColumnarBatch.DEFAULT_BATCH_SIZE; if (memMode == MemoryMode.OFF_HEAP) { - columnVectors = OffHeapColumnVector.allocateColumns(capacity, batchSchema); + columnVectors = OffHeapColumnVector.allocateColumns(CAPACITY, batchSchema); } else { - columnVectors = OnHeapColumnVector.allocateColumns(capacity, batchSchema); + columnVectors = OnHeapColumnVector.allocateColumns(CAPACITY, batchSchema); } - columnarBatch = new ColumnarBatch(batchSchema, columnVectors, capacity); + columnarBatch = new ColumnarBatch(columnVectors); if (partitionColumns != null) { int partitionIdx = sparkSchema.fields().length; for (int i = 0; i < partitionColumns.fields().length; i++) { @@ -213,13 +215,13 @@ public void initBatch( // Initialize missing columns with nulls. 
for (int i = 0; i < missingColumns.length; i++) { if (missingColumns[i]) { - columnVectors[i].putNulls(0, columnarBatch.capacity()); + columnVectors[i].putNulls(0, CAPACITY); columnVectors[i].setIsConstant(); } } } - public void initBatch() { + private void initBatch() { initBatch(MEMORY_MODE, null, null); } @@ -255,7 +257,7 @@ public boolean nextBatch() throws IOException { if (rowsReturned >= totalRowCount) return false; checkEndOfRowGroup(); - int num = (int) Math.min((long) columnarBatch.capacity(), totalCountLoadedSoFar - rowsReturned); + int num = (int) Math.min((long) CAPACITY, totalCountLoadedSoFar - rowsReturned); for (int i = 0; i < columnReaders.length; ++i) { if (columnReaders[i] == null) continue; columnReaders[i].readBatch(num, columnVectors[i]); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java index b5cbe8e2839ba..5ee8cc8da2309 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java @@ -118,19 +118,19 @@ private static void appendValue(WritableColumnVector dst, DataType t, Object o) } } else { if (t == DataTypes.BooleanType) { - dst.appendBoolean(((Boolean)o).booleanValue()); + dst.appendBoolean((Boolean) o); } else if (t == DataTypes.ByteType) { - dst.appendByte(((Byte) o).byteValue()); + dst.appendByte((Byte) o); } else if (t == DataTypes.ShortType) { - dst.appendShort(((Short)o).shortValue()); + dst.appendShort((Short) o); } else if (t == DataTypes.IntegerType) { - dst.appendInt(((Integer)o).intValue()); + dst.appendInt((Integer) o); } else if (t == DataTypes.LongType) { - dst.appendLong(((Long)o).longValue()); + dst.appendLong((Long) o); } else if (t == DataTypes.FloatType) { - dst.appendFloat(((Float)o).floatValue()); + dst.appendFloat((Float) o); } else if (t == DataTypes.DoubleType) { - dst.appendDouble(((Double)o).doubleValue()); + dst.appendDouble((Double) o); } else if (t == DataTypes.StringType) { byte[] b =((String)o).getBytes(StandardCharsets.UTF_8); dst.appendByteArray(b, 0, b.length); @@ -192,7 +192,7 @@ private static void appendValue(WritableColumnVector dst, DataType t, Row src, i */ public static ColumnarBatch toBatch( StructType schema, MemoryMode memMode, Iterator row) { - int capacity = ColumnarBatch.DEFAULT_BATCH_SIZE; + int capacity = 4 * 1024; WritableColumnVector[] columnVectors; if (memMode == MemoryMode.OFF_HEAP) { columnVectors = OffHeapColumnVector.allocateColumns(capacity, schema); @@ -208,7 +208,7 @@ public static ColumnarBatch toBatch( } n++; } - ColumnarBatch batch = new ColumnarBatch(schema, columnVectors, capacity); + ColumnarBatch batch = new ColumnarBatch(columnVectors); batch.setNumRows(n); return batch; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java index 9ae1c6d9993f0..4dc826cf60c15 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java @@ -20,7 +20,6 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.execution.vectorized.MutableColumnarRow; -import org.apache.spark.sql.types.StructType; /** * This class wraps multiple ColumnVectors as a row-wise table. 
It provides a row view of this @@ -28,10 +27,6 @@ * the entire data loading process. */ public final class ColumnarBatch { - public static final int DEFAULT_BATCH_SIZE = 4 * 1024; - - private final StructType schema; - private final int capacity; private int numRows; private final ColumnVector[] columns; @@ -82,7 +77,6 @@ public void remove() { * Sets the number of rows in this batch. */ public void setNumRows(int numRows) { - assert(numRows <= this.capacity); this.numRows = numRows; } @@ -96,16 +90,6 @@ public void setNumRows(int numRows) { */ public int numRows() { return numRows; } - /** - * Returns the schema that makes up this batch. - */ - public StructType schema() { return schema; } - - /** - * Returns the max capacity (in number of rows) for this batch. - */ - public int capacity() { return capacity; } - /** * Returns the column at `ordinal`. */ @@ -120,10 +104,8 @@ public InternalRow getRow(int rowId) { return row; } - public ColumnarBatch(StructType schema, ColumnVector[] columns, int capacity) { - this.schema = schema; + public ColumnarBatch(ColumnVector[] columns) { this.columns = columns; - this.capacity = capacity; this.row = new MutableColumnarRow(columns); } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala index 0cf9b53ce1d5d..eb48584d0c1ee 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala @@ -94,7 +94,7 @@ class VectorizedHashMapGenerator( | | public $generatedClassName() { | vectors = ${classOf[OnHeapColumnVector].getName}.allocateColumns(capacity, schema); - | batch = new ${classOf[ColumnarBatch].getName}(schema, vectors, capacity); + | batch = new ${classOf[ColumnarBatch].getName}(vectors); | | // Generates a projection to return the aggregate buffer only. 
| ${classOf[OnHeapColumnVector].getName}[] aggBufferVectors = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala index bcd1aa0890ba3..7487564ed64da 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala @@ -175,7 +175,7 @@ private[sql] object ArrowConverters { new ArrowColumnVector(vector).asInstanceOf[ColumnVector] }.toArray - val batch = new ColumnarBatch(schemaRead, columns, root.getRowCount) + val batch = new ColumnarBatch(columns) batch.setNumRows(root.getRowCount) batch.rowIterator().asScala } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index 3565ee3af1b9f..28b3875505cd2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -78,11 +78,10 @@ case class InMemoryTableScanExec( } else { OffHeapColumnVector.allocateColumns(rowCount, columnarBatchSchema) } - val columnarBatch = new ColumnarBatch( - columnarBatchSchema, columnVectors.asInstanceOf[Array[ColumnVector]], rowCount) + val columnarBatch = new ColumnarBatch(columnVectors.asInstanceOf[Array[ColumnVector]]) columnarBatch.setNumRows(rowCount) - for (i <- 0 until attributes.length) { + for (i <- attributes.indices) { ColumnAccessor.decompress( cachedColumnarBatch.buffers(columnIndices(i)), columnarBatch.column(i).asInstanceOf[WritableColumnVector], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala index c06bc7b66ff39..47b146f076b62 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala @@ -74,8 +74,7 @@ case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], chi schema: StructType, context: TaskContext): Iterator[InternalRow] = { - val schemaOut = StructType.fromAttributes(output.drop(child.output.length).zipWithIndex - .map { case (attr, i) => attr.withName(s"_$i") }) + val outputTypes = output.drop(child.output.length).map(_.dataType) // DO NOT use iter.grouped(). See BatchIterator. 
val batchIter = if (batchSize > 0) new BatchIterator(iter, batchSize) else Iterator(iter) @@ -90,8 +89,9 @@ case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], chi private var currentIter = if (columnarBatchIter.hasNext) { val batch = columnarBatchIter.next() - assert(schemaOut.equals(batch.schema), - s"Invalid schema from pandas_udf: expected $schemaOut, got ${batch.schema}") + val actualDataTypes = (0 until batch.numCols()).map(i => batch.column(i).dataType()) + assert(outputTypes == actualDataTypes, "Invalid schema from pandas_udf: " + + s"expected ${outputTypes.mkString(", ")}, got ${actualDataTypes.mkString(", ")}") batch.rowIterator.asScala } else { Iterator.empty diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala index dc5ba96e69aec..5fcdcddca7d51 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala @@ -138,7 +138,7 @@ class ArrowPythonRunner( if (reader != null && batchLoaded) { batchLoaded = reader.loadNextBatch() if (batchLoaded) { - val batch = new ColumnarBatch(schema, vectors, root.getRowCount) + val batch = new ColumnarBatch(vectors) batch.setNumRows(root.getRowCount) batch } else { diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java index 98d6a53b54d28..a5d77a90ece42 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java @@ -69,8 +69,7 @@ public DataReader createDataReader() { ColumnVector[] vectors = new ColumnVector[2]; vectors[0] = i; vectors[1] = j; - this.batch = - new ColumnarBatch(new StructType().add("i", "int").add("j", "int"), vectors, BATCH_SIZE); + this.batch = new ColumnarBatch(vectors); return this; } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 675f06b31b970..cd90681ecabc6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -875,14 +875,13 @@ class ColumnarBatchSuite extends SparkFunSuite { .add("intCol2", IntegerType) .add("string", BinaryType) - val capacity = ColumnarBatch.DEFAULT_BATCH_SIZE + val capacity = 4 * 1024 val columns = schema.fields.map { field => allocate(capacity, field.dataType, memMode) } - val batch = new ColumnarBatch(schema, columns.toArray, ColumnarBatch.DEFAULT_BATCH_SIZE) + val batch = new ColumnarBatch(columns.toArray) assert(batch.numCols() == 4) assert(batch.numRows() == 0) - assert(batch.capacity() > 0) assert(batch.rowIterator().hasNext == false) // Add a row [1, 1.1, NULL] @@ -1153,7 +1152,7 @@ class ColumnarBatchSuite extends SparkFunSuite { val columnVectors = Seq(new ArrowColumnVector(vector1), new ArrowColumnVector(vector2)) val schema = StructType(Seq(StructField("int1", IntegerType), StructField("int2", IntegerType))) - val batch = new ColumnarBatch(schema, columnVectors.toArray[ColumnVector], 11) + val batch = new ColumnarBatch(columnVectors.toArray) 
batch.setNumRows(11) assert(batch.numCols() == 2) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index a89f7c55bf4f7..0ca29524c6d05 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala @@ -333,8 +333,7 @@ class BatchReadTask(start: Int, end: Int) private final val BATCH_SIZE = 20 private lazy val i = new OnHeapColumnVector(BATCH_SIZE, IntegerType) private lazy val j = new OnHeapColumnVector(BATCH_SIZE, IntegerType) - private lazy val batch = new ColumnarBatch( - new StructType().add("i", "int").add("j", "int"), Array(i, j), BATCH_SIZE) + private lazy val batch = new ColumnarBatch(Array(i, j)) private var current = start From ffe45913d0c666185f8c252be30b5e269a909c07 Mon Sep 17 00:00:00 2001 From: foxish Date: Fri, 19 Jan 2018 10:23:13 -0800 Subject: [PATCH 142/566] [SPARK-23104][K8S][DOCS] Changes to Kubernetes scheduler documentation ## What changes were proposed in this pull request? Docs changes: - Adding a warning that the backend is experimental. - Removing a defunct internal-only option from documentation - Clarifying that node selectors can be used right away, and other minor cosmetic changes ## How was this patch tested? Docs only change Author: foxish Closes #20314 from foxish/ambiguous-docs. (cherry picked from commit 73d3b230f3816a854a181c0912d87b180e347271) Signed-off-by: Marcelo Vanzin --- docs/cluster-overview.md | 4 ++-- docs/running-on-kubernetes.md | 43 ++++++++++++++++------------------- 2 files changed, 22 insertions(+), 25 deletions(-) diff --git a/docs/cluster-overview.md b/docs/cluster-overview.md index 658e67f99dd71..7277e2fb2731d 100644 --- a/docs/cluster-overview.md +++ b/docs/cluster-overview.md @@ -52,8 +52,8 @@ The system currently supports three cluster managers: * [Apache Mesos](running-on-mesos.html) -- a general cluster manager that can also run Hadoop MapReduce and service applications. * [Hadoop YARN](running-on-yarn.html) -- the resource manager in Hadoop 2. -* [Kubernetes](running-on-kubernetes.html) -- [Kubernetes](https://kubernetes.io/docs/concepts/overview/what-is-kubernetes/) -is an open-source platform that provides container-centric infrastructure. +* [Kubernetes](running-on-kubernetes.html) -- an open-source system for automating deployment, scaling, + and management of containerized applications. A third-party project (not supported by the Spark project) exists to add support for [Nomad](https://github.com/hashicorp/nomad-spark) as a cluster manager. diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index d6b1735ce5550..3c7586e8544ba 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -8,6 +8,10 @@ title: Running Spark on Kubernetes Spark can run on clusters managed by [Kubernetes](https://kubernetes.io). This feature makes use of native Kubernetes scheduler that has been added to Spark. +**The Kubernetes scheduler is currently experimental. +In future versions, there may be behavioral changes around configuration, +container images and entrypoints.** + # Prerequisites * A runnable distribution of Spark 2.3 or above. @@ -41,11 +45,10 @@ logs and remains in "completed" state in the Kubernetes API until it's eventuall Note that in the completed state, the driver pod does *not* use any computational or memory resources. 
-The driver and executor pod scheduling is handled by Kubernetes. It will be possible to affect Kubernetes scheduling -decisions for driver and executor pods using advanced primitives like -[node selectors](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector) -and [node/pod affinities](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity) -in a future release. +The driver and executor pod scheduling is handled by Kubernetes. It is possible to schedule the +driver and executor pods on a subset of available nodes through a [node selector](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector) +using the configuration property for it. It will be possible to use more advanced +scheduling hints like [node/pod affinities](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity) in a future release. # Submitting Applications to Kubernetes @@ -62,8 +65,10 @@ use with the Kubernetes backend. Example usage is: - ./bin/docker-image-tool.sh -r -t my-tag build - ./bin/docker-image-tool.sh -r -t my-tag push +```bash +$ ./bin/docker-image-tool.sh -r -t my-tag build +$ ./bin/docker-image-tool.sh -r -t my-tag push +``` ## Cluster Mode @@ -94,7 +99,7 @@ must consist of lower case alphanumeric characters, `-`, and `.` and must start If you have a Kubernetes cluster setup, one way to discover the apiserver URL is by executing `kubectl cluster-info`. ```bash -kubectl cluster-info +$ kubectl cluster-info Kubernetes master is running at http://127.0.0.1:6443 ``` @@ -105,7 +110,7 @@ authenticating proxy, `kubectl proxy` to communicate to the Kubernetes API. The local proxy can be started by: ```bash -kubectl proxy +$ kubectl proxy ``` If the local proxy is running at localhost:8001, `--master k8s://http://127.0.0.1:8001` can be used as the argument to @@ -173,7 +178,7 @@ Logs can be accessed using the Kubernetes API and the `kubectl` CLI. When a Spar to stream logs from the application using: ```bash -kubectl -n= logs -f +$ kubectl -n= logs -f ``` The same logs can also be accessed through the @@ -186,7 +191,7 @@ The UI associated with any application can be accessed locally using [`kubectl port-forward`](https://kubernetes.io/docs/tasks/access-application-cluster/port-forward-access-application-cluster/#forward-a-local-port-to-a-port-on-the-pod). ```bash -kubectl port-forward 4040:4040 +$ kubectl port-forward 4040:4040 ``` Then, the Spark driver UI can be accessed on `http://localhost:4040`. @@ -200,13 +205,13 @@ are errors during the running of the application, often, the best way to investi To get some basic information about the scheduling decisions made around the driver pod, you can run: ```bash -kubectl describe pod +$ kubectl describe pod ``` If the pod has encountered a runtime error, the status can be probed further using: ```bash -kubectl logs +$ kubectl logs ``` Status and logs of failed executor pods can be checked in similar ways. Finally, deleting the driver pod will clean up the entire spark @@ -254,7 +259,7 @@ To create a custom service account, a user can use the `kubectl create serviceac following command creates a service account named `spark`: ```bash -kubectl create serviceaccount spark +$ kubectl create serviceaccount spark ``` To grant a service account a `Role` or `ClusterRole`, a `RoleBinding` or `ClusterRoleBinding` is needed. To create @@ -263,7 +268,7 @@ for `ClusterRoleBinding`) command. 
For example, the following command creates an namespace and grants it to the `spark` service account created above: ```bash -kubectl create clusterrolebinding spark-role --clusterrole=edit --serviceaccount=default:spark --namespace=default +$ kubectl create clusterrolebinding spark-role --clusterrole=edit --serviceaccount=default:spark --namespace=default ``` Note that a `Role` can only be used to grant access to resources (like pods) within a single namespace, whereas a @@ -543,14 +548,6 @@ specific to Spark on Kubernetes. to avoid name conflicts. - - spark.kubernetes.executor.podNamePrefix - (none) - - Prefix for naming the executor pods. - If not set, the executor pod name is set to driver pod name suffixed by an integer. - - spark.kubernetes.executor.lostCheck.maxAttempts 10 From 4b79514c90ca76674d17fd80d125e9dbfb0e845e Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 19 Jan 2018 13:26:37 -0600 Subject: [PATCH 143/566] [SPARK-20664][CORE] Delete stale application data from SHS. Detect the deletion of event log files from storage, and remove data about the related application attempt in the SHS. Also contains code to fix SPARK-21571 based on code by ericvandenbergfb. Author: Marcelo Vanzin Closes #20138 from vanzin/SPARK-20664. (cherry picked from commit fed2139f053fac4a9a6952ff0ab1cc2a5f657bd0) Signed-off-by: Imran Rashid --- .../deploy/history/FsHistoryProvider.scala | 297 +++++++++++------- .../history/FsHistoryProviderSuite.scala | 117 ++++++- .../deploy/history/HistoryServerSuite.scala | 4 +- 3 files changed, 306 insertions(+), 112 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 94c80ebd55e74..f9d0b5ee4e23e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -19,7 +19,7 @@ package org.apache.spark.deploy.history import java.io.{File, FileNotFoundException, IOException} import java.util.{Date, ServiceLoader, UUID} -import java.util.concurrent.{Executors, ExecutorService, Future, TimeUnit} +import java.util.concurrent.{ExecutorService, TimeUnit} import java.util.zip.{ZipEntry, ZipOutputStream} import scala.collection.JavaConverters._ @@ -29,7 +29,7 @@ import scala.xml.Node import com.fasterxml.jackson.annotation.JsonIgnore import com.google.common.io.ByteStreams -import com.google.common.util.concurrent.{MoreExecutors, ThreadFactoryBuilder} +import com.google.common.util.concurrent.MoreExecutors import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.fs.permission.FsAction import org.apache.hadoop.hdfs.DistributedFileSystem @@ -116,8 +116,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // Used by check event thread and clean log thread. // Scheduled thread pool size must be one, otherwise it will have concurrent issues about fs // and applications between check task and clean task. - private val pool = Executors.newScheduledThreadPool(1, new ThreadFactoryBuilder() - .setNameFormat("spark-history-task-%d").setDaemon(true).build()) + private val pool = ThreadUtils.newDaemonSingleThreadScheduledExecutor("spark-history-task-%d") // The modification time of the newest log detected during the last scan. 
Currently only // used for logging msgs (logs are re-scanned based on file size, rather than modtime) @@ -174,7 +173,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) * Fixed size thread pool to fetch and parse log files. */ private val replayExecutor: ExecutorService = { - if (!conf.contains("spark.testing")) { + if (Utils.isTesting) { ThreadUtils.newDaemonFixedThreadPool(NUM_PROCESSING_THREADS, "log-replay-executor") } else { MoreExecutors.sameThreadExecutor() @@ -275,7 +274,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) try { Some(load(appId).toApplicationInfo()) } catch { - case e: NoSuchElementException => + case _: NoSuchElementException => None } } @@ -405,49 +404,70 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) try { val newLastScanTime = getNewLastScanTime() logDebug(s"Scanning $logDir with lastScanTime==$lastScanTime") - // scan for modified applications, replay and merge them - val logInfos = Option(fs.listStatus(new Path(logDir))).map(_.toSeq).getOrElse(Nil) + + val updated = Option(fs.listStatus(new Path(logDir))).map(_.toSeq).getOrElse(Nil) .filter { entry => !entry.isDirectory() && // FsHistoryProvider generates a hidden file which can't be read. Accidentally // reading a garbage file is safe, but we would log an error which can be scary to // the end-user. !entry.getPath().getName().startsWith(".") && - SparkHadoopUtil.get.checkAccessPermission(entry, FsAction.READ) && - recordedFileSize(entry.getPath()) < entry.getLen() + SparkHadoopUtil.get.checkAccessPermission(entry, FsAction.READ) + } + .filter { entry => + try { + val info = listing.read(classOf[LogInfo], entry.getPath().toString()) + if (info.fileSize < entry.getLen()) { + // Log size has changed, it should be parsed. + true + } else { + // If the SHS view has a valid application, update the time the file was last seen so + // that the entry is not deleted from the SHS listing. + if (info.appId.isDefined) { + listing.write(info.copy(lastProcessed = newLastScanTime)) + } + false + } + } catch { + case _: NoSuchElementException => + // If the file is currently not being tracked by the SHS, add an entry for it and try + // to parse it. This will allow the cleaner code to detect the file as stale later on + // if it was not possible to parse it. + listing.write(LogInfo(entry.getPath().toString(), newLastScanTime, None, None, + entry.getLen())) + entry.getLen() > 0 + } } .sortWith { case (entry1, entry2) => entry1.getModificationTime() > entry2.getModificationTime() } - if (logInfos.nonEmpty) { - logDebug(s"New/updated attempts found: ${logInfos.size} ${logInfos.map(_.getPath)}") + if (updated.nonEmpty) { + logDebug(s"New/updated attempts found: ${updated.size} ${updated.map(_.getPath)}") } - var tasks = mutable.ListBuffer[Future[_]]() - - try { - for (file <- logInfos) { - tasks += replayExecutor.submit(new Runnable { - override def run(): Unit = mergeApplicationListing(file) + val tasks = updated.map { entry => + try { + replayExecutor.submit(new Runnable { + override def run(): Unit = mergeApplicationListing(entry, newLastScanTime) }) + } catch { + // let the iteration over the updated entries break, since an exception on + // replayExecutor.submit (..) 
indicates the ExecutorService is unable + // to take any more submissions at this time + case e: Exception => + logError(s"Exception while submitting event log for replay", e) + null } - } catch { - // let the iteration over logInfos break, since an exception on - // replayExecutor.submit (..) indicates the ExecutorService is unable - // to take any more submissions at this time - - case e: Exception => - logError(s"Exception while submitting event log for replay", e) - } + }.filter(_ != null) pendingReplayTasksCount.addAndGet(tasks.size) + // Wait for all tasks to finish. This makes sure that checkForLogs + // is not scheduled again while some tasks are already running in + // the replayExecutor. tasks.foreach { task => try { - // Wait for all tasks to finish. This makes sure that checkForLogs - // is not scheduled again while some tasks are already running in - // the replayExecutor. task.get() } catch { case e: InterruptedException => @@ -459,13 +479,70 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) } } + // Delete all information about applications whose log files disappeared from storage. + // This is done by identifying the event logs which were not touched by the current + // directory scan. + // + // Only entries with valid applications are cleaned up here. Cleaning up invalid log + // files is done by the periodic cleaner task. + val stale = listing.view(classOf[LogInfo]) + .index("lastProcessed") + .last(newLastScanTime - 1) + .asScala + .toList + stale.foreach { log => + log.appId.foreach { appId => + cleanAppData(appId, log.attemptId, log.logPath) + listing.delete(classOf[LogInfo], log.logPath) + } + } + lastScanTime.set(newLastScanTime) } catch { case e: Exception => logError("Exception in checking for event log updates", e) } } - private def getNewLastScanTime(): Long = { + private def cleanAppData(appId: String, attemptId: Option[String], logPath: String): Unit = { + try { + val app = load(appId) + val (attempt, others) = app.attempts.partition(_.info.attemptId == attemptId) + + assert(attempt.isEmpty || attempt.size == 1) + val isStale = attempt.headOption.exists { a => + if (a.logPath != new Path(logPath).getName()) { + // If the log file name does not match, then probably the old log file was from an + // in progress application. Just return that the app should be left alone. + false + } else { + val maybeUI = synchronized { + activeUIs.remove(appId -> attemptId) + } + + maybeUI.foreach { ui => + ui.invalidate() + ui.ui.store.close() + } + + diskManager.foreach(_.release(appId, attemptId, delete = true)) + true + } + } + + if (isStale) { + if (others.nonEmpty) { + val newAppInfo = new ApplicationInfoWrapper(app.info, others) + listing.write(newAppInfo) + } else { + listing.delete(classOf[ApplicationInfoWrapper], appId) + } + } + } catch { + case _: NoSuchElementException => + } + } + + private[history] def getNewLastScanTime(): Long = { val fileName = "." + UUID.randomUUID().toString val path = new Path(logDir, fileName) val fos = fs.create(path) @@ -530,7 +607,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) /** * Replay the given log file, saving the application in the listing db. 
*/ - protected def mergeApplicationListing(fileStatus: FileStatus): Unit = { + protected def mergeApplicationListing(fileStatus: FileStatus, scanTime: Long): Unit = { val eventsFilter: ReplayEventsFilter = { eventString => eventString.startsWith(APPL_START_EVENT_PREFIX) || eventString.startsWith(APPL_END_EVENT_PREFIX) || @@ -544,73 +621,78 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) bus.addListener(listener) replay(fileStatus, bus, eventsFilter = eventsFilter) - listener.applicationInfo.foreach { app => - // Invalidate the existing UI for the reloaded app attempt, if any. See LoadedAppUI for a - // discussion on the UI lifecycle. - synchronized { - activeUIs.get((app.info.id, app.attempts.head.info.attemptId)).foreach { ui => - ui.invalidate() - ui.ui.store.close() + val (appId, attemptId) = listener.applicationInfo match { + case Some(app) => + // Invalidate the existing UI for the reloaded app attempt, if any. See LoadedAppUI for a + // discussion on the UI lifecycle. + synchronized { + activeUIs.get((app.info.id, app.attempts.head.info.attemptId)).foreach { ui => + ui.invalidate() + ui.ui.store.close() + } } - } - addListing(app) + addListing(app) + (Some(app.info.id), app.attempts.head.info.attemptId) + + case _ => + // If the app hasn't written down its app ID to the logs, still record the entry in the + // listing db, with an empty ID. This will make the log eligible for deletion if the app + // does not make progress after the configured max log age. + (None, None) } - listing.write(new LogInfo(logPath.toString(), fileStatus.getLen())) + listing.write(LogInfo(logPath.toString(), scanTime, appId, attemptId, fileStatus.getLen())) } /** * Delete event logs from the log directory according to the clean policy defined by the user. */ - private[history] def cleanLogs(): Unit = { - var iterator: Option[KVStoreIterator[ApplicationInfoWrapper]] = None - try { - val maxTime = clock.getTimeMillis() - conf.get(MAX_LOG_AGE_S) * 1000 - - // Iterate descending over all applications whose oldest attempt happened before maxTime. - iterator = Some(listing.view(classOf[ApplicationInfoWrapper]) - .index("oldestAttempt") - .reverse() - .first(maxTime) - .closeableIterator()) - - iterator.get.asScala.foreach { app => - // Applications may have multiple attempts, some of which may not need to be deleted yet. - val (remaining, toDelete) = app.attempts.partition { attempt => - attempt.info.lastUpdated.getTime() >= maxTime - } + private[history] def cleanLogs(): Unit = Utils.tryLog { + val maxTime = clock.getTimeMillis() - conf.get(MAX_LOG_AGE_S) * 1000 - if (remaining.nonEmpty) { - val newApp = new ApplicationInfoWrapper(app.info, remaining) - listing.write(newApp) - } + val expired = listing.view(classOf[ApplicationInfoWrapper]) + .index("oldestAttempt") + .reverse() + .first(maxTime) + .asScala + .toList + expired.foreach { app => + // Applications may have multiple attempts, some of which may not need to be deleted yet. 
+ val (remaining, toDelete) = app.attempts.partition { attempt => + attempt.info.lastUpdated.getTime() >= maxTime + } - toDelete.foreach { attempt => - val logPath = new Path(logDir, attempt.logPath) - try { - listing.delete(classOf[LogInfo], logPath.toString()) - } catch { - case _: NoSuchElementException => - logDebug(s"Log info entry for $logPath not found.") - } - try { - fs.delete(logPath, true) - } catch { - case e: AccessControlException => - logInfo(s"No permission to delete ${attempt.logPath}, ignoring.") - case t: IOException => - logError(s"IOException in cleaning ${attempt.logPath}", t) - } - } + if (remaining.nonEmpty) { + val newApp = new ApplicationInfoWrapper(app.info, remaining) + listing.write(newApp) + } - if (remaining.isEmpty) { - listing.delete(app.getClass(), app.id) - } + toDelete.foreach { attempt => + logInfo(s"Deleting expired event log for ${attempt.logPath}") + val logPath = new Path(logDir, attempt.logPath) + listing.delete(classOf[LogInfo], logPath.toString()) + cleanAppData(app.id, attempt.info.attemptId, logPath.toString()) + deleteLog(logPath) + } + + if (remaining.isEmpty) { + listing.delete(app.getClass(), app.id) + } + } + + // Delete log files that don't have a valid application and exceed the configured max age. + val stale = listing.view(classOf[LogInfo]) + .index("lastProcessed") + .reverse() + .first(maxTime) + .asScala + .toList + stale.foreach { log => + if (log.appId.isEmpty) { + logInfo(s"Deleting invalid / corrupt event log ${log.logPath}") + deleteLog(new Path(log.logPath)) + listing.delete(classOf[LogInfo], log.logPath) } - } catch { - case t: Exception => logError("Exception while cleaning logs", t) - } finally { - iterator.foreach(_.close()) } } @@ -631,12 +713,9 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // an error the other way -- if we report a size bigger (ie later) than the file that is // actually read, we may never refresh the app. FileStatus is guaranteed to be static // after it's created, so we get a file size that is no bigger than what is actually read. - val logInput = EventLoggingListener.openEventLog(logPath, fs) - try { - bus.replay(logInput, logPath.toString, !isCompleted, eventsFilter) + Utils.tryWithResource(EventLoggingListener.openEventLog(logPath, fs)) { in => + bus.replay(in, logPath.toString, !isCompleted, eventsFilter) logInfo(s"Finished parsing $logPath") - } finally { - logInput.close() } } @@ -703,18 +782,6 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) | application count=$count}""".stripMargin } - /** - * Return the last known size of the given event log, recorded the last time the file - * system scanner detected a change in the file. 
- */ - private def recordedFileSize(log: Path): Long = { - try { - listing.read(classOf[LogInfo], log.toString()).fileSize - } catch { - case _: NoSuchElementException => 0L - } - } - private def load(appId: String): ApplicationInfoWrapper = { listing.read(classOf[ApplicationInfoWrapper], appId) } @@ -773,11 +840,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) logInfo(s"Leasing disk manager space for app $appId / ${attempt.info.attemptId}...") val lease = dm.lease(status.getLen(), isCompressed) val newStorePath = try { - val store = KVUtils.open(lease.tmpPath, metadata) - try { + Utils.tryWithResource(KVUtils.open(lease.tmpPath, metadata)) { store => rebuildAppStore(store, status, attempt.info.lastUpdated.getTime()) - } finally { - store.close() } lease.commit(appId, attempt.info.attemptId) } catch { @@ -806,6 +870,17 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) throw new NoSuchElementException(s"Cannot find attempt $attemptId of $appId.")) } + private def deleteLog(log: Path): Unit = { + try { + fs.delete(log, true) + } catch { + case _: AccessControlException => + logInfo(s"No permission to delete $log, ignoring.") + case ioe: IOException => + logError(s"IOException in cleaning $log", ioe) + } + } + } private[history] object FsHistoryProvider { @@ -832,8 +907,16 @@ private[history] case class FsHistoryProviderMetadata( uiVersion: Long, logDir: String) +/** + * Tracking info for event logs detected in the configured log directory. Tracks both valid and + * invalid logs (e.g. unparseable logs, recorded as logs with no app ID) so that the cleaner + * can know what log files are safe to delete. + */ private[history] case class LogInfo( @KVIndexParam logPath: String, + @KVIndexParam("lastProcessed") lastProcessed: Long, + appId: Option[String], + attemptId: Option[String], fileSize: Long) private[history] class AttemptInfoWrapper( diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala index 84ee01c7f5aaf..787de59edf465 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala @@ -31,7 +31,7 @@ import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.hdfs.DistributedFileSystem import org.json4s.jackson.JsonMethods._ import org.mockito.Matchers.any -import org.mockito.Mockito.{mock, spy, verify} +import org.mockito.Mockito.{doReturn, mock, spy, verify} import org.scalatest.BeforeAndAfter import org.scalatest.Matchers import org.scalatest.concurrent.Eventually._ @@ -149,8 +149,10 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc class TestFsHistoryProvider extends FsHistoryProvider(createTestConf()) { var mergeApplicationListingCall = 0 - override protected def mergeApplicationListing(fileStatus: FileStatus): Unit = { - super.mergeApplicationListing(fileStatus) + override protected def mergeApplicationListing( + fileStatus: FileStatus, + lastSeen: Long): Unit = { + super.mergeApplicationListing(fileStatus, lastSeen) mergeApplicationListingCall += 1 } } @@ -663,6 +665,115 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc freshUI.get.ui.store.job(0) } + test("clean up stale app information") { + val storeDir = Utils.createTempDir() + val conf = createTestConf().set(LOCAL_STORE_DIR, storeDir.getAbsolutePath()) + val 
provider = spy(new FsHistoryProvider(conf)) + val appId = "new1" + + // Write logs for two app attempts. + doReturn(1L).when(provider).getNewLastScanTime() + val attempt1 = newLogFile(appId, Some("1"), inProgress = false) + writeFile(attempt1, true, None, + SparkListenerApplicationStart(appId, Some(appId), 1L, "test", Some("1")), + SparkListenerJobStart(0, 1L, Nil, null), + SparkListenerApplicationEnd(5L) + ) + val attempt2 = newLogFile(appId, Some("2"), inProgress = false) + writeFile(attempt2, true, None, + SparkListenerApplicationStart(appId, Some(appId), 1L, "test", Some("2")), + SparkListenerJobStart(0, 1L, Nil, null), + SparkListenerApplicationEnd(5L) + ) + updateAndCheck(provider) { list => + assert(list.size === 1) + assert(list(0).id === appId) + assert(list(0).attempts.size === 2) + } + + // Load the app's UI. + val ui = provider.getAppUI(appId, Some("1")) + assert(ui.isDefined) + + // Delete the underlying log file for attempt 1 and rescan. The UI should go away, but since + // attempt 2 still exists, listing data should be there. + doReturn(2L).when(provider).getNewLastScanTime() + attempt1.delete() + updateAndCheck(provider) { list => + assert(list.size === 1) + assert(list(0).id === appId) + assert(list(0).attempts.size === 1) + } + assert(!ui.get.valid) + assert(provider.getAppUI(appId, None) === None) + + // Delete the second attempt's log file. Now everything should go away. + doReturn(3L).when(provider).getNewLastScanTime() + attempt2.delete() + updateAndCheck(provider) { list => + assert(list.isEmpty) + } + } + + test("SPARK-21571: clean up removes invalid history files") { + // TODO: "maxTime" becoming negative in cleanLogs() causes this test to fail, so avoid that + // until we figure out what's causing the problem. + val clock = new ManualClock(TimeUnit.DAYS.toMillis(120)) + val conf = createTestConf().set(MAX_LOG_AGE_S.key, s"2d") + val provider = new FsHistoryProvider(conf, clock) { + override def getNewLastScanTime(): Long = clock.getTimeMillis() + } + + // Create 0-byte size inprogress and complete files + var logCount = 0 + var validLogCount = 0 + + val emptyInProgress = newLogFile("emptyInprogressLogFile", None, inProgress = true) + emptyInProgress.createNewFile() + emptyInProgress.setLastModified(clock.getTimeMillis()) + logCount += 1 + + val slowApp = newLogFile("slowApp", None, inProgress = true) + slowApp.createNewFile() + slowApp.setLastModified(clock.getTimeMillis()) + logCount += 1 + + val emptyFinished = newLogFile("emptyFinishedLogFile", None, inProgress = false) + emptyFinished.createNewFile() + emptyFinished.setLastModified(clock.getTimeMillis()) + logCount += 1 + + // Create an incomplete log file, has an end record but no start record. + val corrupt = newLogFile("nonEmptyCorruptLogFile", None, inProgress = false) + writeFile(corrupt, true, None, SparkListenerApplicationEnd(0)) + corrupt.setLastModified(clock.getTimeMillis()) + logCount += 1 + + provider.checkForLogs() + provider.cleanLogs() + assert(new File(testDir.toURI).listFiles().size === logCount) + + // Move the clock forward 1 day and scan the files again. They should still be there. + clock.advance(TimeUnit.DAYS.toMillis(1)) + provider.checkForLogs() + provider.cleanLogs() + assert(new File(testDir.toURI).listFiles().size === logCount) + + // Update the slow app to contain valid info. Code should detect the change and not clean + // it up. 
+ writeFile(slowApp, true, None, + SparkListenerApplicationStart(slowApp.getName(), Some(slowApp.getName()), 1L, "test", None)) + slowApp.setLastModified(clock.getTimeMillis()) + validLogCount += 1 + + // Move the clock forward another 2 days and scan the files again. This time the cleaner should + // pick up the invalid files and get rid of them. + clock.advance(TimeUnit.DAYS.toMillis(2)) + provider.checkForLogs() + provider.cleanLogs() + assert(new File(testDir.toURI).listFiles().size === validLogCount) + } + /** * Asks the provider to check for logs and calls a function to perform checks on the updated * app list. Example: diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala index 3738f85da5831..6856cfe12bae3 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala @@ -48,7 +48,7 @@ import org.apache.spark.deploy.history.config._ import org.apache.spark.status.api.v1.ApplicationInfo import org.apache.spark.status.api.v1.JobData import org.apache.spark.ui.SparkUI -import org.apache.spark.util.{ResetSystemProperties, Utils} +import org.apache.spark.util.{ResetSystemProperties, ShutdownHookManager, Utils} /** * A collection of tests against the historyserver, including comparing responses from the json @@ -564,7 +564,7 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers assert(jobcount === getNumJobs("/jobs")) // no need to retain the test dir now the tests complete - logDir.deleteOnExit() + ShutdownHookManager.registerShutdownDeleteDir(logDir) } test("ui and api authorization checks") { From d0cb19873bb325be7e31de62b0ba117dd6b92619 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 19 Jan 2018 13:32:20 -0600 Subject: [PATCH 144/566] [SPARK-23103][CORE] Ensure correct sort order for negative values in LevelDB. The code was sorting "0" as "less than" negative values, which is a little wrong. Fix is simple, most of the changes are the added test and related cleanup. Author: Marcelo Vanzin Closes #20284 from vanzin/SPARK-23103. (cherry picked from commit aa3a1276f9e23ffbb093d00743e63cd4369f9f57) Signed-off-by: Imran Rashid --- .../spark/util/kvstore/LevelDBTypeInfo.java | 2 +- .../spark/util/kvstore/DBIteratorSuite.java | 7 +- .../spark/util/kvstore/LevelDBSuite.java | 77 ++++++++++--------- .../spark/status/AppStatusListenerSuite.scala | 8 +- 4 files changed, 52 insertions(+), 42 deletions(-) diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java index 232ee41dd0b1f..f4d359234cb9e 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java @@ -493,7 +493,7 @@ byte[] toKey(Object value, byte prefix) { byte[] key = new byte[bytes * 2 + 2]; long longValue = ((Number) value).longValue(); key[0] = prefix; - key[1] = longValue > 0 ? POSITIVE_MARKER : NEGATIVE_MARKER; + key[1] = longValue >= 0 ? 
POSITIVE_MARKER : NEGATIVE_MARKER; for (int i = 0; i < key.length - 2; i++) { int masked = (int) ((longValue >>> (4 * i)) & 0xF); diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/DBIteratorSuite.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/DBIteratorSuite.java index 9a81f86812cde..1e062437d1803 100644 --- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/DBIteratorSuite.java +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/DBIteratorSuite.java @@ -73,7 +73,9 @@ default BaseComparator reverse() { private static final BaseComparator NATURAL_ORDER = (t1, t2) -> t1.key.compareTo(t2.key); private static final BaseComparator REF_INDEX_ORDER = (t1, t2) -> t1.id.compareTo(t2.id); private static final BaseComparator COPY_INDEX_ORDER = (t1, t2) -> t1.name.compareTo(t2.name); - private static final BaseComparator NUMERIC_INDEX_ORDER = (t1, t2) -> t1.num - t2.num; + private static final BaseComparator NUMERIC_INDEX_ORDER = (t1, t2) -> { + return Integer.valueOf(t1.num).compareTo(t2.num); + }; private static final BaseComparator CHILD_INDEX_ORDER = (t1, t2) -> t1.child.compareTo(t2.child); /** @@ -112,7 +114,8 @@ public void setup() throws Exception { t.key = "key" + i; t.id = "id" + i; t.name = "name" + RND.nextInt(MAX_ENTRIES); - t.num = RND.nextInt(MAX_ENTRIES); + // Force one item to have an integer value of zero to test the fix for SPARK-23103. + t.num = (i != 0) ? (int) RND.nextLong() : 0; t.child = "child" + (i % MIN_ENTRIES); allEntries.add(t); } diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBSuite.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBSuite.java index 2b07d249d2022..b8123ac81d29a 100644 --- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBSuite.java +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBSuite.java @@ -21,6 +21,8 @@ import java.util.Arrays; import java.util.List; import java.util.NoSuchElementException; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; import org.apache.commons.io.FileUtils; import org.iq80.leveldb.DBIterator; @@ -74,11 +76,7 @@ public void testReopenAndVersionCheckDb() throws Exception { @Test public void testObjectWriteReadDelete() throws Exception { - CustomType1 t = new CustomType1(); - t.key = "key"; - t.id = "id"; - t.name = "name"; - t.child = "child"; + CustomType1 t = createCustomType1(1); try { db.read(CustomType1.class, t.key); @@ -106,17 +104,9 @@ public void testObjectWriteReadDelete() throws Exception { @Test public void testMultipleObjectWriteReadDelete() throws Exception { - CustomType1 t1 = new CustomType1(); - t1.key = "key1"; - t1.id = "id"; - t1.name = "name1"; - t1.child = "child1"; - - CustomType1 t2 = new CustomType1(); - t2.key = "key2"; - t2.id = "id"; - t2.name = "name2"; - t2.child = "child2"; + CustomType1 t1 = createCustomType1(1); + CustomType1 t2 = createCustomType1(2); + t2.id = t1.id; db.write(t1); db.write(t2); @@ -142,11 +132,7 @@ public void testMultipleObjectWriteReadDelete() throws Exception { @Test public void testMultipleTypesWriteReadDelete() throws Exception { - CustomType1 t1 = new CustomType1(); - t1.key = "1"; - t1.id = "id"; - t1.name = "name1"; - t1.child = "child1"; + CustomType1 t1 = createCustomType1(1); IntKeyType t2 = new IntKeyType(); t2.key = 2; @@ -188,10 +174,7 @@ public void testMultipleTypesWriteReadDelete() throws Exception { public void testMetadata() throws Exception { 
assertNull(db.getMetadata(CustomType1.class)); - CustomType1 t = new CustomType1(); - t.id = "id"; - t.name = "name"; - t.child = "child"; + CustomType1 t = createCustomType1(1); db.setMetadata(t); assertEquals(t, db.getMetadata(CustomType1.class)); @@ -202,11 +185,7 @@ public void testMetadata() throws Exception { @Test public void testUpdate() throws Exception { - CustomType1 t = new CustomType1(); - t.key = "key"; - t.id = "id"; - t.name = "name"; - t.child = "child"; + CustomType1 t = createCustomType1(1); db.write(t); @@ -222,13 +201,7 @@ public void testUpdate() throws Exception { @Test public void testSkip() throws Exception { for (int i = 0; i < 10; i++) { - CustomType1 t = new CustomType1(); - t.key = "key" + i; - t.id = "id" + i; - t.name = "name" + i; - t.child = "child" + i; - - db.write(t); + db.write(createCustomType1(i)); } KVStoreIterator it = db.view(CustomType1.class).closeableIterator(); @@ -240,6 +213,36 @@ public void testSkip() throws Exception { assertFalse(it.hasNext()); } + @Test + public void testNegativeIndexValues() throws Exception { + List expected = Arrays.asList(-100, -50, 0, 50, 100); + + expected.stream().forEach(i -> { + try { + db.write(createCustomType1(i)); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + + List results = StreamSupport + .stream(db.view(CustomType1.class).index("int").spliterator(), false) + .map(e -> e.num) + .collect(Collectors.toList()); + + assertEquals(expected, results); + } + + private CustomType1 createCustomType1(int i) { + CustomType1 t = new CustomType1(); + t.key = "key" + i; + t.id = "id" + i; + t.name = "name" + i; + t.num = i; + t.child = "child" + i; + return t; + } + private int countKeys(Class type) throws Exception { byte[] prefix = db.getTypeInfo(type).keyPrefix(); int count = 0; diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index ca66b6b9db890..e7981bec6d64b 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -894,15 +894,19 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val dropped = stages.drop(1).head // Cache some quantiles by calling AppStatusStore.taskSummary(). For quantiles to be - // calculcated, we need at least one finished task. + // calculated, we need at least one finished task. The code in AppStatusStore uses + // `executorRunTime` to detect valid tasks, so that metric needs to be updated in the + // task end event. time += 1 val task = createTasks(1, Array("1")).head listener.onTaskStart(SparkListenerTaskStart(dropped.stageId, dropped.attemptId, task)) time += 1 task.markFinished(TaskState.FINISHED, time) + val metrics = TaskMetrics.empty + metrics.setExecutorRunTime(42L) listener.onTaskEnd(SparkListenerTaskEnd(dropped.stageId, dropped.attemptId, - "taskType", Success, task, null)) + "taskType", Success, task, metrics)) new AppStatusStore(store) .taskSummary(dropped.stageId, dropped.attemptId, Array(0.25d, 0.50d, 0.75d)) From f9ad00a5aeeecf4b8d261a0dae6c8cb6be8daa67 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 19 Jan 2018 13:14:24 -0800 Subject: [PATCH 145/566] [SPARK-23135][UI] Fix rendering of accumulators in the stage page. This follows the behavior of 2.2: only named accumulators with a value are rendered. 
Screenshot: ![accs](https://user-images.githubusercontent.com/1694083/35065700-df409114-fb82-11e7-87c1-550c3f674371.png) Author: Marcelo Vanzin Closes #20299 from vanzin/SPARK-23135. (cherry picked from commit f6da41b0150725fe96ccb2ee3b48840b207f47eb) Signed-off-by: Sameer Agarwal --- .../org/apache/spark/ui/jobs/StagePage.scala | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index af78373ddb4b2..38f7b352114fa 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -260,7 +260,11 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val accumulableHeaders: Seq[String] = Seq("Accumulable", "Value") def accumulableRow(acc: AccumulableInfo): Seq[Node] = { - {acc.name}{acc.value} + if (acc.name != null && acc.value != null) { + {acc.name}{acc.value} + } else { + Nil + } } val accumulableTable = UIUtils.listingTable( accumulableHeaders, @@ -856,7 +860,7 @@ private[ui] class TaskPagedTable( {formatBytes(task.taskMetrics.map(_.peakExecutionMemory))} {if (hasAccumulators(stage)) { - accumulatorsInfo(task) + {accumulatorsInfo(task)} }} {if (hasInput(stage)) { metricInfo(task) { m => @@ -912,8 +916,12 @@ private[ui] class TaskPagedTable( } private def accumulatorsInfo(task: TaskData): Seq[Node] = { - task.accumulatorUpdates.map { acc => - Unparsed(StringEscapeUtils.escapeHtml4(s"${acc.name}: ${acc.update}")) + task.accumulatorUpdates.flatMap { acc => + if (acc.name != null && acc.update.isDefined) { + Unparsed(StringEscapeUtils.escapeHtml4(s"${acc.name}: ${acc.update.get}")) ++
+ } else { + Nil + } } } @@ -977,7 +985,9 @@ private object ApiHelper { "Shuffle Spill (Disk)" -> TaskIndexNames.DISK_SPILL, "Errors" -> TaskIndexNames.ERROR) - def hasAccumulators(stageData: StageData): Boolean = stageData.accumulatorUpdates.size > 0 + def hasAccumulators(stageData: StageData): Boolean = { + stageData.accumulatorUpdates.exists { acc => acc.name != null && acc.value != null } + } def hasInput(stageData: StageData): Boolean = stageData.inputBytes > 0 From c647f918b1aee27d7a53852aca74629f03ad49f6 Mon Sep 17 00:00:00 2001 From: Kent Yao <11215016@zju.edu.cn> Date: Fri, 19 Jan 2018 15:49:29 -0800 Subject: [PATCH 146/566] [SPARK-21771][SQL] remove useless hive client in SparkSQLEnv ## What changes were proposed in this pull request? Once a meta hive client is created, it generates its SessionState which creates a lot of session related directories, some deleteOnExit, some does not. if a hive client is useless we may not create it at the very start. ## How was this patch tested? N/A cc hvanhovell cloud-fan Author: Kent Yao <11215016@zju.edu.cn> Closes #18983 from yaooqinn/patch-1. (cherry picked from commit 793841c6b8b98b918dcf241e29f60ef125914db9) Signed-off-by: gatorsmile --- .../org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala index 6b19f971b73bb..cbd75ad12d430 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala @@ -50,8 +50,7 @@ private[hive] object SparkSQLEnv extends Logging { sqlContext = sparkSession.sqlContext val metadataHive = sparkSession - .sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog] - .client.newSession() + .sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client metadataHive.setOut(new PrintStream(System.out, true, "UTF-8")) metadataHive.setInfo(new PrintStream(System.err, true, "UTF-8")) metadataHive.setError(new PrintStream(System.err, true, "UTF-8")) From 0cde5212a80b5572bfe53b06ed557e6c2ec8c903 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Fri, 19 Jan 2018 22:46:34 -0800 Subject: [PATCH 147/566] [SPARK-23091][ML] Incorrect unit test for approxQuantile ## What changes were proposed in this pull request? Narrow bound on approx quantile test to epsilon from 2*epsilon to match paper ## How was this patch tested? Existing tests. Author: Sean Owen Closes #20324 from srowen/SPARK-23091. 
(cherry picked from commit 396cdfbea45232bacbc03bfaf8be4ea85d47d3fd) Signed-off-by: gatorsmile --- .../apache/spark/sql/DataFrameStatSuite.scala | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 5169d2b5fc6b2..8eae35325faea 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -154,24 +154,24 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { val Array(d1, d2) = df.stat.approxQuantile("doubles", Array(q1, q2), epsilon) val Array(s1, s2) = df.stat.approxQuantile("singles", Array(q1, q2), epsilon) - val error_single = 2 * 1000 * epsilon - val error_double = 2 * 2000 * epsilon + val errorSingle = 1000 * epsilon + val errorDouble = 2.0 * errorSingle - assert(math.abs(single1 - q1 * n) < error_single) - assert(math.abs(double2 - 2 * q2 * n) < error_double) - assert(math.abs(s1 - q1 * n) < error_single) - assert(math.abs(s2 - q2 * n) < error_single) - assert(math.abs(d1 - 2 * q1 * n) < error_double) - assert(math.abs(d2 - 2 * q2 * n) < error_double) + assert(math.abs(single1 - q1 * n) <= errorSingle) + assert(math.abs(double2 - 2 * q2 * n) <= errorDouble) + assert(math.abs(s1 - q1 * n) <= errorSingle) + assert(math.abs(s2 - q2 * n) <= errorSingle) + assert(math.abs(d1 - 2 * q1 * n) <= errorDouble) + assert(math.abs(d2 - 2 * q2 * n) <= errorDouble) // Multiple columns val Array(Array(ms1, ms2), Array(md1, md2)) = df.stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2), epsilon) - assert(math.abs(ms1 - q1 * n) < error_single) - assert(math.abs(ms2 - q2 * n) < error_single) - assert(math.abs(md1 - 2 * q1 * n) < error_double) - assert(math.abs(md2 - 2 * q2 * n) < error_double) + assert(math.abs(ms1 - q1 * n) <= errorSingle) + assert(math.abs(ms2 - q2 * n) <= errorSingle) + assert(math.abs(md1 - 2 * q1 * n) <= errorDouble) + assert(math.abs(md2 - 2 * q2 * n) <= errorDouble) } // quantile should be in the range [0.0, 1.0] From e11d5eaf79ffccbe3a5444a5b9ecf3a203e1fc90 Mon Sep 17 00:00:00 2001 From: Shashwat Anand Date: Sat, 20 Jan 2018 14:34:37 -0800 Subject: [PATCH 148/566] [SPARK-23165][DOC] Spelling mistake fix in quick-start doc. ## What changes were proposed in this pull request? Fix spelling in quick-start doc. ## How was this patch tested? Doc only. Author: Shashwat Anand Closes #20336 from ashashwat/SPARK-23165. 
(cherry picked from commit 84a076e0e9a38a26edf7b702c24fdbbcf1e697b9) Signed-off-by: gatorsmile --- docs/cloud-integration.md | 4 ++-- docs/configuration.md | 14 +++++++------- docs/graphx-programming-guide.md | 4 ++-- docs/monitoring.md | 8 ++++---- docs/quick-start.md | 6 +++--- docs/running-on-mesos.md | 2 +- docs/running-on-yarn.md | 2 +- docs/security.md | 2 +- docs/sql-programming-guide.md | 8 ++++---- docs/storage-openstack-swift.md | 2 +- docs/streaming-programming-guide.md | 4 ++-- docs/structured-streaming-kafka-integration.md | 4 ++-- docs/structured-streaming-programming-guide.md | 6 +++--- docs/submitting-applications.md | 8 ++++---- 14 files changed, 37 insertions(+), 37 deletions(-) diff --git a/docs/cloud-integration.md b/docs/cloud-integration.md index 751a192da4ffd..c150d9efc06ff 100644 --- a/docs/cloud-integration.md +++ b/docs/cloud-integration.md @@ -180,10 +180,10 @@ under the path, not the number of *new* files, so it can become a slow operation The size of the window needs to be set to handle this. 1. Files only appear in an object store once they are completely written; there -is no need for a worklow of write-then-rename to ensure that files aren't picked up +is no need for a workflow of write-then-rename to ensure that files aren't picked up while they are still being written. Applications can write straight to the monitored directory. -1. Streams should only be checkpointed to an store implementing a fast and +1. Streams should only be checkpointed to a store implementing a fast and atomic `rename()` operation Otherwise the checkpointing may be slow and potentially unreliable. ## Further Reading diff --git a/docs/configuration.md b/docs/configuration.md index eecb39dcafc9e..e7f2419cc2fa4 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -79,7 +79,7 @@ Then, you can supply configuration values at runtime: {% endhighlight %} The Spark shell and [`spark-submit`](submitting-applications.html) -tool support two ways to load configurations dynamically. The first are command line options, +tool support two ways to load configurations dynamically. The first is command line options, such as `--master`, as shown above. `spark-submit` can accept any Spark property using the `--conf` flag, but uses special flags for properties that play a part in launching the Spark application. Running `./bin/spark-submit --help` will show the entire list of these options. @@ -413,7 +413,7 @@ Apart from these, the following properties are also available, and may be useful false Enable profiling in Python worker, the profile result will show up by sc.show_profiles(), - or it will be displayed before the driver exiting. It also can be dumped into disk by + or it will be displayed before the driver exits. It also can be dumped into disk by sc.dump_profiles(path). If some of the profile results had been displayed manually, they will not be displayed automatically before driver exiting. @@ -446,7 +446,7 @@ Apart from these, the following properties are also available, and may be useful true Reuse Python worker or not. If yes, it will use a fixed number of Python workers, - does not need to fork() a Python process for every tasks. It will be very useful + does not need to fork() a Python process for every task. It will be very useful if there is large broadcast, then the broadcast will not be needed to transferred from JVM to Python worker for every task. 
@@ -1294,7 +1294,7 @@ Apart from these, the following properties are also available, and may be useful spark.files.openCostInBytes 4194304 (4 MB) - The estimated cost to open a file, measured by the number of bytes could be scanned in the same + The estimated cost to open a file, measured by the number of bytes could be scanned at the same time. This is used when putting multiple files into a partition. It is better to over estimate, then the partitions with small files will be faster than partitions with bigger files. @@ -1855,8 +1855,8 @@ Apart from these, the following properties are also available, and may be useful spark.user.groups.mapping org.apache.spark.security.ShellBasedGroupsMappingProvider - The list of groups for a user are determined by a group mapping service defined by the trait - org.apache.spark.security.GroupMappingServiceProvider which can configured by this property. + The list of groups for a user is determined by a group mapping service defined by the trait + org.apache.spark.security.GroupMappingServiceProvider which can be configured by this property. A default unix shell based implementation is provided org.apache.spark.security.ShellBasedGroupsMappingProvider which can be specified to resolve a list of groups for a user. Note: This implementation supports only a Unix/Linux based environment. Windows environment is @@ -2465,7 +2465,7 @@ should be included on Spark's classpath: The location of these configuration files varies across Hadoop versions, but a common location is inside of `/etc/hadoop/conf`. Some tools create -configurations on-the-fly, but offer a mechanisms to download copies of them. +configurations on-the-fly, but offer a mechanism to download copies of them. To make these files visible to Spark, set `HADOOP_CONF_DIR` in `$SPARK_HOME/conf/spark-env.sh` to a location containing the configuration files. diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 46225dc598da8..5c97a248df4bc 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -708,7 +708,7 @@ messages remaining. > messaging function. These constraints allow additional optimization within GraphX. The following is the type signature of the [Pregel operator][GraphOps.pregel] as well as a *sketch* -of its implementation (note: to avoid stackOverflowError due to long lineage chains, pregel support periodcally +of its implementation (note: to avoid stackOverflowError due to long lineage chains, pregel support periodically checkpoint graph and messages by setting "spark.graphx.pregel.checkpointInterval" to a positive number, say 10. And set checkpoint directory as well using SparkContext.setCheckpointDir(directory: String)): @@ -928,7 +928,7 @@ switch to 2D-partitioning or other heuristics included in GraphX.

-Once the edges have be partitioned the key challenge to efficient graph-parallel computation is +Once the edges have been partitioned the key challenge to efficient graph-parallel computation is efficiently joining vertex attributes with the edges. Because real-world graphs typically have more edges than vertices, we move vertex attributes to the edges. Because not all partitions will contain edges adjacent to all vertices we internally maintain a routing table which identifies where diff --git a/docs/monitoring.md b/docs/monitoring.md index f8d3ce91a0691..6f6cfc1288d73 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -118,7 +118,7 @@ The history server can be configured as follows: The number of applications to retain UI data for in the cache. If this cap is exceeded, then the oldest applications will be removed from the cache. If an application is not in the cache, - it will have to be loaded from disk if its accessed from the UI. + it will have to be loaded from disk if it is accessed from the UI. @@ -407,7 +407,7 @@ can be identified by their `[attempt-id]`. In the API listed below, when running -The number of jobs and stages which can retrieved is constrained by the same retention +The number of jobs and stages which can be retrieved is constrained by the same retention mechanism of the standalone Spark UI; `"spark.ui.retainedJobs"` defines the threshold value triggering garbage collection on jobs, and `spark.ui.retainedStages` that for stages. Note that the garbage collection takes place on playback: it is possible to retrieve @@ -422,10 +422,10 @@ These endpoints have been strongly versioned to make it easier to develop applic * Individual fields will never be removed for any given endpoint * New endpoints may be added * New fields may be added to existing endpoints -* New versions of the api may be added in the future at a separate endpoint (eg., `api/v2`). New versions are *not* required to be backwards compatible. +* New versions of the api may be added in the future as a separate endpoint (eg., `api/v2`). New versions are *not* required to be backwards compatible. * Api versions may be dropped, but only after at least one minor release of co-existing with a new api version. -Note that even when examining the UI of a running applications, the `applications/[app-id]` portion is +Note that even when examining the UI of running applications, the `applications/[app-id]` portion is still required, though there is only one application available. Eg. to see the list of jobs for the running app, you would go to `http://localhost:4040/api/v1/applications/[app-id]/jobs`. This is to keep the paths consistent in both modes. diff --git a/docs/quick-start.md b/docs/quick-start.md index 200b97230e866..07c520cbee6be 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -67,7 +67,7 @@ res3: Long = 15 ./bin/pyspark -Or if PySpark is installed with pip in your current enviroment: +Or if PySpark is installed with pip in your current environment: pyspark @@ -156,7 +156,7 @@ One common data flow pattern is MapReduce, as popularized by Hadoop. Spark can i >>> wordCounts = textFile.select(explode(split(textFile.value, "\s+")).alias("word")).groupBy("word").count() {% endhighlight %} -Here, we use the `explode` function in `select`, to transfrom a Dataset of lines to a Dataset of words, and then combine `groupBy` and `count` to compute the per-word counts in the file as a DataFrame of 2 columns: "word" and "count". 
To collect the word counts in our shell, we can call `collect`: +Here, we use the `explode` function in `select`, to transform a Dataset of lines to a Dataset of words, and then combine `groupBy` and `count` to compute the per-word counts in the file as a DataFrame of 2 columns: "word" and "count". To collect the word counts in our shell, we can call `collect`: {% highlight python %} >>> wordCounts.collect() @@ -422,7 +422,7 @@ $ YOUR_SPARK_HOME/bin/spark-submit \ Lines with a: 46, Lines with b: 23 {% endhighlight %} -If you have PySpark pip installed into your enviroment (e.g., `pip install pyspark`), you can run your application with the regular Python interpreter or use the provided 'spark-submit' as you prefer. +If you have PySpark pip installed into your environment (e.g., `pip install pyspark`), you can run your application with the regular Python interpreter or use the provided 'spark-submit' as you prefer. {% highlight bash %} # Use the Python interpreter to run your application diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 382cbfd5301b0..2bb5ecf1b8509 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -154,7 +154,7 @@ can find the results of the driver from the Mesos Web UI. To use cluster mode, you must start the `MesosClusterDispatcher` in your cluster via the `sbin/start-mesos-dispatcher.sh` script, passing in the Mesos master URL (e.g: mesos://host:5050). This starts the `MesosClusterDispatcher` as a daemon running on the host. -By setting the Mesos proxy config property (requires mesos version >= 1.4), `--conf spark.mesos.proxy.baseURL=http://localhost:5050` when launching the dispacther, the mesos sandbox URI for each driver is added to the mesos dispatcher UI. +By setting the Mesos proxy config property (requires mesos version >= 1.4), `--conf spark.mesos.proxy.baseURL=http://localhost:5050` when launching the dispatcher, the mesos sandbox URI for each driver is added to the mesos dispatcher UI. If you like to run the `MesosClusterDispatcher` with Marathon, you need to run the `MesosClusterDispatcher` in the foreground (i.e: `bin/spark-class org.apache.spark.deploy.mesos.MesosClusterDispatcher`). Note that the `MesosClusterDispatcher` not yet supports multiple instances for HA. diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index e7edec5990363..e4f5a0c659e66 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -445,7 +445,7 @@ To use a custom metrics.properties for the application master and executors, upd yarn.nodemanager.log-aggregation.roll-monitoring-interval-seconds should be configured in yarn-site.xml. This feature can only be used with Hadoop 2.6.4+. The Spark log4j appender needs be changed to use - FileAppender or another appender that can handle the files being removed while its running. Based + FileAppender or another appender that can handle the files being removed while it is running. Based on the file name configured in the log4j configuration (like spark.log), the user should set the regex (spark*) to include all the log files that need to be aggregated. diff --git a/docs/security.md b/docs/security.md index 15aadf07cf873..bebc28ddbfb0e 100644 --- a/docs/security.md +++ b/docs/security.md @@ -62,7 +62,7 @@ component-specific configuration namespaces used to override the default setting -The full breakdown of available SSL options can be found on the [configuration page](configuration.html). 
+The full breakdown of available SSL options can be found on the [configuration page](configuration.html). SSL must be configured on each node and configured for each component involved in communication using the particular protocol. ### YARN mode diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 3e2e48a0ef249..502c0a8c37e01 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1253,7 +1253,7 @@ provide a ClassTag. (Note that this is different than the Spark SQL JDBC server, which allows other applications to run queries using Spark SQL). -To get started you will need to include the JDBC driver for you particular database on the +To get started you will need to include the JDBC driver for your particular database on the spark classpath. For example, to connect to postgres from the Spark Shell you would run the following command: @@ -1793,7 +1793,7 @@ options. - Since Spark 2.3, when all inputs are binary, `functions.concat()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.concatBinaryAsString` to `true`. - Since Spark 2.3, when all inputs are binary, SQL `elt()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.eltOutputAsString` to `true`. - - Since Spark 2.3, by default arithmetic operations between decimals return a rounded value if an exact representation is not possible (instead of returning NULL). This is compliant to SQL ANSI 2011 specification and Hive's new behavior introduced in Hive 2.2 (HIVE-15331). This involves the following changes + - Since Spark 2.3, by default arithmetic operations between decimals return a rounded value if an exact representation is not possible (instead of returning NULL). This is compliant with SQL ANSI 2011 specification and Hive's new behavior introduced in Hive 2.2 (HIVE-15331). This involves the following changes - The rules to determine the result type of an arithmetic operation have been updated. In particular, if the precision / scale needed are out of the range of available values, the scale is reduced up to 6, in order to prevent the truncation of the integer part of the decimals. All the arithmetic operations are affected by the change, ie. addition (`+`), subtraction (`-`), multiplication (`*`), division (`/`), remainder (`%`) and positive module (`pmod`). - Literal values used in SQL operations are converted to DECIMAL with the exact precision and scale needed by them. - The configuration `spark.sql.decimalOperations.allowPrecisionLoss` has been introduced. It defaults to `true`, which means the new behavior described here; if set to `false`, Spark uses previous rules, ie. it doesn't adjust the needed scale to represent the values and it returns NULL if an exact representation of the value is not possible. @@ -1821,7 +1821,7 @@ options. transformations (e.g., `map`, `filter`, and `groupByKey`) and untyped transformations (e.g., `select` and `groupBy`) are available on the Dataset class. Since compile-time type-safety in Python and R is not a language feature, the concept of Dataset does not apply to these languages’ - APIs. Instead, `DataFrame` remains the primary programing abstraction, which is analogous to the + APIs. 
Instead, `DataFrame` remains the primary programming abstraction, which is analogous to the single-node data frame notion in these languages. - Dataset and DataFrame API `unionAll` has been deprecated and replaced by `union` @@ -1997,7 +1997,7 @@ Java and Python users will need to update their code. Prior to Spark 1.3 there were separate Java compatible classes (`JavaSQLContext` and `JavaSchemaRDD`) that mirrored the Scala API. In Spark 1.3 the Java API and Scala API have been unified. Users -of either language should use `SQLContext` and `DataFrame`. In general theses classes try to +of either language should use `SQLContext` and `DataFrame`. In general these classes try to use types that are usable from both languages (i.e. `Array` instead of language specific collections). In some cases where no common type exists (e.g., for passing in closures or Maps) function overloading is used instead. diff --git a/docs/storage-openstack-swift.md b/docs/storage-openstack-swift.md index f4bb2353e3c49..1dd54719b21aa 100644 --- a/docs/storage-openstack-swift.md +++ b/docs/storage-openstack-swift.md @@ -42,7 +42,7 @@ Create core-site.xml and place it inside Spark's conf The main category of parameters that should be configured are the authentication parameters required by Keystone. -The following table contains a list of Keystone mandatory parameters. PROVIDER can be +The following table contains a list of Keystone mandatory parameters. PROVIDER can be any (alphanumeric) name. diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index 868acc41226dc..ffda36d64a770 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -74,7 +74,7 @@ import org.apache.spark.streaming._ import org.apache.spark.streaming.StreamingContext._ // not necessary since Spark 1.3 // Create a local StreamingContext with two working thread and batch interval of 1 second. -// The master requires 2 cores to prevent from a starvation scenario. +// The master requires 2 cores to prevent a starvation scenario. val conf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(1)) @@ -172,7 +172,7 @@ each line will be split into multiple words and the stream of words is represent `words` DStream. Note that we defined the transformation using a [FlatMapFunction](api/scala/index.html#org.apache.spark.api.java.function.FlatMapFunction) object. As we will discover along the way, there are a number of such convenience classes in the Java API -that help define DStream transformations. +that help defines DStream transformations. Next, we want to count these words. diff --git a/docs/structured-streaming-kafka-integration.md b/docs/structured-streaming-kafka-integration.md index 461c29ce1ba89..5647ec6bc5797 100644 --- a/docs/structured-streaming-kafka-integration.md +++ b/docs/structured-streaming-kafka-integration.md @@ -125,7 +125,7 @@ df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") ### Creating a Kafka Source for Batch Queries If you have a use case that is better suited to batch processing, -you can create an Dataset/DataFrame for a defined range of offsets. +you can create a Dataset/DataFrame for a defined range of offsets.
@@ -597,7 +597,7 @@ Note that the following Kafka params cannot be set and the Kafka source or sink - **key.serializer**: Keys are always serialized with ByteArraySerializer or StringSerializer. Use DataFrame operations to explicitly serialize the keys into either strings or byte arrays. - **value.serializer**: values are always serialized with ByteArraySerializer or StringSerializer. Use -DataFrame oeprations to explicitly serialize the values into either strings or byte arrays. +DataFrame operations to explicitly serialize the values into either strings or byte arrays. - **enable.auto.commit**: Kafka source doesn't commit any offset. - **interceptor.classes**: Kafka source always read keys and values as byte arrays. It's not safe to use ConsumerInterceptor as it may break the query. diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 2ddba2f0d942e..2ef5d3168a87b 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -10,7 +10,7 @@ title: Structured Streaming Programming Guide # Overview Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. You can express your streaming computation the same way you would express a batch computation on static data. The Spark SQL engine will take care of running it incrementally and continuously and updating the final result as streaming data continues to arrive. You can use the [Dataset/DataFrame API](sql-programming-guide.html) in Scala, Java, Python or R to express streaming aggregations, event-time windows, stream-to-batch joins, etc. The computation is executed on the same optimized Spark SQL engine. Finally, the system ensures end-to-end exactly-once fault-tolerance guarantees through checkpointing and Write Ahead Logs. In short, *Structured Streaming provides fast, scalable, fault-tolerant, end-to-end exactly-once stream processing without the user having to reason about streaming.* -Internally, by default, Structured Streaming queries are processed using a *micro-batch processing* engine, which processes data streams as a series of small batch jobs thereby achieving end-to-end latencies as low as 100 milliseconds and exactly-once fault-tolerance guarantees. However, since Spark 2.3, we have introduced a new low-latency processing mode called **Continuous Processing**, which can achieve end-to-end latencies as low as 1 millisecond with at-least-once guarantees. Without changing the Dataset/DataFrame operations in your queries, you will be able choose the mode based on your application requirements. +Internally, by default, Structured Streaming queries are processed using a *micro-batch processing* engine, which processes data streams as a series of small batch jobs thereby achieving end-to-end latencies as low as 100 milliseconds and exactly-once fault-tolerance guarantees. However, since Spark 2.3, we have introduced a new low-latency processing mode called **Continuous Processing**, which can achieve end-to-end latencies as low as 1 millisecond with at-least-once guarantees. Without changing the Dataset/DataFrame operations in your queries, you will be able to choose the mode based on your application requirements. In this guide, we are going to walk you through the programming model and the APIs. 
We are going to explain the concepts mostly using the default micro-batch processing model, and then [later](#continuous-processing-experimental) discuss Continuous Processing model. First, let's start with a simple example of a Structured Streaming query - a streaming word count. @@ -1121,7 +1121,7 @@ Let’s discuss the different types of supported stream-stream joins and how to ##### Inner Joins with optional Watermarking Inner joins on any kind of columns along with any kind of join conditions are supported. However, as the stream runs, the size of streaming state will keep growing indefinitely as -*all* past input must be saved as the any new input can match with any input from the past. +*all* past input must be saved as any new input can match with any input from the past. To avoid unbounded state, you have to define additional join conditions such that indefinitely old inputs cannot match with future inputs and therefore can be cleared from the state. In other words, you will have to do the following additional steps in the join. @@ -1839,7 +1839,7 @@ aggDF \ .format("console") \ .start() -# Have all the aggregates in an in memory table. The query name will be the table name +# Have all the aggregates in an in-memory table. The query name will be the table name aggDF \ .writeStream \ .queryName("aggregates") \ diff --git a/docs/submitting-applications.md b/docs/submitting-applications.md index 0473ab73a5e6c..a3643bf0838a1 100644 --- a/docs/submitting-applications.md +++ b/docs/submitting-applications.md @@ -5,7 +5,7 @@ title: Submitting Applications The `spark-submit` script in Spark's `bin` directory is used to launch applications on a cluster. It can use all of Spark's supported [cluster managers](cluster-overview.html#cluster-manager-types) -through a uniform interface so you don't have to configure your application specially for each one. +through a uniform interface so you don't have to configure your application especially for each one. # Bundling Your Application's Dependencies If your code depends on other projects, you will need to package them alongside @@ -58,7 +58,7 @@ for applications that involve the REPL (e.g. Spark shell). Alternatively, if your application is submitted from a machine far from the worker machines (e.g. locally on your laptop), it is common to use `cluster` mode to minimize network latency between -the drivers and the executors. Currently, standalone mode does not support cluster mode for Python +the drivers and the executors. Currently, the standalone mode does not support cluster mode for Python applications. For Python applications, simply pass a `.py` file in the place of `` instead of a JAR, @@ -68,7 +68,7 @@ There are a few options available that are specific to the [cluster manager](cluster-overview.html#cluster-manager-types) that is being used. For example, with a [Spark standalone cluster](spark-standalone.html) with `cluster` deploy mode, you can also specify `--supervise` to make sure that the driver is automatically restarted if it -fails with non-zero exit code. To enumerate all such options available to `spark-submit`, +fails with a non-zero exit code. To enumerate all such options available to `spark-submit`, run it with `--help`. Here are a few examples of common options: {% highlight bash %} @@ -192,7 +192,7 @@ debugging information by running `spark-submit` with the `--verbose` option. 
# Advanced Dependency Management When using `spark-submit`, the application jar along with any jars included with the `--jars` option -will be automatically transferred to the cluster. URLs supplied after `--jars` must be separated by commas. That list is included on the driver and executor classpaths. Directory expansion does not work with `--jars`. +will be automatically transferred to the cluster. URLs supplied after `--jars` must be separated by commas. That list is included in the driver and executor classpaths. Directory expansion does not work with `--jars`. Spark uses the following URL scheme to allow different strategies for disseminating jars: From b9c1367b7d9240070c5d83572dc7b43c7480b456 Mon Sep 17 00:00:00 2001 From: fjh100456 Date: Sat, 20 Jan 2018 14:49:49 -0800 Subject: [PATCH 149/566] [SPARK-21786][SQL] The 'spark.sql.parquet.compression.codec' and 'spark.sql.orc.compression.codec' configuration doesn't take effect on hive table writing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [SPARK-21786][SQL] The 'spark.sql.parquet.compression.codec' and 'spark.sql.orc.compression.codec' configuration doesn't take effect on hive table writing What changes were proposed in this pull request? Pass ‘spark.sql.parquet.compression.codec’ value to ‘parquet.compression’. Pass ‘spark.sql.orc.compression.codec’ value to ‘orc.compress’. How was this patch tested? Add test. Note: This is the same issue mentioned in #19218 . That branch was deleted mistakenly, so make a new pr instead. gatorsmile maropu dongjoon-hyun discipleforteen Author: fjh100456 Author: Takeshi Yamamuro Author: Wenchen Fan Author: gatorsmile Author: Yinan Li Author: Marcelo Vanzin Author: Juliusz Sompolski Author: Felix Cheung Author: jerryshao Author: Li Jin Author: Gera Shegalov Author: chetkhatri Author: Joseph K. Bradley Author: Bago Amirbekian Author: Xianjin YE Author: Bruce Robbins Author: zuotingbing Author: Kent Yao Author: hyukjinkwon Author: Adrian Ionescu Closes #20087 from fjh100456/HiveTableWriting. 
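As a rough illustration of the behavior this patch enables — a hedged sketch, not part of the patch itself, assuming a Hive-enabled `SparkSession` named `spark` and a table that carries no table-level compression property (table name is made up):

```scala
// Hypothetical walk-through; `tbl_parquet` is an illustrative table name.
// With no table-level compression property, the session-level codec below
// should now determine the compression of the files written by the INSERT.
spark.conf.set("spark.sql.parquet.compression.codec", "gzip")

spark.sql("CREATE TABLE tbl_parquet (a INT) STORED AS PARQUET")
spark.sql("INSERT INTO TABLE tbl_parquet SELECT CAST(id AS INT) FROM range(10)")

// Before this change the Hive write path ignored the session-level setting;
// after it, the generated Parquet files should be GZIP-compressed.
```

The same flow applies to ORC via `spark.sql.orc.compression.codec` and the `orc.compress` table property.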
(cherry picked from commit 00d169156d4b1c91d2bcfd788b254b03c509dc41) Signed-off-by: gatorsmile --- .../datasources/orc/OrcOptions.scala | 2 + .../datasources/parquet/ParquetOptions.scala | 6 +- .../sql/hive/execution/HiveOptions.scala | 22 ++ .../sql/hive/execution/SaveAsHiveFile.scala | 20 +- .../sql/hive/CompressionCodecSuite.scala | 353 ++++++++++++++++++ 5 files changed, 397 insertions(+), 6 deletions(-) create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/CompressionCodecSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcOptions.scala index c866dd834a525..0ad3862f6cf01 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcOptions.scala @@ -67,4 +67,6 @@ object OrcOptions { "snappy" -> "SNAPPY", "zlib" -> "ZLIB", "lzo" -> "LZO") + + def getORCCompressionCodecName(name: String): String = shortOrcCompressionCodecNames(name) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala index ef67ea7d17cea..f36a89a4c3c5f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.internal.SQLConf /** * Options for the Parquet data source. */ -private[parquet] class ParquetOptions( +class ParquetOptions( @transient private val parameters: CaseInsensitiveMap[String], @transient private val sqlConf: SQLConf) extends Serializable { @@ -82,4 +82,8 @@ object ParquetOptions { "snappy" -> CompressionCodecName.SNAPPY, "gzip" -> CompressionCodecName.GZIP, "lzo" -> CompressionCodecName.LZO) + + def getParquetCompressionCodecName(name: String): String = { + shortParquetCompressionCodecNames(name).name() + } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveOptions.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveOptions.scala index 5c515515b9b9c..802ddafdbee4d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveOptions.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveOptions.scala @@ -19,7 +19,16 @@ package org.apache.spark.sql.hive.execution import java.util.Locale +import scala.collection.JavaConverters._ + +import org.apache.hadoop.hive.ql.plan.TableDesc +import org.apache.orc.OrcConf.COMPRESS +import org.apache.parquet.hadoop.ParquetOutputFormat + import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.execution.datasources.orc.OrcOptions +import org.apache.spark.sql.execution.datasources.parquet.ParquetOptions +import org.apache.spark.sql.internal.SQLConf /** * Options for the Hive data source. 
Note that rule `DetermineHiveSerde` will extract Hive @@ -102,4 +111,17 @@ object HiveOptions { "collectionDelim" -> "colelction.delim", "mapkeyDelim" -> "mapkey.delim", "lineDelim" -> "line.delim").map { case (k, v) => k.toLowerCase(Locale.ROOT) -> v } + + def getHiveWriteCompression(tableInfo: TableDesc, sqlConf: SQLConf): Option[(String, String)] = { + val tableProps = tableInfo.getProperties.asScala.toMap + tableInfo.getOutputFileFormatClassName.toLowerCase(Locale.ROOT) match { + case formatName if formatName.endsWith("parquetoutputformat") => + val compressionCodec = new ParquetOptions(tableProps, sqlConf).compressionCodecClassName + Option((ParquetOutputFormat.COMPRESSION, compressionCodec)) + case formatName if formatName.endsWith("orcoutputformat") => + val compressionCodec = new OrcOptions(tableProps, sqlConf).compressionCodec + Option((COMPRESS.getAttribute, compressionCodec)) + case _ => None + } + } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index 9a6607f2f2c6c..e484356906e87 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -55,18 +55,28 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { customPartitionLocations: Map[TablePartitionSpec, String] = Map.empty, partitionAttributes: Seq[Attribute] = Nil): Set[String] = { - val isCompressed = hadoopConf.get("hive.exec.compress.output", "false").toBoolean + val isCompressed = + fileSinkConf.getTableInfo.getOutputFileFormatClassName.toLowerCase(Locale.ROOT) match { + case formatName if formatName.endsWith("orcoutputformat") => + // For ORC,"mapreduce.output.fileoutputformat.compress", + // "mapreduce.output.fileoutputformat.compress.codec", and + // "mapreduce.output.fileoutputformat.compress.type" + // have no impact because it uses table properties to store compression information. + false + case _ => hadoopConf.get("hive.exec.compress.output", "false").toBoolean + } + if (isCompressed) { - // Please note that isCompressed, "mapreduce.output.fileoutputformat.compress", - // "mapreduce.output.fileoutputformat.compress.codec", and - // "mapreduce.output.fileoutputformat.compress.type" - // have no impact on ORC because it uses table properties to store compression information. hadoopConf.set("mapreduce.output.fileoutputformat.compress", "true") fileSinkConf.setCompressed(true) fileSinkConf.setCompressCodec(hadoopConf .get("mapreduce.output.fileoutputformat.compress.codec")) fileSinkConf.setCompressType(hadoopConf .get("mapreduce.output.fileoutputformat.compress.type")) + } else { + // Set compression by priority + HiveOptions.getHiveWriteCompression(fileSinkConf.getTableInfo, sparkSession.sessionState.conf) + .foreach { case (compression, codec) => hadoopConf.set(compression, codec) } } val committer = FileCommitProtocol.instantiate( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CompressionCodecSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CompressionCodecSuite.scala new file mode 100644 index 0000000000000..d10a6f25c64fc --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CompressionCodecSuite.scala @@ -0,0 +1,353 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive + +import java.io.File + +import scala.collection.JavaConverters._ + +import org.apache.hadoop.fs.Path +import org.apache.orc.OrcConf.COMPRESS +import org.apache.parquet.hadoop.ParquetOutputFormat +import org.scalatest.BeforeAndAfterAll + +import org.apache.spark.sql.execution.datasources.orc.OrcOptions +import org.apache.spark.sql.execution.datasources.parquet.{ParquetOptions, ParquetTest} +import org.apache.spark.sql.hive.orc.OrcFileOperator +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.SQLConf + +class CompressionCodecSuite extends TestHiveSingleton with ParquetTest with BeforeAndAfterAll { + import spark.implicits._ + + override def beforeAll(): Unit = { + super.beforeAll() + (0 until maxRecordNum).toDF("a").createOrReplaceTempView("table_source") + } + + override def afterAll(): Unit = { + try { + spark.catalog.dropTempView("table_source") + } finally { + super.afterAll() + } + } + + private val maxRecordNum = 50 + + private def getConvertMetastoreConfName(format: String): String = format.toLowerCase match { + case "parquet" => HiveUtils.CONVERT_METASTORE_PARQUET.key + case "orc" => HiveUtils.CONVERT_METASTORE_ORC.key + } + + private def getSparkCompressionConfName(format: String): String = format.toLowerCase match { + case "parquet" => SQLConf.PARQUET_COMPRESSION.key + case "orc" => SQLConf.ORC_COMPRESSION.key + } + + private def getHiveCompressPropName(format: String): String = format.toLowerCase match { + case "parquet" => ParquetOutputFormat.COMPRESSION + case "orc" => COMPRESS.getAttribute + } + + private def normalizeCodecName(format: String, name: String): String = { + format.toLowerCase match { + case "parquet" => ParquetOptions.getParquetCompressionCodecName(name) + case "orc" => OrcOptions.getORCCompressionCodecName(name) + } + } + + private def getTableCompressionCodec(path: String, format: String): Seq[String] = { + val hadoopConf = spark.sessionState.newHadoopConf() + val codecs = format.toLowerCase match { + case "parquet" => for { + footer <- readAllFootersWithoutSummaryFiles(new Path(path), hadoopConf) + block <- footer.getParquetMetadata.getBlocks.asScala + column <- block.getColumns.asScala + } yield column.getCodec.name() + case "orc" => new File(path).listFiles().filter { file => + file.isFile && !file.getName.endsWith(".crc") && file.getName != "_SUCCESS" + }.map { orcFile => + OrcFileOperator.getFileReader(orcFile.toPath.toString).get.getCompression.toString + }.toSeq + } + codecs.distinct + } + + private def createTable( + rootDir: File, + tableName: String, + isPartitioned: Boolean, + format: String, + compressionCodec: Option[String]): Unit = { + val tblProperties = compressionCodec match { + case Some(prop) => s"TBLPROPERTIES('${getHiveCompressPropName(format)}'='$prop')" + case _ => "" 
+ } + val partitionCreate = if (isPartitioned) "PARTITIONED BY (p string)" else "" + sql( + s""" + |CREATE TABLE $tableName(a int) + |$partitionCreate + |STORED AS $format + |LOCATION '${rootDir.toURI.toString.stripSuffix("/")}/$tableName' + |$tblProperties + """.stripMargin) + } + + private def writeDataToTable( + tableName: String, + partitionValue: Option[String]): Unit = { + val partitionInsert = partitionValue.map(p => s"partition (p='$p')").mkString + sql( + s""" + |INSERT INTO TABLE $tableName + |$partitionInsert + |SELECT * FROM table_source + """.stripMargin) + } + + private def writeDateToTableUsingCTAS( + rootDir: File, + tableName: String, + partitionValue: Option[String], + format: String, + compressionCodec: Option[String]): Unit = { + val partitionCreate = partitionValue.map(p => s"PARTITIONED BY (p)").mkString + val compressionOption = compressionCodec.map { codec => + s",'${getHiveCompressPropName(format)}'='$codec'" + }.mkString + val partitionSelect = partitionValue.map(p => s",'$p' AS p").mkString + sql( + s""" + |CREATE TABLE $tableName + |USING $format + |OPTIONS('path'='${rootDir.toURI.toString.stripSuffix("/")}/$tableName' $compressionOption) + |$partitionCreate + |AS SELECT * $partitionSelect FROM table_source + """.stripMargin) + } + + private def getPreparedTablePath( + tmpDir: File, + tableName: String, + isPartitioned: Boolean, + format: String, + compressionCodec: Option[String], + usingCTAS: Boolean): String = { + val partitionValue = if (isPartitioned) Some("test") else None + if (usingCTAS) { + writeDateToTableUsingCTAS(tmpDir, tableName, partitionValue, format, compressionCodec) + } else { + createTable(tmpDir, tableName, isPartitioned, format, compressionCodec) + writeDataToTable(tableName, partitionValue) + } + getTablePartitionPath(tmpDir, tableName, partitionValue) + } + + private def getTableSize(path: String): Long = { + val dir = new File(path) + val files = dir.listFiles().filter(_.getName.startsWith("part-")) + files.map(_.length()).sum + } + + private def getTablePartitionPath( + dir: File, + tableName: String, + partitionValue: Option[String]) = { + val partitionPath = partitionValue.map(p => s"p=$p").mkString + s"${dir.getPath.stripSuffix("/")}/$tableName/$partitionPath" + } + + private def getUncompressedDataSizeByFormat( + format: String, isPartitioned: Boolean, usingCTAS: Boolean): Long = { + var totalSize = 0L + val tableName = s"tbl_$format" + val codecName = normalizeCodecName(format, "uncompressed") + withSQLConf(getSparkCompressionConfName(format) -> codecName) { + withTempDir { tmpDir => + withTable(tableName) { + val compressionCodec = Option(codecName) + val path = getPreparedTablePath( + tmpDir, tableName, isPartitioned, format, compressionCodec, usingCTAS) + totalSize = getTableSize(path) + } + } + } + assert(totalSize > 0L) + totalSize + } + + private def checkCompressionCodecForTable( + format: String, + isPartitioned: Boolean, + compressionCodec: Option[String], + usingCTAS: Boolean) + (assertion: (String, Long) => Unit): Unit = { + val tableName = + if (usingCTAS) s"tbl_$format$isPartitioned" else s"tbl_$format${isPartitioned}_CAST" + withTempDir { tmpDir => + withTable(tableName) { + val path = getPreparedTablePath( + tmpDir, tableName, isPartitioned, format, compressionCodec, usingCTAS) + val relCompressionCodecs = getTableCompressionCodec(path, format) + assert(relCompressionCodecs.length == 1) + val tableSize = getTableSize(path) + assertion(relCompressionCodecs.head, tableSize) + } + } + } + + private def 
checkTableCompressionCodecForCodecs( + format: String, + isPartitioned: Boolean, + convertMetastore: Boolean, + usingCTAS: Boolean, + compressionCodecs: List[String], + tableCompressionCodecs: List[String]) + (assertionCompressionCodec: (Option[String], String, String, Long) => Unit): Unit = { + withSQLConf(getConvertMetastoreConfName(format) -> convertMetastore.toString) { + tableCompressionCodecs.foreach { tableCompression => + compressionCodecs.foreach { sessionCompressionCodec => + withSQLConf(getSparkCompressionConfName(format) -> sessionCompressionCodec) { + // 'tableCompression = null' means no table-level compression + val compression = Option(tableCompression) + checkCompressionCodecForTable(format, isPartitioned, compression, usingCTAS) { + case (realCompressionCodec, tableSize) => + assertionCompressionCodec( + compression, sessionCompressionCodec, realCompressionCodec, tableSize) + } + } + } + } + } + } + + // When the amount of data is small, compressed data size may be larger than uncompressed one, + // so we just check the difference when compressionCodec is not NONE or UNCOMPRESSED. + private def checkTableSize( + format: String, + compressionCodec: String, + isPartitioned: Boolean, + convertMetastore: Boolean, + usingCTAS: Boolean, + tableSize: Long): Boolean = { + val uncompressedSize = getUncompressedDataSizeByFormat(format, isPartitioned, usingCTAS) + compressionCodec match { + case "UNCOMPRESSED" if format == "parquet" => tableSize == uncompressedSize + case "NONE" if format == "orc" => tableSize == uncompressedSize + case _ => tableSize != uncompressedSize + } + } + + def checkForTableWithCompressProp(format: String, compressCodecs: List[String]): Unit = { + Seq(true, false).foreach { isPartitioned => + Seq(true, false).foreach { convertMetastore => + // TODO: Also verify CTAS(usingCTAS=true) cases when the bug(SPARK-22926) is fixed. + Seq(false).foreach { usingCTAS => + checkTableCompressionCodecForCodecs( + format, + isPartitioned, + convertMetastore, + usingCTAS, + compressionCodecs = compressCodecs, + tableCompressionCodecs = compressCodecs) { + case (tableCodec, sessionCodec, realCodec, tableSize) => + // For non-partitioned table and when convertMetastore is true, Expect session-level + // take effect, and in other cases expect table-level take effect + // TODO: It should always be table-level taking effect when the bug(SPARK-22926) + // is fixed + val expectCodec = + if (convertMetastore && !isPartitioned) sessionCodec else tableCodec.get + assert(expectCodec == realCodec) + assert(checkTableSize( + format, expectCodec, isPartitioned, convertMetastore, usingCTAS, tableSize)) + } + } + } + } + } + + def checkForTableWithoutCompressProp(format: String, compressCodecs: List[String]): Unit = { + Seq(true, false).foreach { isPartitioned => + Seq(true, false).foreach { convertMetastore => + // TODO: Also verify CTAS(usingCTAS=true) cases when the bug(SPARK-22926) is fixed. 
+ Seq(false).foreach { usingCTAS => + checkTableCompressionCodecForCodecs( + format, + isPartitioned, + convertMetastore, + usingCTAS, + compressionCodecs = compressCodecs, + tableCompressionCodecs = List(null)) { + case (tableCodec, sessionCodec, realCodec, tableSize) => + // Always expect session-level take effect + assert(sessionCodec == realCodec) + assert(checkTableSize( + format, sessionCodec, isPartitioned, convertMetastore, usingCTAS, tableSize)) + } + } + } + } + } + + test("both table-level and session-level compression are set") { + checkForTableWithCompressProp("parquet", List("UNCOMPRESSED", "SNAPPY", "GZIP")) + checkForTableWithCompressProp("orc", List("NONE", "SNAPPY", "ZLIB")) + } + + test("table-level compression is not set but session-level compressions is set ") { + checkForTableWithoutCompressProp("parquet", List("UNCOMPRESSED", "SNAPPY", "GZIP")) + checkForTableWithoutCompressProp("orc", List("NONE", "SNAPPY", "ZLIB")) + } + + def checkTableWriteWithCompressionCodecs(format: String, compressCodecs: List[String]): Unit = { + Seq(true, false).foreach { isPartitioned => + Seq(true, false).foreach { convertMetastore => + withTempDir { tmpDir => + val tableName = s"tbl_$format$isPartitioned" + createTable(tmpDir, tableName, isPartitioned, format, None) + withTable(tableName) { + compressCodecs.foreach { compressionCodec => + val partitionValue = if (isPartitioned) Some(compressionCodec) else None + withSQLConf(getConvertMetastoreConfName(format) -> convertMetastore.toString, + getSparkCompressionConfName(format) -> compressionCodec + ) { writeDataToTable(tableName, partitionValue) } + } + val tablePath = getTablePartitionPath(tmpDir, tableName, None) + val realCompressionCodecs = + if (isPartitioned) compressCodecs.flatMap { codec => + getTableCompressionCodec(s"$tablePath/p=$codec", format) + } else { + getTableCompressionCodec(tablePath, format) + } + + assert(realCompressionCodecs.distinct.sorted == compressCodecs.sorted) + val recordsNum = sql(s"SELECT * from $tableName").count() + assert(recordsNum == maxRecordNum * compressCodecs.length) + } + } + } + } + } + + test("test table containing mixed compression codec") { + checkTableWriteWithCompressionCodecs("parquet", List("UNCOMPRESSED", "SNAPPY", "GZIP")) + checkTableWriteWithCompressionCodecs("orc", List("NONE", "SNAPPY", "ZLIB")) + } +} From e0ef30f770329f058843a7a486bf357e9cd6e26a Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Sat, 20 Jan 2018 22:39:49 -0800 Subject: [PATCH 150/566] [SPARK-23087][SQL] CheckCartesianProduct too restrictive when condition is false/null ## What changes were proposed in this pull request? CheckCartesianProduct raises an AnalysisException also when the join condition is always false/null. In this case, we shouldn't raise it, since the result will not be a cartesian product. ## How was this patch tested? added UT Author: Marco Gaido Closes #20333 from mgaido91/SPARK-23087. 
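To make the scenario concrete, here is a hedged Scala sketch (assuming a `SparkSession` named `spark` with cross joins left disabled, mirroring the new DataFrameJoinSuite test below): a join whose condition is always null can never match any row pair, so it cannot produce a cartesian product and should not be rejected.

```scala
import org.apache.spark.sql.functions.lit

val left  = spark.range(10)
val right = spark.range(10).select(lit(null).cast("long").as("b"))

// The condition below is always null, so no row pair can ever match and the
// plan cannot degenerate into a cartesian product; with this fix the
// optimizer no longer raises an AnalysisException while optimizing it.
val joined = left.join(right, left("id") === right("b"), "left")
joined.explain()
```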
(cherry picked from commit 121dc96f088a7b157d5b2cffb626b0e22d1fc052) Signed-off-by: gatorsmile --- .../spark/sql/catalyst/optimizer/Optimizer.scala | 10 +++++++--- .../org/apache/spark/sql/DataFrameJoinSuite.scala | 14 ++++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index c794ba8619322..0f9daa5f04c76 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -1108,15 +1108,19 @@ object CheckCartesianProducts extends Rule[LogicalPlan] with PredicateHelper { */ def isCartesianProduct(join: Join): Boolean = { val conditions = join.condition.map(splitConjunctivePredicates).getOrElse(Nil) - !conditions.map(_.references).exists(refs => refs.exists(join.left.outputSet.contains) - && refs.exists(join.right.outputSet.contains)) + + conditions match { + case Seq(Literal.FalseLiteral) | Seq(Literal(null, BooleanType)) => false + case _ => !conditions.map(_.references).exists(refs => + refs.exists(join.left.outputSet.contains) && refs.exists(join.right.outputSet.contains)) + } } def apply(plan: LogicalPlan): LogicalPlan = if (SQLConf.get.crossJoinEnabled) { plan } else plan transform { - case j @ Join(left, right, Inner | LeftOuter | RightOuter | FullOuter, condition) + case j @ Join(left, right, Inner | LeftOuter | RightOuter | FullOuter, _) if isCartesianProduct(j) => throw new AnalysisException( s"""Detected cartesian product for ${j.joinType.sql} join between logical plans diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala index aef0d7f3e425b..1656f290ee19c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala @@ -274,4 +274,18 @@ class DataFrameJoinSuite extends QueryTest with SharedSQLContext { checkAnswer(innerJoin, Row(1) :: Nil) } + test("SPARK-23087: don't throw Analysis Exception in CheckCartesianProduct when join condition " + + "is false or null") { + val df = spark.range(10) + val dfNull = spark.range(10).select(lit(null).as("b")) + val planNull = df.join(dfNull, $"id" === $"b", "left").queryExecution.analyzed + + spark.sessionState.executePlan(planNull).optimizedPlan + + val dfOne = df.select(lit(1).as("a")) + val dfTwo = spark.range(10).select(lit(2).as("b")) + val planFalse = dfOne.join(dfTwo, $"a" === $"b", "left").queryExecution.analyzed + + spark.sessionState.executePlan(planFalse).optimizedPlan + } } From 7520491bf80eb2e21f0630aa13d7cdaad881626b Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sun, 21 Jan 2018 11:23:51 -0800 Subject: [PATCH 151/566] [SPARK-21293][SS][SPARKR] Add doc example for streaming join, dedup ## What changes were proposed in this pull request? streaming programming guide changes ## How was this patch tested? manually Author: Felix Cheung Closes #20340 from felixcheung/rstreamdoc. 
(cherry picked from commit 2239d7a410e906ccd40aa8e84d637e9d06cd7b8a) Signed-off-by: Felix Cheung --- .../structured-streaming-programming-guide.md | 74 ++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 2ef5d3168a87b..62589a62ac4c4 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -1100,6 +1100,21 @@ streamingDf.join(staticDf, "type") # inner equi-join with a static DF streamingDf.join(staticDf, "type", "right_join") # right outer join with a static DF {% endhighlight %} +
+ +
+ +{% highlight r %} +staticDf <- read.df(...) +streamingDf <- read.stream(...) +joined <- merge(streamingDf, staticDf, sort = FALSE) # inner equi-join with a static DF +joined <- join( + staticDf, + streamingDf, + streamingDf$value == staticDf$value, + "right_outer") # right outer join with a static DF +{% endhighlight %} +
@@ -1227,6 +1242,30 @@ impressionsWithWatermark.join( {% endhighlight %} + +
+ +{% highlight r %} +impressions <- read.stream(...) +clicks <- read.stream(...) + +# Apply watermarks on event-time columns +impressionsWithWatermark <- withWatermark(impressions, "impressionTime", "2 hours") +clicksWithWatermark <- withWatermark(clicks, "clickTime", "3 hours") + +# Join with event-time constraints +joined <- join( + impressionsWithWatermark, + clicksWithWatermark, + expr( + paste( + "clickAdId = impressionAdId AND", + "clickTime >= impressionTime AND", + "clickTime <= impressionTime + interval 1 hour" +))) + +{% endhighlight %} +
@@ -1287,6 +1326,23 @@ impressionsWithWatermark.join( {% endhighlight %} + +
+ +{% highlight r %} +joined <- join( + impressionsWithWatermark, + clicksWithWatermark, + expr( + paste( + "clickAdId = impressionAdId AND", + "clickTime >= impressionTime AND", + "clickTime <= impressionTime + interval 1 hour"), + "left_outer" # can be "inner", "left_outer", "right_outer" +)) + +{% endhighlight %} +
@@ -1441,15 +1497,29 @@ streamingDf {% highlight python %} streamingDf = spark.readStream. ... -// Without watermark using guid column +# Without watermark using guid column streamingDf.dropDuplicates("guid") -// With watermark using guid and eventTime columns +# With watermark using guid and eventTime columns streamingDf \ .withWatermark("eventTime", "10 seconds") \ .dropDuplicates("guid", "eventTime") {% endhighlight %} + +
+ +{% highlight r %} +streamingDf <- read.stream(...) + +# Without watermark using guid column +streamingDf <- dropDuplicates(streamingDf, "guid") + +# With watermark using guid and eventTime columns +streamingDf <- withWatermark(streamingDf, "eventTime", "10 seconds") +streamingDf <- dropDuplicates(streamingDf, "guid", "eventTime") +{% endhighlight %} +
From 5781fa79e28e2123e370fc1096488e318f2b4ee2 Mon Sep 17 00:00:00 2001 From: Russell Spitzer Date: Mon, 22 Jan 2018 12:27:51 +0800 Subject: [PATCH 152/566] [SPARK-22976][CORE] Cluster mode driver dir removed while running ## What changes were proposed in this pull request? The clean up logic on the worker perviously determined the liveness of a particular applicaiton based on whether or not it had running executors. This would fail in the case that a directory was made for a driver running in cluster mode if that driver had no running executors on the same machine. To preserve driver directories we consider both executors and running drivers when checking directory liveness. ## How was this patch tested? Manually started up two node cluster with a single core on each node. Turned on worker directory cleanup and set the interval to 1 second and liveness to one second. Without the patch the driver directory is removed immediately after the app is launched. With the patch it is not ### Without Patch ``` INFO 2018-01-05 23:48:24,693 Logging.scala:54 - Asked to launch driver driver-20180105234824-0000 INFO 2018-01-05 23:48:25,293 Logging.scala:54 - Changing view acls to: cassandra INFO 2018-01-05 23:48:25,293 Logging.scala:54 - Changing modify acls to: cassandra INFO 2018-01-05 23:48:25,294 Logging.scala:54 - Changing view acls groups to: INFO 2018-01-05 23:48:25,294 Logging.scala:54 - Changing modify acls groups to: INFO 2018-01-05 23:48:25,294 Logging.scala:54 - SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(cassandra); groups with view permissions: Set(); users with modify permissions: Set(cassandra); groups with modify permissions: Set() INFO 2018-01-05 23:48:25,330 Logging.scala:54 - Copying user jar file:/home/automaton/writeRead-0.1.jar to /var/lib/spark/worker/driver-20180105234824-0000/writeRead-0.1.jar INFO 2018-01-05 23:48:25,332 Logging.scala:54 - Copying /home/automaton/writeRead-0.1.jar to /var/lib/spark/worker/driver-20180105234824-0000/writeRead-0.1.jar INFO 2018-01-05 23:48:25,361 Logging.scala:54 - Launch Command: "/usr/lib/jvm/jdk1.8.0_40//bin/java" .... 
**** INFO 2018-01-05 23:48:56,577 Logging.scala:54 - Removing directory: /var/lib/spark/worker/driver-20180105234824-0000 ### << Cleaned up **** -- One minute passes while app runs (app has 1 minute sleep built in) -- WARN 2018-01-05 23:49:58,080 ShuffleSecretManager.java:73 - Attempted to unregister application app-20180105234831-0000 when it is not registered INFO 2018-01-05 23:49:58,081 ExternalShuffleBlockResolver.java:163 - Application app-20180105234831-0000 removed, cleanupLocalDirs = false INFO 2018-01-05 23:49:58,081 ExternalShuffleBlockResolver.java:163 - Application app-20180105234831-0000 removed, cleanupLocalDirs = false INFO 2018-01-05 23:49:58,082 ExternalShuffleBlockResolver.java:163 - Application app-20180105234831-0000 removed, cleanupLocalDirs = true INFO 2018-01-05 23:50:00,999 Logging.scala:54 - Driver driver-20180105234824-0000 exited successfully ``` With Patch ``` INFO 2018-01-08 23:19:54,603 Logging.scala:54 - Asked to launch driver driver-20180108231954-0002 INFO 2018-01-08 23:19:54,975 Logging.scala:54 - Changing view acls to: automaton INFO 2018-01-08 23:19:54,976 Logging.scala:54 - Changing modify acls to: automaton INFO 2018-01-08 23:19:54,976 Logging.scala:54 - Changing view acls groups to: INFO 2018-01-08 23:19:54,976 Logging.scala:54 - Changing modify acls groups to: INFO 2018-01-08 23:19:54,976 Logging.scala:54 - SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(automaton); groups with view permissions: Set(); users with modify permissions: Set(automaton); groups with modify permissions: Set() INFO 2018-01-08 23:19:55,029 Logging.scala:54 - Copying user jar file:/home/automaton/writeRead-0.1.jar to /var/lib/spark/worker/driver-20180108231954-0002/writeRead-0.1.jar INFO 2018-01-08 23:19:55,031 Logging.scala:54 - Copying /home/automaton/writeRead-0.1.jar to /var/lib/spark/worker/driver-20180108231954-0002/writeRead-0.1.jar INFO 2018-01-08 23:19:55,038 Logging.scala:54 - Launch Command: ...... INFO 2018-01-08 23:21:28,674 ShuffleSecretManager.java:69 - Unregistered shuffle secret for application app-20180108232000-0000 INFO 2018-01-08 23:21:28,675 ExternalShuffleBlockResolver.java:163 - Application app-20180108232000-0000 removed, cleanupLocalDirs = false INFO 2018-01-08 23:21:28,675 ExternalShuffleBlockResolver.java:163 - Application app-20180108232000-0000 removed, cleanupLocalDirs = false INFO 2018-01-08 23:21:28,681 ExternalShuffleBlockResolver.java:163 - Application app-20180108232000-0000 removed, cleanupLocalDirs = true INFO 2018-01-08 23:21:31,703 Logging.scala:54 - Driver driver-20180108231954-0002 exited successfully ***** INFO 2018-01-08 23:21:32,346 Logging.scala:54 - Removing directory: /var/lib/spark/worker/driver-20180108231954-0002 ### < Happening AFTER the Run completes rather than during it ***** ``` Author: Russell Spitzer Closes #20298 from RussellSpitzer/SPARK-22976-master. 
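The gist of the change, expressed as a hedged stand-alone sketch (simplified types; the real Worker tracks executor and driver runner objects in mutable maps): a work directory is now considered live if it belongs either to an application with a running executor or to a driver still running on the worker.

```scala
// Simplified sketch only; field and type names are illustrative.
case class ExecutorInfo(appId: String)
case class DriverInfo(driverId: String)

def liveWorkDirNames(
    executors: Map[String, ExecutorInfo],
    drivers: Map[String, DriverInfo]): Set[String] = {
  // Previously only executor app IDs were kept, so a cluster-mode driver's
  // directory on a worker without executors for that app was cleaned up
  // while the driver was still running.
  executors.values.map(_.appId).toSet ++ drivers.values.map(_.driverId).toSet
}
```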
(cherry picked from commit 11daeb833222b1cd349fb1410307d64ab33981db) Signed-off-by: jerryshao --- core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index 3962d422f81d3..563b84934f264 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -441,7 +441,7 @@ private[deploy] class Worker( // Spin up a separate thread (in a future) to do the dir cleanup; don't tie up worker // rpcEndpoint. // Copy ids so that it can be used in the cleanup thread. - val appIds = executors.values.map(_.appId).toSet + val appIds = (executors.values.map(_.appId) ++ drivers.values.map(_.driverId)).toSet val cleanupFuture = concurrent.Future { val appDirs = workDir.listFiles() if (appDirs == null) { From 36af73b59b6fb3d5f8e8a8e1caf44bd565e97b3d Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 22 Jan 2018 15:18:57 +0900 Subject: [PATCH 153/566] [MINOR][SQL] Fix wrong comments on org.apache.spark.sql.parquet.row.attributes ## What changes were proposed in this pull request? This PR fixes the wrong comment on `org.apache.spark.sql.parquet.row.attributes` which is useful for UDTs like Vector/Matrix. Please see [SPARK-22320](https://issues.apache.org/jira/browse/SPARK-22320) for the usage. Originally, [SPARK-19411](https://github.com/apache/spark/commit/bf493686eb17006727b3ec81849b22f3df68fdef#diff-ee26d4c4be21e92e92a02e9f16dbc285L314) left this behind during removing optional column metadatas. In the same PR, the same comment was removed at line 310-311. ## How was this patch tested? N/A (This is about comments). Author: Dongjoon Hyun Closes #20346 from dongjoon-hyun/minor_comment_parquet. (cherry picked from commit 8142a3b883a5fe6fc620a2c5b25b6bde4fda32e5) Signed-off-by: hyukjinkwon --- .../sql/execution/datasources/parquet/ParquetFileFormat.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 45bedf70f975c..f53a97ba45a26 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -108,8 +108,7 @@ class ParquetFileFormat ParquetOutputFormat.setWriteSupportClass(job, classOf[ParquetWriteSupport]) - // We want to clear this temporary metadata from saving into Parquet file. - // This metadata is only useful for detecting optional columns when pushdowning filters. + // This metadata is useful for keeping UDTs like Vector/Matrix. ParquetWriteSupport.setSchema(dataSchema, conf) // Sets flags for `ParquetWriteSupport`, which converts Catalyst schema to Parquet From 57c320a0dcc6ca784331af0191438e252d418075 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 22 Jan 2018 14:49:12 +0800 Subject: [PATCH 154/566] [SPARK-23020][CORE] Fix races in launcher code, test. The race in the code is because the handle might update its state to the wrong state if the connection handling thread is still processing incoming data; so the handle needs to wait for the connection to finish up before checking the final state. 
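To illustrate the ordering constraint described above with a hedged, self-contained sketch (a hypothetical `Handle` class, not the actual launcher code): disposal has to wait for the connection thread to drain its pending updates before deciding on a final state, otherwise a late update can overwrite it.

```scala
// Hypothetical illustration only: the real handles synchronize with the
// LauncherServer connection rather than joining a raw thread.
class Handle {
  @volatile private[this] var state: String = "RUNNING"
  @volatile private[this] var connection: Option[Thread] = None

  def attachConnection(t: Thread): Unit = connection = Some(t)

  def update(newState: String): Unit = synchronized { state = newState }

  def dispose(): Unit = {
    // Wait for the connection thread to finish delivering state updates
    // before inspecting the state; checking first and joining later is the
    // kind of race the patch removes.
    connection.foreach(_.join())
    synchronized {
      if (state != "FINISHED" && state != "KILLED") state = "LOST"
    }
  }
}
```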
The race in the test is because when waiting for a handle to reach a final state, the waitFor() method needs to wait until all handle state is updated (which also includes waiting for the connection thread above to finish). Otherwise, waitFor() may return too early, which would cause a bunch of different races (like the listener not being yet notified of the state change, or being in the middle of being notified, or the handle not being properly disposed and causing postChecks() to assert). On top of that I found, by code inspection, a couple of potential races that could make a handle end up in the wrong state when being killed. The original version of this fix introduced the flipped version of the first race described above; the connection closing might override the handle state before the handle might have a chance to do cleanup. The fix there is to only dispose of the handle from the connection when there is an error, and let the handle dispose itself in the normal case. The fix also caused a bug in YarnClusterSuite to be surfaced; the code was checking for a file in the classpath that was not expected to be there in client mode. Because of the above issues, the error was not propagating correctly and the (buggy) test was incorrectly passing. Tested by running the existing unit tests a lot (and not seeing the errors I was seeing before). Author: Marcelo Vanzin Closes #20297 from vanzin/SPARK-23020. (cherry picked from commit ec228976156619ed8df21a85bceb5fd3bdeb5855) Signed-off-by: Wenchen Fan --- .../spark/launcher/SparkLauncherSuite.java | 53 +++++++++++-------- .../spark/launcher/AbstractAppHandle.java | 22 ++++++-- .../spark/launcher/ChildProcAppHandle.java | 18 ++++--- .../spark/launcher/InProcessAppHandle.java | 17 +++--- .../spark/launcher/LauncherConnection.java | 18 +++---- .../apache/spark/launcher/LauncherServer.java | 49 +++++++++++++---- .../org/apache/spark/launcher/BaseSuite.java | 42 ++++++++++++--- .../spark/launcher/LauncherServerSuite.java | 23 ++++---- .../spark/deploy/yarn/YarnClusterSuite.scala | 4 +- 9 files changed, 165 insertions(+), 81 deletions(-) diff --git a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java index dffa609f1cbdf..a042375c6ae91 100644 --- a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java +++ b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java @@ -17,6 +17,7 @@ package org.apache.spark.launcher; +import java.time.Duration; import java.util.Arrays; import java.util.ArrayList; import java.util.HashMap; @@ -25,13 +26,13 @@ import java.util.Properties; import java.util.concurrent.TimeUnit; -import org.junit.Ignore; import org.junit.Test; import static org.junit.Assert.*; import static org.junit.Assume.*; import static org.mockito.Mockito.*; import org.apache.spark.SparkContext; +import org.apache.spark.SparkContext$; import org.apache.spark.internal.config.package$; import org.apache.spark.util.Utils; @@ -121,8 +122,7 @@ public void testChildProcLauncher() throws Exception { assertEquals(0, app.waitFor()); } - // TODO: [SPARK-23020] Re-enable this - @Ignore + @Test public void testInProcessLauncher() throws Exception { // Because this test runs SparkLauncher in process and in client mode, it pollutes the system // properties, and that can cause test failures down the test pipeline. 
So restore the original @@ -139,7 +139,9 @@ public void testInProcessLauncher() throws Exception { // Here DAGScheduler is stopped, while SparkContext.clearActiveContext may not be called yet. // Wait for a reasonable amount of time to avoid creating two active SparkContext in JVM. // See SPARK-23019 and SparkContext.stop() for details. - TimeUnit.MILLISECONDS.sleep(500); + eventually(Duration.ofSeconds(5), Duration.ofMillis(10), () -> { + assertTrue("SparkContext is still alive.", SparkContext$.MODULE$.getActive().isEmpty()); + }); } } @@ -148,26 +150,35 @@ private void inProcessLauncherTestImpl() throws Exception { SparkAppHandle.Listener listener = mock(SparkAppHandle.Listener.class); doAnswer(invocation -> { SparkAppHandle h = (SparkAppHandle) invocation.getArguments()[0]; - transitions.add(h.getState()); + synchronized (transitions) { + transitions.add(h.getState()); + } return null; }).when(listener).stateChanged(any(SparkAppHandle.class)); - SparkAppHandle handle = new InProcessLauncher() - .setMaster("local") - .setAppResource(SparkLauncher.NO_RESOURCE) - .setMainClass(InProcessTestApp.class.getName()) - .addAppArgs("hello") - .startApplication(listener); - - waitFor(handle); - assertEquals(SparkAppHandle.State.FINISHED, handle.getState()); - - // Matches the behavior of LocalSchedulerBackend. - List expected = Arrays.asList( - SparkAppHandle.State.CONNECTED, - SparkAppHandle.State.RUNNING, - SparkAppHandle.State.FINISHED); - assertEquals(expected, transitions); + SparkAppHandle handle = null; + try { + handle = new InProcessLauncher() + .setMaster("local") + .setAppResource(SparkLauncher.NO_RESOURCE) + .setMainClass(InProcessTestApp.class.getName()) + .addAppArgs("hello") + .startApplication(listener); + + waitFor(handle); + assertEquals(SparkAppHandle.State.FINISHED, handle.getState()); + + // Matches the behavior of LocalSchedulerBackend. + List expected = Arrays.asList( + SparkAppHandle.State.CONNECTED, + SparkAppHandle.State.RUNNING, + SparkAppHandle.State.FINISHED); + assertEquals(expected, transitions); + } finally { + if (handle != null) { + handle.kill(); + } + } } public static class SparkLauncherTestApp { diff --git a/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java index df1e7316861d4..daf0972f824dd 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java @@ -33,7 +33,7 @@ abstract class AbstractAppHandle implements SparkAppHandle { private List listeners; private State state; private String appId; - private boolean disposed; + private volatile boolean disposed; protected AbstractAppHandle(LauncherServer server) { this.server = server; @@ -70,8 +70,7 @@ public void stop() { @Override public synchronized void disconnect() { - if (!disposed) { - disposed = true; + if (!isDisposed()) { if (connection != null) { try { connection.close(); @@ -79,7 +78,7 @@ public synchronized void disconnect() { // no-op. } } - server.unregister(this); + dispose(); } } @@ -95,6 +94,21 @@ boolean isDisposed() { return disposed; } + /** + * Mark the handle as disposed, and set it as LOST in case the current state is not final. + */ + synchronized void dispose() { + if (!isDisposed()) { + // Unregister first to make sure that the connection with the app has been really + // terminated. 
+ server.unregister(this); + if (!getState().isFinal()) { + setState(State.LOST); + } + this.disposed = true; + } + } + void setState(State s) { setState(s, false); } diff --git a/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java index 8b3f427b7750e..2b99461652e1f 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java @@ -48,14 +48,16 @@ public synchronized void disconnect() { @Override public synchronized void kill() { - disconnect(); - if (childProc != null) { - if (childProc.isAlive()) { - childProc.destroyForcibly(); + if (!isDisposed()) { + setState(State.KILLED); + disconnect(); + if (childProc != null) { + if (childProc.isAlive()) { + childProc.destroyForcibly(); + } + childProc = null; } - childProc = null; } - setState(State.KILLED); } void setChildProc(Process childProc, String loggerName, InputStream logStream) { @@ -94,8 +96,6 @@ void monitorChild() { return; } - disconnect(); - int ec; try { ec = proc.exitValue(); @@ -118,6 +118,8 @@ void monitorChild() { if (newState != null) { setState(newState, true); } + + disconnect(); } } diff --git a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java index acd64c962604f..f04263cb74a58 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java @@ -39,15 +39,16 @@ class InProcessAppHandle extends AbstractAppHandle { @Override public synchronized void kill() { - LOG.warning("kill() may leave the underlying app running in in-process mode."); - disconnect(); - - // Interrupt the thread. This is not guaranteed to kill the app, though. - if (app != null) { - app.interrupt(); + if (!isDisposed()) { + LOG.warning("kill() may leave the underlying app running in in-process mode."); + setState(State.KILLED); + disconnect(); + + // Interrupt the thread. This is not guaranteed to kill the app, though. 
+ if (app != null) { + app.interrupt(); + } } - - setState(State.KILLED); } synchronized void start(String appName, Method main, String[] args) { diff --git a/launcher/src/main/java/org/apache/spark/launcher/LauncherConnection.java b/launcher/src/main/java/org/apache/spark/launcher/LauncherConnection.java index b4a8719e26053..e8ab3f5e369ab 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/LauncherConnection.java +++ b/launcher/src/main/java/org/apache/spark/launcher/LauncherConnection.java @@ -53,7 +53,7 @@ abstract class LauncherConnection implements Closeable, Runnable { public void run() { try { FilteredObjectInputStream in = new FilteredObjectInputStream(socket.getInputStream()); - while (!closed) { + while (isOpen()) { Message msg = (Message) in.readObject(); handle(msg); } @@ -95,15 +95,15 @@ protected synchronized void send(Message msg) throws IOException { } @Override - public void close() throws IOException { - if (!closed) { - synchronized (this) { - if (!closed) { - closed = true; - socket.close(); - } - } + public synchronized void close() throws IOException { + if (isOpen()) { + closed = true; + socket.close(); } } + boolean isOpen() { + return !closed; + } + } diff --git a/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java index b8999a1d7a4f4..8091885c4f562 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java +++ b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java @@ -217,6 +217,33 @@ void unregister(AbstractAppHandle handle) { break; } } + + // If there is a live connection for this handle, we need to wait for it to finish before + // returning, otherwise there might be a race between the connection thread processing + // buffered data and the handle cleaning up after itself, leading to potentially the wrong + // state being reported for the handle. + ServerConnection conn = null; + synchronized (clients) { + for (ServerConnection c : clients) { + if (c.handle == handle) { + conn = c; + break; + } + } + } + + if (conn != null) { + synchronized (conn) { + if (conn.isOpen()) { + try { + conn.wait(); + } catch (InterruptedException ie) { + // Ignore. + } + } + } + } + unref(); } @@ -288,7 +315,7 @@ private String createSecret() { private class ServerConnection extends LauncherConnection { private TimerTask timeout; - private AbstractAppHandle handle; + volatile AbstractAppHandle handle; ServerConnection(Socket socket, TimerTask timeout) throws IOException { super(socket); @@ -313,7 +340,7 @@ protected void handle(Message msg) throws IOException { } else { if (handle == null) { throw new IllegalArgumentException("Expected hello, got: " + - msg != null ? msg.getClass().getName() : null); + msg != null ? 
msg.getClass().getName() : null); } if (msg instanceof SetAppId) { SetAppId set = (SetAppId) msg; @@ -331,6 +358,9 @@ protected void handle(Message msg) throws IOException { timeout.cancel(); } close(); + if (handle != null) { + handle.dispose(); + } } finally { timeoutTimer.purge(); } @@ -338,16 +368,17 @@ protected void handle(Message msg) throws IOException { @Override public void close() throws IOException { + if (!isOpen()) { + return; + } + synchronized (clients) { clients.remove(this); } - super.close(); - if (handle != null) { - if (!handle.getState().isFinal()) { - LOG.log(Level.WARNING, "Lost connection to spark application."); - handle.setState(SparkAppHandle.State.LOST); - } - handle.disconnect(); + + synchronized (this) { + super.close(); + notifyAll(); } } diff --git a/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java b/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java index 3e1a90eae98d4..3722a59d9438e 100644 --- a/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java +++ b/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java @@ -17,6 +17,7 @@ package org.apache.spark.launcher; +import java.time.Duration; import java.util.concurrent.TimeUnit; import org.junit.After; @@ -47,19 +48,46 @@ public void postChecks() { assertNull(server); } - protected void waitFor(SparkAppHandle handle) throws Exception { - long deadline = System.nanoTime() + TimeUnit.SECONDS.toNanos(10); + protected void waitFor(final SparkAppHandle handle) throws Exception { try { - while (!handle.getState().isFinal()) { - assertTrue("Timed out waiting for handle to transition to final state.", - System.nanoTime() < deadline); - TimeUnit.MILLISECONDS.sleep(10); - } + eventually(Duration.ofSeconds(10), Duration.ofMillis(10), () -> { + assertTrue("Handle is not in final state.", handle.getState().isFinal()); + }); } finally { if (!handle.getState().isFinal()) { handle.kill(); } } + + // Wait until the handle has been marked as disposed, to make sure all cleanup tasks + // have been performed. + AbstractAppHandle ahandle = (AbstractAppHandle) handle; + eventually(Duration.ofSeconds(10), Duration.ofMillis(10), () -> { + assertTrue("Handle is still not marked as disposed.", ahandle.isDisposed()); + }); + } + + /** + * Call a closure that performs a check every "period" until it succeeds, or the timeout + * elapses. 
+ */ + protected void eventually(Duration timeout, Duration period, Runnable check) throws Exception { + assertTrue("Timeout needs to be larger than period.", timeout.compareTo(period) > 0); + long deadline = System.nanoTime() + timeout.toNanos(); + int count = 0; + while (true) { + try { + count++; + check.run(); + return; + } catch (Throwable t) { + if (System.nanoTime() >= deadline) { + String msg = String.format("Failed check after %d tries: %s.", count, t.getMessage()); + throw new IllegalStateException(msg, t); + } + Thread.sleep(period.toMillis()); + } + } } } diff --git a/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java b/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java index 7e2b09ce25c9b..024efac33c391 100644 --- a/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java +++ b/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java @@ -23,12 +23,14 @@ import java.net.InetAddress; import java.net.Socket; import java.net.SocketException; +import java.time.Duration; import java.util.Arrays; import java.util.List; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import org.junit.Test; import static org.junit.Assert.*; @@ -143,7 +145,8 @@ public void infoChanged(SparkAppHandle handle) { assertTrue(semaphore.tryAcquire(30, TimeUnit.SECONDS)); // Make sure the server matched the client to the handle. assertNotNull(handle.getConnection()); - close(client); + client.close(); + handle.dispose(); assertTrue(semaphore.tryAcquire(30, TimeUnit.SECONDS)); assertEquals(SparkAppHandle.State.LOST, handle.getState()); } finally { @@ -197,28 +200,20 @@ private void close(Closeable c) { * server-side close immediately. */ private void waitForError(TestClient client, String secret) throws Exception { - boolean helloSent = false; - int maxTries = 10; - for (int i = 0; i < maxTries; i++) { + final AtomicBoolean helloSent = new AtomicBoolean(); + eventually(Duration.ofSeconds(1), Duration.ofMillis(10), () -> { try { - if (!helloSent) { + if (!helloSent.get()) { client.send(new Hello(secret, "1.4.0")); - helloSent = true; + helloSent.set(true); } else { client.send(new SetAppId("appId")); } fail("Expected error but message went through."); } catch (IllegalStateException | IOException e) { // Expected. - break; - } catch (AssertionError e) { - if (i < maxTries - 1) { - Thread.sleep(100); - } else { - throw new AssertionError("Test failed after " + maxTries + " attempts.", e); - } } - } + }); } private static class TestClient extends LauncherConnection { diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index 061f653b97b7a..e9dcfaf6ba4f0 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -381,7 +381,9 @@ private object YarnClusterDriver extends Logging with Matchers { // Verify that the config archive is correctly placed in the classpath of all containers. 
val confFile = "/" + Client.SPARK_CONF_FILE - assert(getClass().getResource(confFile) != null) + if (conf.getOption(SparkLauncher.DEPLOY_MODE) == Some("cluster")) { + assert(getClass().getResource(confFile) != null) + } val configFromExecutors = sc.parallelize(1 to 4, 4) .map { _ => Option(getClass().getResource(confFile)).map(_.toString).orNull } .collect() From cf078a205a14d8709e2c4a9d9f23f6efa20b4fe7 Mon Sep 17 00:00:00 2001 From: Arseniy Tashoyan Date: Mon, 22 Jan 2018 20:17:05 +0800 Subject: [PATCH 155/566] [MINOR][DOC] Fix the path to the examples jar ## What changes were proposed in this pull request? The example jar file is now in ./examples/jars directory of Spark distribution. Author: Arseniy Tashoyan Closes #20349 from tashoyan/patch-1. (cherry picked from commit 60175e959f275d2961798fbc5a9150dac9de51ff) Signed-off-by: jerryshao --- docs/running-on-yarn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index e4f5a0c659e66..c010af35f8d2e 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -35,7 +35,7 @@ For example: --executor-memory 2g \ --executor-cores 1 \ --queue thequeue \ - lib/spark-examples*.jar \ + examples/jars/spark-examples*.jar \ 10 The above starts a YARN client program which starts the default Application Master. Then SparkPi will be run as a child thread of Application Master. The client will periodically poll the Application Master for status updates and display them in the console. The client will exit once your application has finished running. Refer to the "Debugging your Application" section below for how to see driver and executor logs. From 743b9173f8feaed8e594961aa85d61fb3f8e5e70 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Mon, 22 Jan 2018 04:27:59 -0800 Subject: [PATCH 156/566] [SPARK-23122][PYSPARK][FOLLOW-UP] Update the docs for UDF Registration ## What changes were proposed in this pull request? This PR is to update the docs for UDF registration ## How was this patch tested? N/A Author: gatorsmile Closes #20348 from gatorsmile/testUpdateDoc. (cherry picked from commit 73281161fc7fddd645c712986ec376ac2b1bd213) Signed-off-by: gatorsmile --- python/pyspark/sql/udf.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index c77f19f89a442..134badb8485f5 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -199,8 +199,8 @@ def __init__(self, sparkSession): @ignore_unicode_prefix @since("1.3.1") def register(self, name, f, returnType=None): - """Registers a Python function (including lambda function) or a user-defined function - in SQL statements. + """Register a Python function (including lambda function) or a user-defined function + as a SQL function. :param name: name of the user-defined function in SQL statements. :param f: a Python function, or a user-defined function. The user-defined function can @@ -210,6 +210,10 @@ def register(self, name, f, returnType=None): be either a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. :return: a user-defined function. + To register a nondeterministic Python function, users need to first build + a nondeterministic user-defined function for the Python function and then register it + as a SQL function. + `returnType` can be optionally specified when `f` is a Python function but not when `f` is a user-defined function. Please see below. 
@@ -297,7 +301,7 @@ def register(self, name, f, returnType=None): @ignore_unicode_prefix @since(2.3) def registerJavaFunction(self, name, javaClassName, returnType=None): - """Register a Java user-defined function so it can be used in SQL statements. + """Register a Java user-defined function as a SQL function. In addition to a name and the function itself, the return type can be optionally specified. When the return type is not specified we would infer it via reflection. @@ -334,7 +338,7 @@ def registerJavaFunction(self, name, javaClassName, returnType=None): @ignore_unicode_prefix @since(2.3) def registerJavaUDAF(self, name, javaClassName): - """Register a Java user-defined aggregate function so it can be used in SQL statements. + """Register a Java user-defined aggregate function as a SQL function. :param name: name of the user-defined aggregate function :param javaClassName: fully qualified name of java class From d933fcea6f3b1d2a5bfb03d808ec83db0f97298a Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Mon, 22 Jan 2018 04:31:24 -0800 Subject: [PATCH 157/566] [SPARK-23170][SQL] Dump the statistics of effective runs of analyzer and optimizer rules ## What changes were proposed in this pull request? Dump the statistics of effective runs of analyzer and optimizer rules. ## How was this patch tested? Do a manual run of TPCDSQuerySuite ``` === Metrics of Analyzer/Optimizer Rules === Total number of runs: 175899 Total time: 25.486559948 seconds Rule Effective Time / Total Time Effective Runs / Total Runs org.apache.spark.sql.catalyst.optimizer.ColumnPruning 1603280450 / 2868461549 761 / 1877 org.apache.spark.sql.catalyst.analysis.Analyzer$CTESubstitution 2045860009 / 2056602674 37 / 788 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAggregateFunctions 440719059 / 1693110949 38 / 1982 org.apache.spark.sql.catalyst.optimizer.Optimizer$OptimizeSubqueries 1429834919 / 1446016225 39 / 285 org.apache.spark.sql.catalyst.optimizer.PruneFilters 33273083 / 1389586938 3 / 1592 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences 821183615 / 1266668754 616 / 1982 org.apache.spark.sql.catalyst.optimizer.ReorderJoin 775837028 / 866238225 132 / 1592 org.apache.spark.sql.catalyst.analysis.DecimalPrecision 550683593 / 748854507 211 / 1982 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveSubquery 513075345 / 634370596 49 / 1982 org.apache.spark.sql.catalyst.analysis.Analyzer$FixNullability 33475731 / 606406532 12 / 742 org.apache.spark.sql.catalyst.analysis.TypeCoercion$ImplicitTypeCasts 193144298 / 545403925 86 / 1982 org.apache.spark.sql.catalyst.optimizer.BooleanSimplification 18651497 / 495725004 7 / 1592 org.apache.spark.sql.catalyst.optimizer.PushPredicateThroughJoin 369257217 / 489934378 709 / 1592 org.apache.spark.sql.catalyst.optimizer.RemoveRedundantAliases 3707000 / 468291609 9 / 1592 org.apache.spark.sql.catalyst.optimizer.InferFiltersFromConstraints 410155900 / 435254175 192 / 285 org.apache.spark.sql.execution.datasources.FindDataSourceTable 348885539 / 371855866 233 / 1982 org.apache.spark.sql.catalyst.optimizer.NullPropagation 11307645 / 307531225 26 / 1592 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions 120324545 / 304948785 294 / 1982 org.apache.spark.sql.catalyst.analysis.TypeCoercion$FunctionArgumentConversion 92323199 / 286695007 38 / 1982 org.apache.spark.sql.catalyst.optimizer.PushDownPredicate 230084193 / 265845972 785 / 1592 org.apache.spark.sql.catalyst.analysis.TypeCoercion$PromoteStrings 45938401 / 265144009 40 / 1982 
org.apache.spark.sql.catalyst.analysis.TypeCoercion$InConversion 14888776 / 261499450 1 / 1982 org.apache.spark.sql.catalyst.analysis.TypeCoercion$CaseWhenCoercion 113796384 / 244913861 29 / 1982 org.apache.spark.sql.catalyst.optimizer.ConstantFolding 65008069 / 236548480 126 / 1592 org.apache.spark.sql.catalyst.analysis.Analyzer$ExtractGenerator 0 / 226338929 0 / 1982 org.apache.spark.sql.catalyst.analysis.ResolveTimeZone 98134906 / 221323770 417 / 1982 org.apache.spark.sql.catalyst.optimizer.ReorderAssociativeOperator 0 / 208421703 0 / 1592 org.apache.spark.sql.catalyst.optimizer.OptimizeIn 8762534 / 199351958 16 / 1592 org.apache.spark.sql.catalyst.analysis.TypeCoercion$DateTimeOperations 11980016 / 190779046 27 / 1982 org.apache.spark.sql.catalyst.optimizer.SimplifyBinaryComparison 0 / 188887385 0 / 1592 org.apache.spark.sql.catalyst.optimizer.SimplifyConditionals 0 / 186812106 0 / 1592 org.apache.spark.sql.catalyst.optimizer.SimplifyCaseConversionExpressions 0 / 183885230 0 / 1592 org.apache.spark.sql.catalyst.optimizer.SimplifyCasts 17128295 / 182901910 69 / 1592 org.apache.spark.sql.catalyst.analysis.TypeCoercion$Division 14579110 / 180309340 8 / 1982 org.apache.spark.sql.catalyst.analysis.TypeCoercion$BooleanEquality 0 / 176740516 0 / 1982 org.apache.spark.sql.catalyst.analysis.TypeCoercion$IfCoercion 0 / 170781986 0 / 1982 org.apache.spark.sql.catalyst.optimizer.LikeSimplification 771605 / 164136736 1 / 1592 org.apache.spark.sql.catalyst.optimizer.RemoveDispensableExpressions 0 / 155958962 0 / 1592 org.apache.spark.sql.catalyst.analysis.ResolveCreateNamedStruct 0 / 151222943 0 / 1982 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowOrder 7534632 / 146596355 14 / 1982 org.apache.spark.sql.catalyst.analysis.TypeCoercion$EltCoercion 0 / 144488654 0 / 1982 org.apache.spark.sql.catalyst.analysis.TypeCoercion$ConcatCoercion 0 / 142403338 0 / 1982 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame 12067635 / 141500665 21 / 1982 org.apache.spark.sql.catalyst.analysis.TimeWindowing 0 / 140431958 0 / 1982 org.apache.spark.sql.catalyst.analysis.TypeCoercion$WindowFrameCoercion 0 / 125471960 0 / 1982 org.apache.spark.sql.catalyst.optimizer.EliminateOuterJoin 14226972 / 124922019 11 / 1592 org.apache.spark.sql.catalyst.analysis.TypeCoercion$StackCoercion 0 / 123613887 0 / 1982 org.apache.spark.sql.catalyst.optimizer.RewriteCorrelatedScalarSubquery 8491071 / 121179056 7 / 1592 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveGroupingAnalytics 55526073 / 120290529 11 / 1982 org.apache.spark.sql.catalyst.optimizer.ConstantPropagation 0 / 113886790 0 / 1592 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveDeserializer 52383759 / 107160222 148 / 1982 org.apache.spark.sql.catalyst.analysis.CleanupAliases 52543524 / 102091518 344 / 1086 org.apache.spark.sql.catalyst.optimizer.RemoveRedundantProject 40682895 / 94403652 342 / 1877 org.apache.spark.sql.catalyst.analysis.Analyzer$ExtractWindowExpressions 38473816 / 89740578 23 / 1982 org.apache.spark.sql.catalyst.optimizer.CollapseProject 46806090 / 83315506 281 / 1877 org.apache.spark.sql.catalyst.optimizer.FoldablePropagation 0 / 78750087 0 / 1592 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAliases 13742765 / 77227258 47 / 1982 org.apache.spark.sql.catalyst.optimizer.CombineFilters 53386729 / 76960344 448 / 1592 org.apache.spark.sql.execution.datasources.DataSourceAnalysis 68034341 / 75724186 24 / 742 org.apache.spark.sql.catalyst.analysis.Analyzer$LookupFunctions 0 / 71151084 0 / 750 
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveMissingReferences 12139848 / 67599140 8 / 1982 org.apache.spark.sql.catalyst.optimizer.PullupCorrelatedPredicates 45017938 / 65968777 23 / 285 org.apache.spark.sql.execution.datasources.v2.PushDownOperatorsToDataSource 0 / 60937767 0 / 285 org.apache.spark.sql.catalyst.optimizer.CollapseRepartition 0 / 59897237 0 / 1592 org.apache.spark.sql.catalyst.optimizer.PushProjectionThroughUnion 8547262 / 53941370 10 / 1592 org.apache.spark.sql.catalyst.analysis.Analyzer$HandleNullInputsForUDF 0 / 52735976 0 / 742 org.apache.spark.sql.catalyst.analysis.TypeCoercion$WidenSetOperationTypes 9797713 / 52401665 9 / 1982 org.apache.spark.sql.catalyst.analysis.Analyzer$PullOutNondeterministic 0 / 51741500 0 / 742 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations 28614911 / 51061186 233 / 1990 org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions 0 / 50621510 0 / 285 org.apache.spark.sql.catalyst.optimizer.CombineUnions 2777800 / 50262112 17 / 1877 org.apache.spark.sql.catalyst.analysis.Analyzer$GlobalAggregates 1640641 / 49633909 46 / 1982 org.apache.spark.sql.catalyst.optimizer.DecimalAggregates 20198374 / 48488419 100 / 385 org.apache.spark.sql.catalyst.optimizer.LimitPushDown 0 / 45052523 0 / 1592 org.apache.spark.sql.catalyst.optimizer.CombineLimits 0 / 44719443 0 / 1592 org.apache.spark.sql.catalyst.optimizer.EliminateSorts 0 / 44216930 0 / 1592 org.apache.spark.sql.catalyst.optimizer.RewritePredicateSubquery 36235699 / 44165786 148 / 285 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveNewInstance 0 / 42750307 0 / 1982 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveUpCast 0 / 41811748 0 / 1982 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveOrdinalInOrderByAndGroupBy 3819476 / 41776562 4 / 1982 org.apache.spark.sql.catalyst.optimizer.ComputeCurrentTime 0 / 40527808 0 / 285 org.apache.spark.sql.catalyst.optimizer.CollapseWindow 0 / 36832538 0 / 1592 org.apache.spark.sql.catalyst.optimizer.EliminateSerialization 0 / 36120667 0 / 1592 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAggAliasInGroupBy 0 / 32435826 0 / 1982 org.apache.spark.sql.execution.datasources.PreprocessTableCreation 0 / 32145218 0 / 742 org.apache.spark.sql.execution.datasources.ResolveSQLOnFile 0 / 30295614 0 / 1982 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolvePivot 0 / 30111655 0 / 1982 org.apache.spark.sql.catalyst.expressions.codegen.package$ExpressionCanonicalizer$CleanExpressions 59930 / 28038201 26 / 8280 org.apache.spark.sql.catalyst.analysis.ResolveInlineTables 0 / 27808108 0 / 1982 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveSubqueryColumnAliases 0 / 27066690 0 / 1982 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveGenerate 0 / 26660210 0 / 1982 org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveNaturalAndUsingJoin 0 / 25255184 0 / 1982 org.apache.spark.sql.catalyst.analysis.ResolveTableValuedFunctions 0 / 24663088 0 / 1990 org.apache.spark.sql.catalyst.analysis.SubstituteUnresolvedOrdinals 9709079 / 24450670 4 / 788 org.apache.spark.sql.catalyst.analysis.ResolveHints$ResolveBroadcastHints 0 / 23776535 0 / 750 org.apache.spark.sql.catalyst.optimizer.ReplaceExpressions 0 / 22697895 0 / 285 org.apache.spark.sql.catalyst.optimizer.CheckCartesianProducts 0 / 22523798 0 / 285 org.apache.spark.sql.catalyst.optimizer.ReplaceDistinctWithAggregate 988593 / 21535410 15 / 300 org.apache.spark.sql.catalyst.optimizer.EliminateMapObjects 0 / 20269996 0 / 285 
org.apache.spark.sql.catalyst.optimizer.RewriteDistinctAggregates 0 / 19388592 0 / 285 org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases 17675532 / 18971185 215 / 285 org.apache.spark.sql.catalyst.optimizer.GetCurrentDatabase 0 / 18271152 0 / 285 org.apache.spark.sql.catalyst.optimizer.PropagateEmptyRelation 2077097 / 17190855 3 / 288 org.apache.spark.sql.catalyst.analysis.EliminateBarriers 0 / 16736359 0 / 1086 org.apache.spark.sql.execution.OptimizeMetadataOnlyQuery 0 / 16669341 0 / 285 org.apache.spark.sql.catalyst.analysis.UpdateOuterReferences 0 / 14470235 0 / 742 org.apache.spark.sql.catalyst.optimizer.ReplaceExceptWithAntiJoin 6715625 / 12190561 1 / 300 org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin 3451793 / 11431432 7 / 300 org.apache.spark.sql.execution.python.ExtractPythonUDFFromAggregate 0 / 10810568 0 / 285 org.apache.spark.sql.catalyst.optimizer.RemoveRepetitionFromGroupExpressions 344198 / 10475276 1 / 286 org.apache.spark.sql.catalyst.analysis.Analyzer$WindowsSubstitution 0 / 10386630 0 / 788 org.apache.spark.sql.catalyst.analysis.EliminateUnions 0 / 10096526 0 / 788 org.apache.spark.sql.catalyst.analysis.AliasViewChild 0 / 9991706 0 / 742 org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation 0 / 9649334 0 / 288 org.apache.spark.sql.catalyst.analysis.ResolveHints$RemoveAllHints 0 / 8739109 0 / 750 org.apache.spark.sql.execution.datasources.PreprocessTableInsertion 0 / 8420889 0 / 742 org.apache.spark.sql.catalyst.analysis.EliminateView 0 / 8319134 0 / 285 org.apache.spark.sql.catalyst.optimizer.RemoveLiteralFromGroupExpressions 0 / 7392627 0 / 286 org.apache.spark.sql.catalyst.optimizer.ReplaceExceptWithFilter 0 / 7170516 0 / 300 org.apache.spark.sql.catalyst.optimizer.SimplifyCreateArrayOps 0 / 7109643 0 / 1592 org.apache.spark.sql.catalyst.optimizer.SimplifyCreateStructOps 0 / 6837590 0 / 1592 org.apache.spark.sql.catalyst.optimizer.SimplifyCreateMapOps 0 / 6617848 0 / 1592 org.apache.spark.sql.catalyst.optimizer.CombineConcats 0 / 5768406 0 / 1592 org.apache.spark.sql.catalyst.optimizer.ReplaceDeduplicateWithAggregate 0 / 5349831 0 / 285 org.apache.spark.sql.catalyst.optimizer.CombineTypedFilters 0 / 5186642 0 / 285 org.apache.spark.sql.catalyst.optimizer.EliminateDistinct 0 / 2427686 0 / 285 org.apache.spark.sql.catalyst.optimizer.CostBasedJoinReorder 0 / 2420436 0 / 285 ``` Author: gatorsmile Closes #20342 from gatorsmile/reportExecution. (cherry picked from commit 78801881c405de47f7e53eea3e0420dd69593dbd) Signed-off-by: gatorsmile --- .../rules/QueryExecutionMetering.scala | 91 +++++++++++++++++++ .../sql/catalyst/rules/RuleExecutor.scala | 32 +++---- .../apache/spark/sql/BenchmarkQueryTest.scala | 2 +- .../apache/spark/sql/SQLQueryTestSuite.scala | 2 +- .../execution/HiveCompatibilitySuite.scala | 2 +- 5 files changed, 109 insertions(+), 20 deletions(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/QueryExecutionMetering.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/QueryExecutionMetering.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/QueryExecutionMetering.scala new file mode 100644 index 0000000000000..62f7541150a6e --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/QueryExecutionMetering.scala @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.rules + +import scala.collection.JavaConverters._ + +import com.google.common.util.concurrent.AtomicLongMap + +case class QueryExecutionMetering() { + private val timeMap = AtomicLongMap.create[String]() + private val numRunsMap = AtomicLongMap.create[String]() + private val numEffectiveRunsMap = AtomicLongMap.create[String]() + private val timeEffectiveRunsMap = AtomicLongMap.create[String]() + + /** Resets statistics about time spent running specific rules */ + def resetMetrics(): Unit = { + timeMap.clear() + numRunsMap.clear() + numEffectiveRunsMap.clear() + timeEffectiveRunsMap.clear() + } + + def totalTime: Long = { + timeMap.sum() + } + + def totalNumRuns: Long = { + numRunsMap.sum() + } + + def incExecutionTimeBy(ruleName: String, delta: Long): Unit = { + timeMap.addAndGet(ruleName, delta) + } + + def incTimeEffectiveExecutionBy(ruleName: String, delta: Long): Unit = { + timeEffectiveRunsMap.addAndGet(ruleName, delta) + } + + def incNumEffectiveExecution(ruleName: String): Unit = { + numEffectiveRunsMap.incrementAndGet(ruleName) + } + + def incNumExecution(ruleName: String): Unit = { + numRunsMap.incrementAndGet(ruleName) + } + + /** Dump statistics about time spent running specific rules. 
*/ + def dumpTimeSpent(): String = { + val map = timeMap.asMap().asScala + val maxLengthRuleNames = map.keys.map(_.toString.length).max + + val colRuleName = "Rule".padTo(maxLengthRuleNames, " ").mkString + val colRunTime = "Effective Time / Total Time".padTo(len = 47, " ").mkString + val colNumRuns = "Effective Runs / Total Runs".padTo(len = 47, " ").mkString + + val ruleMetrics = map.toSeq.sortBy(_._2).reverseMap { case (name, time) => + val timeEffectiveRun = timeEffectiveRunsMap.get(name) + val numRuns = numRunsMap.get(name) + val numEffectiveRun = numEffectiveRunsMap.get(name) + + val ruleName = name.padTo(maxLengthRuleNames, " ").mkString + val runtimeValue = s"$timeEffectiveRun / $time".padTo(len = 47, " ").mkString + val numRunValue = s"$numEffectiveRun / $numRuns".padTo(len = 47, " ").mkString + s"$ruleName $runtimeValue $numRunValue" + }.mkString("\n", "\n", "") + + s""" + |=== Metrics of Analyzer/Optimizer Rules === + |Total number of runs: $totalNumRuns + |Total time: ${totalTime / 1000000000D} seconds + | + |$colRuleName $colRunTime $colNumRuns + |$ruleMetrics + """.stripMargin + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala index 7e4b784033bfc..dccb44ddebfa4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala @@ -17,10 +17,6 @@ package org.apache.spark.sql.catalyst.rules -import scala.collection.JavaConverters._ - -import com.google.common.util.concurrent.AtomicLongMap - import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.trees.TreeNode @@ -28,18 +24,16 @@ import org.apache.spark.sql.catalyst.util.sideBySide import org.apache.spark.util.Utils object RuleExecutor { - protected val timeMap = AtomicLongMap.create[String]() - - /** Resets statistics about time spent running specific rules */ - def resetTime(): Unit = timeMap.clear() + protected val queryExecutionMeter = QueryExecutionMetering() /** Dump statistics about time spent running specific rules. 
*/ def dumpTimeSpent(): String = { - val map = timeMap.asMap().asScala - val maxSize = map.keys.map(_.toString.length).max - map.toSeq.sortBy(_._2).reverseMap { case (k, v) => - s"${k.padTo(maxSize, " ").mkString} $v" - }.mkString("\n", "\n", "") + queryExecutionMeter.dumpTimeSpent() + } + + /** Resets statistics about time spent running specific rules */ + def resetMetrics(): Unit = { + queryExecutionMeter.resetMetrics() } } @@ -77,6 +71,7 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { */ def execute(plan: TreeType): TreeType = { var curPlan = plan + val queryExecutionMetrics = RuleExecutor.queryExecutionMeter batches.foreach { batch => val batchStartPlan = curPlan @@ -91,15 +86,18 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { val startTime = System.nanoTime() val result = rule(plan) val runTime = System.nanoTime() - startTime - RuleExecutor.timeMap.addAndGet(rule.ruleName, runTime) if (!result.fastEquals(plan)) { + queryExecutionMetrics.incNumEffectiveExecution(rule.ruleName) + queryExecutionMetrics.incTimeEffectiveExecutionBy(rule.ruleName, runTime) logTrace( s""" |=== Applying Rule ${rule.ruleName} === |${sideBySide(plan.treeString, result.treeString).mkString("\n")} """.stripMargin) } + queryExecutionMetrics.incExecutionTimeBy(rule.ruleName, runTime) + queryExecutionMetrics.incNumExecution(rule.ruleName) // Run the structural integrity checker against the plan after each rule. if (!isPlanIntegral(result)) { @@ -135,9 +133,9 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { if (!batchStartPlan.fastEquals(curPlan)) { logDebug( s""" - |=== Result of Batch ${batch.name} === - |${sideBySide(batchStartPlan.treeString, curPlan.treeString).mkString("\n")} - """.stripMargin) + |=== Result of Batch ${batch.name} === + |${sideBySide(batchStartPlan.treeString, curPlan.treeString).mkString("\n")} + """.stripMargin) } else { logTrace(s"Batch ${batch.name} has no effect.") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/BenchmarkQueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/BenchmarkQueryTest.scala index 7037749f14478..e51aad021fcbf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/BenchmarkQueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/BenchmarkQueryTest.scala @@ -46,7 +46,7 @@ abstract class BenchmarkQueryTest extends QueryTest with SharedSQLContext with B override def beforeAll() { super.beforeAll() - RuleExecutor.resetTime() + RuleExecutor.resetMetrics() } protected def checkGeneratedCode(plan: SparkPlan): Unit = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index e3901af4b9988..054ada56d99ad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -291,7 +291,7 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext { TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) // Add Locale setting Locale.setDefault(Locale.US) - RuleExecutor.resetTime() + RuleExecutor.resetMetrics() } override def afterAll(): Unit = { diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index 45791c69b4cb7..cebaad5b4ad9b 100644 --- 
a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -62,7 +62,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { // Fix session local timezone to America/Los_Angeles for those timezone sensitive tests // (timestamp_*) TestHive.setConf(SQLConf.SESSION_LOCAL_TIMEZONE, "America/Los_Angeles") - RuleExecutor.resetTime() + RuleExecutor.resetMetrics() } override def afterAll() { From 1069fad41fb6896fef4245e6ae6b5ba36115ad68 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Mon, 22 Jan 2018 04:32:59 -0800 Subject: [PATCH 158/566] [MINOR][SQL][TEST] Test case cleanups for recent PRs ## What changes were proposed in this pull request? Revert the unneeded test case changes we made in SPARK-23000 Also fixes the test suites that do not call `super.afterAll()` in the local `afterAll`. The `afterAll()` of `TestHiveSingleton` actually reset the environments. ## How was this patch tested? N/A Author: gatorsmile Closes #20341 from gatorsmile/testRelated. (cherry picked from commit 896e45af5fea264683b1d7d20a1711f33908a06f) Signed-off-by: gatorsmile --- .../apache/spark/sql/DataFrameJoinSuite.scala | 21 ++++++----- .../apache/spark/sql/hive/test/TestHive.scala | 3 +- .../sql/hive/HiveMetastoreCatalogSuite.scala | 26 +++++++------- .../sql/hive/execution/HiveUDAFSuite.scala | 8 +++-- .../hive/execution/Hive_2_1_DDLSuite.scala | 6 +++- .../execution/ObjectHashAggregateSuite.scala | 6 +++- .../apache/spark/sql/hive/parquetSuites.scala | 35 +++++++++++-------- 7 files changed, 60 insertions(+), 45 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala index 1656f290ee19c..0d9eeabb397a1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.catalyst.plans.{Inner, LeftOuter, RightOuter} import org.apache.spark.sql.catalyst.plans.logical.Join import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec import org.apache.spark.sql.functions._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext class DataFrameJoinSuite extends QueryTest with SharedSQLContext { @@ -276,16 +277,14 @@ class DataFrameJoinSuite extends QueryTest with SharedSQLContext { test("SPARK-23087: don't throw Analysis Exception in CheckCartesianProduct when join condition " + "is false or null") { - val df = spark.range(10) - val dfNull = spark.range(10).select(lit(null).as("b")) - val planNull = df.join(dfNull, $"id" === $"b", "left").queryExecution.analyzed - - spark.sessionState.executePlan(planNull).optimizedPlan - - val dfOne = df.select(lit(1).as("a")) - val dfTwo = spark.range(10).select(lit(2).as("b")) - val planFalse = dfOne.join(dfTwo, $"a" === $"b", "left").queryExecution.analyzed - - spark.sessionState.executePlan(planFalse).optimizedPlan + withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "false") { + val df = spark.range(10) + val dfNull = spark.range(10).select(lit(null).as("b")) + df.join(dfNull, $"id" === $"b", "left").queryExecution.optimizedPlan + + val dfOne = df.select(lit(1).as("a")) + val dfTwo = spark.range(10).select(lit(2).as("b")) + dfOne.join(dfTwo, $"a" === $"b", "left").queryExecution.optimizedPlan + } } } diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index c84131fc3212a..7287e20d55bbe 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -492,8 +492,7 @@ private[hive] class TestHiveSparkSession( protected val originalUDFs: JavaSet[String] = FunctionRegistry.getFunctionNames /** - * Resets the test instance by deleting any tables that have been created. - * TODO: also clear out UDFs, views, etc. + * Resets the test instance by deleting any table, view, temp view, and UDF that have been created */ def reset() { try { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala index 83b4c862e2546..ba9b944e4a055 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala @@ -166,13 +166,13 @@ class DataSourceWithHiveMetastoreCatalogSuite )) ).foreach { case (provider, (inputFormat, outputFormat, serde)) => test(s"Persist non-partitioned $provider relation into metastore as managed table") { - withTable("default.t") { + withTable("t") { withSQLConf(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> "true") { testDF .write .mode(SaveMode.Overwrite) .format(provider) - .saveAsTable("default.t") + .saveAsTable("t") } val hiveTable = sessionState.catalog.getTableMetadata(TableIdentifier("t", Some("default"))) @@ -187,15 +187,14 @@ class DataSourceWithHiveMetastoreCatalogSuite assert(columns.map(_.name) === Seq("d1", "d2")) assert(columns.map(_.dataType) === Seq(DecimalType(10, 3), StringType)) - checkAnswer(table("default.t"), testDF) - assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM default.t") === - Seq("1.1\t1", "2.1\t2")) + checkAnswer(table("t"), testDF) + assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM t") === Seq("1.1\t1", "2.1\t2")) } } test(s"Persist non-partitioned $provider relation into metastore as external table") { withTempPath { dir => - withTable("default.t") { + withTable("t") { val path = dir.getCanonicalFile withSQLConf(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> "true") { @@ -204,7 +203,7 @@ class DataSourceWithHiveMetastoreCatalogSuite .mode(SaveMode.Overwrite) .format(provider) .option("path", path.toString) - .saveAsTable("default.t") + .saveAsTable("t") } val hiveTable = @@ -220,8 +219,8 @@ class DataSourceWithHiveMetastoreCatalogSuite assert(columns.map(_.name) === Seq("d1", "d2")) assert(columns.map(_.dataType) === Seq(DecimalType(10, 3), StringType)) - checkAnswer(table("default.t"), testDF) - assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM default.t") === + checkAnswer(table("t"), testDF) + assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM t") === Seq("1.1\t1", "2.1\t2")) } } @@ -229,9 +228,9 @@ class DataSourceWithHiveMetastoreCatalogSuite test(s"Persist non-partitioned $provider relation into metastore as managed table using CTAS") { withTempPath { dir => - withTable("default.t") { + withTable("t") { sql( - s"""CREATE TABLE default.t USING $provider + s"""CREATE TABLE t USING $provider |OPTIONS (path '${dir.toURI}') |AS SELECT 1 AS d1, "val_1" AS d2 """.stripMargin) @@ -249,9 +248,8 @@ class DataSourceWithHiveMetastoreCatalogSuite assert(columns.map(_.name) === Seq("d1", "d2")) 
assert(columns.map(_.dataType) === Seq(IntegerType, StringType)) - checkAnswer(table("default.t"), Row(1, "val_1")) - assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM default.t") === - Seq("1\tval_1")) + checkAnswer(table("t"), Row(1, "val_1")) + assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM t") === Seq("1\tval_1")) } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDAFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDAFSuite.scala index 8986fb58c6460..7402c9626873c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDAFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDAFSuite.scala @@ -49,8 +49,12 @@ class HiveUDAFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { } protected override def afterAll(): Unit = { - sql(s"DROP TEMPORARY FUNCTION IF EXISTS mock") - sql(s"DROP TEMPORARY FUNCTION IF EXISTS hive_max") + try { + sql(s"DROP TEMPORARY FUNCTION IF EXISTS mock") + sql(s"DROP TEMPORARY FUNCTION IF EXISTS hive_max") + } finally { + super.afterAll() + } } test("built-in Hive UDAF") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/Hive_2_1_DDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/Hive_2_1_DDLSuite.scala index bc828877e35ec..eaedac1fa95d8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/Hive_2_1_DDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/Hive_2_1_DDLSuite.scala @@ -74,7 +74,11 @@ class Hive_2_1_DDLSuite extends SparkFunSuite with TestHiveSingleton with Before } override def afterAll(): Unit = { - catalog = null + try { + catalog = null + } finally { + super.afterAll() + } } test("SPARK-21617: ALTER TABLE for non-compatible DataSource tables") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala index 9eaf44c043c71..8dbcd24cd78de 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala @@ -47,7 +47,11 @@ class ObjectHashAggregateSuite } protected override def afterAll(): Unit = { - sql(s"DROP TEMPORARY FUNCTION IF EXISTS hive_max") + try { + sql(s"DROP TEMPORARY FUNCTION IF EXISTS hive_max") + } finally { + super.afterAll() + } } test("typed_count without grouping keys") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala index 740e0837350cc..2327d83a1b4f6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala @@ -180,15 +180,18 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { } override def afterAll(): Unit = { - dropTables("partitioned_parquet", - "partitioned_parquet_with_key", - "partitioned_parquet_with_complextypes", - "partitioned_parquet_with_key_and_complextypes", - "normal_parquet", - "jt", - "jt_array", - "test_parquet") - super.afterAll() + try { + dropTables("partitioned_parquet", + "partitioned_parquet_with_key", + "partitioned_parquet_with_complextypes", + "partitioned_parquet_with_key_and_complextypes", + "normal_parquet", + "jt", + "jt_array", + "test_parquet") + } finally { + 
super.afterAll() + } } test(s"conversion is working") { @@ -931,11 +934,15 @@ abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with } override protected def afterAll(): Unit = { - partitionedTableDir.delete() - normalTableDir.delete() - partitionedTableDirWithKey.delete() - partitionedTableDirWithComplexTypes.delete() - partitionedTableDirWithKeyAndComplexTypes.delete() + try { + partitionedTableDir.delete() + normalTableDir.delete() + partitionedTableDirWithKey.delete() + partitionedTableDirWithComplexTypes.delete() + partitionedTableDirWithKeyAndComplexTypes.delete() + } finally { + super.afterAll() + } } /** From d963ba031748711ec7847ad0b702911eb7319c63 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Mon, 22 Jan 2018 20:56:38 +0800 Subject: [PATCH 159/566] [SPARK-23090][SQL] polish ColumnVector ## What changes were proposed in this pull request? Several improvements: * provide a default implementation for the batch get methods * rename `getChildColumn` to `getChild`, which is more concise * remove `getStruct(int, int)`, it's only used to simplify the codegen, which is an internal thing, we should not add a public API for this purpose. ## How was this patch tested? existing tests Author: Wenchen Fan Closes #20277 from cloud-fan/column-vector. (cherry picked from commit 5d680cae486c77cdb12dbe9e043710e49e8d51e4) Signed-off-by: Wenchen Fan --- .../expressions/codegen/CodeGenerator.scala | 18 ++-- .../datasources/orc/OrcColumnVector.java | 65 +----------- .../orc/OrcColumnarBatchReader.java | 23 ++--- .../vectorized/ColumnVectorUtils.java | 10 +- .../vectorized/MutableColumnarRow.java | 4 +- .../vectorized/WritableColumnVector.java | 10 +- .../sql/vectorized/ArrowColumnVector.java | 99 +------------------ .../spark/sql/vectorized/ColumnVector.java | 79 +++++++++++---- .../spark/sql/vectorized/ColumnarArray.java | 4 +- .../spark/sql/vectorized/ColumnarRow.java | 46 ++++----- .../sql/execution/ColumnarBatchScan.scala | 2 +- .../VectorizedHashMapGenerator.scala | 4 +- .../execution/arrow/ArrowWriterSuite.scala | 14 +-- .../vectorized/ArrowColumnVectorSuite.scala | 12 +-- .../vectorized/ColumnVectorSuite.scala | 12 +-- .../vectorized/ColumnarBatchBenchmark.scala | 38 +++---- .../vectorized/ColumnarBatchSuite.scala | 20 ++-- 17 files changed, 164 insertions(+), 296 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 2c714c228e6c9..f96ed7628fda1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -688,17 +688,13 @@ class CodegenContext { /** * Returns the specialized code to access a value from a column vector for a given `DataType`. 
*/ - def getValue(vector: String, rowId: String, dataType: DataType): String = { - val jt = javaType(dataType) - dataType match { - case _ if isPrimitiveType(jt) => - s"$vector.get${primitiveTypeName(jt)}($rowId)" - case t: DecimalType => - s"$vector.getDecimal($rowId, ${t.precision}, ${t.scale})" - case StringType => - s"$vector.getUTF8String($rowId)" - case _ => - throw new IllegalArgumentException(s"cannot generate code for unsupported type: $dataType") + def getValueFromVector(vector: String, dataType: DataType, rowId: String): String = { + if (dataType.isInstanceOf[StructType]) { + // `ColumnVector.getStruct` is different from `InternalRow.getStruct`, it only takes an + // `ordinal` parameter. + s"$vector.getStruct($rowId)" + } else { + getValue(vector, dataType, rowId) } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java index b6e792274da11..aaf2a380034a9 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java @@ -110,57 +110,21 @@ public boolean getBoolean(int rowId) { return longData.vector[getRowIndex(rowId)] == 1; } - @Override - public boolean[] getBooleans(int rowId, int count) { - boolean[] res = new boolean[count]; - for (int i = 0; i < count; i++) { - res[i] = getBoolean(rowId + i); - } - return res; - } - @Override public byte getByte(int rowId) { return (byte) longData.vector[getRowIndex(rowId)]; } - @Override - public byte[] getBytes(int rowId, int count) { - byte[] res = new byte[count]; - for (int i = 0; i < count; i++) { - res[i] = getByte(rowId + i); - } - return res; - } - @Override public short getShort(int rowId) { return (short) longData.vector[getRowIndex(rowId)]; } - @Override - public short[] getShorts(int rowId, int count) { - short[] res = new short[count]; - for (int i = 0; i < count; i++) { - res[i] = getShort(rowId + i); - } - return res; - } - @Override public int getInt(int rowId) { return (int) longData.vector[getRowIndex(rowId)]; } - @Override - public int[] getInts(int rowId, int count) { - int[] res = new int[count]; - for (int i = 0; i < count; i++) { - res[i] = getInt(rowId + i); - } - return res; - } - @Override public long getLong(int rowId) { int index = getRowIndex(rowId); @@ -171,43 +135,16 @@ public long getLong(int rowId) { } } - @Override - public long[] getLongs(int rowId, int count) { - long[] res = new long[count]; - for (int i = 0; i < count; i++) { - res[i] = getLong(rowId + i); - } - return res; - } - @Override public float getFloat(int rowId) { return (float) doubleData.vector[getRowIndex(rowId)]; } - @Override - public float[] getFloats(int rowId, int count) { - float[] res = new float[count]; - for (int i = 0; i < count; i++) { - res[i] = getFloat(rowId + i); - } - return res; - } - @Override public double getDouble(int rowId) { return doubleData.vector[getRowIndex(rowId)]; } - @Override - public double[] getDoubles(int rowId, int count) { - double[] res = new double[count]; - for (int i = 0; i < count; i++) { - res[i] = getDouble(rowId + i); - } - return res; - } - @Override public int getArrayLength(int rowId) { throw new UnsupportedOperationException(); @@ -245,7 +182,7 @@ public org.apache.spark.sql.vectorized.ColumnVector arrayData() { } @Override - public org.apache.spark.sql.vectorized.ColumnVector getChildColumn(int ordinal) { + public 
org.apache.spark.sql.vectorized.ColumnVector getChild(int ordinal) { throw new UnsupportedOperationException(); } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java index 89bae4326e93b..5e7cad470e1d1 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java @@ -289,10 +289,9 @@ private void putRepeatingValues( toColumn.putDoubles(0, batchSize, ((DoubleColumnVector)fromColumn).vector[0]); } else if (type instanceof StringType || type instanceof BinaryType) { BytesColumnVector data = (BytesColumnVector)fromColumn; - WritableColumnVector arrayData = toColumn.getChildColumn(0); int size = data.vector[0].length; - arrayData.reserve(size); - arrayData.putBytes(0, size, data.vector[0], 0); + toColumn.arrayData().reserve(size); + toColumn.arrayData().putBytes(0, size, data.vector[0], 0); for (int index = 0; index < batchSize; index++) { toColumn.putArray(index, 0, size); } @@ -352,7 +351,7 @@ private void putNonNullValues( toColumn.putDoubles(0, batchSize, ((DoubleColumnVector)fromColumn).vector, 0); } else if (type instanceof StringType || type instanceof BinaryType) { BytesColumnVector data = ((BytesColumnVector)fromColumn); - WritableColumnVector arrayData = toColumn.getChildColumn(0); + WritableColumnVector arrayData = toColumn.arrayData(); int totalNumBytes = IntStream.of(data.length).sum(); arrayData.reserve(totalNumBytes); for (int index = 0, pos = 0; index < batchSize; pos += data.length[index], index++) { @@ -363,8 +362,7 @@ private void putNonNullValues( DecimalType decimalType = (DecimalType)type; DecimalColumnVector data = ((DecimalColumnVector)fromColumn); if (decimalType.precision() > Decimal.MAX_LONG_DIGITS()) { - WritableColumnVector arrayData = toColumn.getChildColumn(0); - arrayData.reserve(batchSize * 16); + toColumn.arrayData().reserve(batchSize * 16); } for (int index = 0; index < batchSize; index++) { putDecimalWritable( @@ -459,7 +457,7 @@ private void putValues( } } else if (type instanceof StringType || type instanceof BinaryType) { BytesColumnVector vector = (BytesColumnVector)fromColumn; - WritableColumnVector arrayData = toColumn.getChildColumn(0); + WritableColumnVector arrayData = toColumn.arrayData(); int totalNumBytes = IntStream.of(vector.length).sum(); arrayData.reserve(totalNumBytes); for (int index = 0, pos = 0; index < batchSize; pos += vector.length[index], index++) { @@ -474,8 +472,7 @@ private void putValues( DecimalType decimalType = (DecimalType)type; HiveDecimalWritable[] vector = ((DecimalColumnVector)fromColumn).vector; if (decimalType.precision() > Decimal.MAX_LONG_DIGITS()) { - WritableColumnVector arrayData = toColumn.getChildColumn(0); - arrayData.reserve(batchSize * 16); + toColumn.arrayData().reserve(batchSize * 16); } for (int index = 0; index < batchSize; index++) { if (fromColumn.isNull[index]) { @@ -521,8 +518,7 @@ private static void putDecimalWritable( toColumn.putLong(index, value.toUnscaledLong()); } else { byte[] bytes = value.toJavaBigDecimal().unscaledValue().toByteArray(); - WritableColumnVector arrayData = toColumn.getChildColumn(0); - arrayData.putBytes(index * 16, bytes.length, bytes, 0); + toColumn.arrayData().putBytes(index * 16, bytes.length, bytes, 0); toColumn.putArray(index, index * 16, bytes.length); } } @@ 
-547,9 +543,8 @@ private static void putDecimalWritables( toColumn.putLongs(0, size, value.toUnscaledLong()); } else { byte[] bytes = value.toJavaBigDecimal().unscaledValue().toByteArray(); - WritableColumnVector arrayData = toColumn.getChildColumn(0); - arrayData.reserve(bytes.length); - arrayData.putBytes(0, bytes.length, bytes, 0); + toColumn.arrayData().reserve(bytes.length); + toColumn.arrayData().putBytes(0, bytes.length, bytes, 0); for (int index = 0; index < size; index++) { toColumn.putArray(index, 0, bytes.length); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java index 5ee8cc8da2309..a2853bbadc92b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java @@ -85,8 +85,8 @@ public static void populate(WritableColumnVector col, InternalRow row, int field } } else if (t instanceof CalendarIntervalType) { CalendarInterval c = (CalendarInterval)row.get(fieldIdx, t); - col.getChildColumn(0).putInts(0, capacity, c.months); - col.getChildColumn(1).putLongs(0, capacity, c.microseconds); + col.getChild(0).putInts(0, capacity, c.months); + col.getChild(1).putLongs(0, capacity, c.microseconds); } else if (t instanceof DateType) { col.putInts(0, capacity, row.getInt(fieldIdx)); } else if (t instanceof TimestampType) { @@ -149,8 +149,8 @@ private static void appendValue(WritableColumnVector dst, DataType t, Object o) } else if (t instanceof CalendarIntervalType) { CalendarInterval c = (CalendarInterval)o; dst.appendStruct(false); - dst.getChildColumn(0).appendInt(c.months); - dst.getChildColumn(1).appendLong(c.microseconds); + dst.getChild(0).appendInt(c.months); + dst.getChild(1).appendLong(c.microseconds); } else if (t instanceof DateType) { dst.appendInt(DateTimeUtils.fromJavaDate((Date)o)); } else { @@ -179,7 +179,7 @@ private static void appendValue(WritableColumnVector dst, DataType t, Row src, i dst.appendStruct(false); Row c = src.getStruct(fieldIdx); for (int i = 0; i < st.fields().length; i++) { - appendValue(dst.getChildColumn(i), st.fields()[i].dataType(), c, i); + appendValue(dst.getChild(i), st.fields()[i].dataType(), c, i); } } } else { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java index 70057a9def6c0..2bab095d4d951 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java @@ -146,8 +146,8 @@ public byte[] getBinary(int ordinal) { @Override public CalendarInterval getInterval(int ordinal) { if (columns[ordinal].isNullAt(rowId)) return null; - final int months = columns[ordinal].getChildColumn(0).getInt(rowId); - final long microseconds = columns[ordinal].getChildColumn(1).getLong(rowId); + final int months = columns[ordinal].getChild(0).getInt(rowId); + final long microseconds = columns[ordinal].getChild(1).getLong(rowId); return new CalendarInterval(months, microseconds); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index d2ae32b06f83b..ca4f00985c2a3 100644 --- 
a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -599,17 +599,13 @@ public final int appendStruct(boolean isNull) { return elementsAppended; } - /** - * Returns the data for the underlying array. - */ + // `WritableColumnVector` puts the data of array in the first child column vector, and puts the + // array offsets and lengths in the current column vector. @Override public WritableColumnVector arrayData() { return childColumns[0]; } - /** - * Returns the ordinal's child data column. - */ @Override - public WritableColumnVector getChildColumn(int ordinal) { return childColumns[ordinal]; } + public WritableColumnVector getChild(int ordinal) { return childColumns[ordinal]; } /** * Returns the elements appended. diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java index bfd1b4cb0ef12..ca7a4751450d4 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java @@ -33,18 +33,6 @@ public final class ArrowColumnVector extends ColumnVector { private final ArrowVectorAccessor accessor; private ArrowColumnVector[] childColumns; - private void ensureAccessible(int index) { - ensureAccessible(index, 1); - } - - private void ensureAccessible(int index, int count) { - int valueCount = accessor.getValueCount(); - if (index < 0 || index + count > valueCount) { - throw new IndexOutOfBoundsException( - String.format("index range: [%d, %d), valueCount: %d", index, index + count, valueCount)); - } - } - @Override public int numNulls() { return accessor.getNullCount(); @@ -55,156 +43,75 @@ public void close() { if (childColumns != null) { for (int i = 0; i < childColumns.length; i++) { childColumns[i].close(); + childColumns[i] = null; } + childColumns = null; } accessor.close(); } @Override public boolean isNullAt(int rowId) { - ensureAccessible(rowId); return accessor.isNullAt(rowId); } @Override public boolean getBoolean(int rowId) { - ensureAccessible(rowId); return accessor.getBoolean(rowId); } - @Override - public boolean[] getBooleans(int rowId, int count) { - ensureAccessible(rowId, count); - boolean[] array = new boolean[count]; - for (int i = 0; i < count; ++i) { - array[i] = accessor.getBoolean(rowId + i); - } - return array; - } - @Override public byte getByte(int rowId) { - ensureAccessible(rowId); return accessor.getByte(rowId); } - @Override - public byte[] getBytes(int rowId, int count) { - ensureAccessible(rowId, count); - byte[] array = new byte[count]; - for (int i = 0; i < count; ++i) { - array[i] = accessor.getByte(rowId + i); - } - return array; - } - @Override public short getShort(int rowId) { - ensureAccessible(rowId); return accessor.getShort(rowId); } - @Override - public short[] getShorts(int rowId, int count) { - ensureAccessible(rowId, count); - short[] array = new short[count]; - for (int i = 0; i < count; ++i) { - array[i] = accessor.getShort(rowId + i); - } - return array; - } - @Override public int getInt(int rowId) { - ensureAccessible(rowId); return accessor.getInt(rowId); } - @Override - public int[] getInts(int rowId, int count) { - ensureAccessible(rowId, count); - int[] array = new int[count]; - for (int i = 0; i < count; ++i) { - array[i] = accessor.getInt(rowId + i); - } - return array; - } - 
@Override public long getLong(int rowId) { - ensureAccessible(rowId); return accessor.getLong(rowId); } - @Override - public long[] getLongs(int rowId, int count) { - ensureAccessible(rowId, count); - long[] array = new long[count]; - for (int i = 0; i < count; ++i) { - array[i] = accessor.getLong(rowId + i); - } - return array; - } - @Override public float getFloat(int rowId) { - ensureAccessible(rowId); return accessor.getFloat(rowId); } - @Override - public float[] getFloats(int rowId, int count) { - ensureAccessible(rowId, count); - float[] array = new float[count]; - for (int i = 0; i < count; ++i) { - array[i] = accessor.getFloat(rowId + i); - } - return array; - } - @Override public double getDouble(int rowId) { - ensureAccessible(rowId); return accessor.getDouble(rowId); } - @Override - public double[] getDoubles(int rowId, int count) { - ensureAccessible(rowId, count); - double[] array = new double[count]; - for (int i = 0; i < count; ++i) { - array[i] = accessor.getDouble(rowId + i); - } - return array; - } - @Override public int getArrayLength(int rowId) { - ensureAccessible(rowId); return accessor.getArrayLength(rowId); } @Override public int getArrayOffset(int rowId) { - ensureAccessible(rowId); return accessor.getArrayOffset(rowId); } @Override public Decimal getDecimal(int rowId, int precision, int scale) { - ensureAccessible(rowId); return accessor.getDecimal(rowId, precision, scale); } @Override public UTF8String getUTF8String(int rowId) { - ensureAccessible(rowId); return accessor.getUTF8String(rowId); } @Override public byte[] getBinary(int rowId) { - ensureAccessible(rowId); return accessor.getBinary(rowId); } @@ -212,7 +119,7 @@ public byte[] getBinary(int rowId) { public ArrowColumnVector arrayData() { return childColumns[0]; } @Override - public ArrowColumnVector getChildColumn(int ordinal) { return childColumns[ordinal]; } + public ArrowColumnVector getChild(int ordinal) { return childColumns[ordinal]; } public ArrowColumnVector(ValueVector vector) { super(ArrowUtils.fromArrowField(vector.getField())); diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java index d1196e1299fee..f9936214035b6 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java @@ -51,12 +51,16 @@ public abstract class ColumnVector implements AutoCloseable { public final DataType dataType() { return type; } /** - * Cleans up memory for this column. The column is not usable after this. + * Cleans up memory for this column vector. The column vector is not usable after this. + * + * This overwrites `AutoCloseable.close` to remove the `throws` clause, as column vector is + * in-memory and we don't expect any exception to happen during closing. */ + @Override public abstract void close(); /** - * Returns the number of nulls in this column. + * Returns the number of nulls in this column vector. */ public abstract int numNulls(); @@ -73,7 +77,13 @@ public abstract class ColumnVector implements AutoCloseable { /** * Gets values from [rowId, rowId + count) */ - public abstract boolean[] getBooleans(int rowId, int count); + public boolean[] getBooleans(int rowId, int count) { + boolean[] res = new boolean[count]; + for (int i = 0; i < count; i++) { + res[i] = getBoolean(rowId + i); + } + return res; + } /** * Returns the value for rowId. 
@@ -83,7 +93,13 @@ public abstract class ColumnVector implements AutoCloseable { /** * Gets values from [rowId, rowId + count) */ - public abstract byte[] getBytes(int rowId, int count); + public byte[] getBytes(int rowId, int count) { + byte[] res = new byte[count]; + for (int i = 0; i < count; i++) { + res[i] = getByte(rowId + i); + } + return res; + } /** * Returns the value for rowId. @@ -93,7 +109,13 @@ public abstract class ColumnVector implements AutoCloseable { /** * Gets values from [rowId, rowId + count) */ - public abstract short[] getShorts(int rowId, int count); + public short[] getShorts(int rowId, int count) { + short[] res = new short[count]; + for (int i = 0; i < count; i++) { + res[i] = getShort(rowId + i); + } + return res; + } /** * Returns the value for rowId. @@ -103,7 +125,13 @@ public abstract class ColumnVector implements AutoCloseable { /** * Gets values from [rowId, rowId + count) */ - public abstract int[] getInts(int rowId, int count); + public int[] getInts(int rowId, int count) { + int[] res = new int[count]; + for (int i = 0; i < count; i++) { + res[i] = getInt(rowId + i); + } + return res; + } /** * Returns the value for rowId. @@ -113,7 +141,13 @@ public abstract class ColumnVector implements AutoCloseable { /** * Gets values from [rowId, rowId + count) */ - public abstract long[] getLongs(int rowId, int count); + public long[] getLongs(int rowId, int count) { + long[] res = new long[count]; + for (int i = 0; i < count; i++) { + res[i] = getLong(rowId + i); + } + return res; + } /** * Returns the value for rowId. @@ -123,7 +157,13 @@ public abstract class ColumnVector implements AutoCloseable { /** * Gets values from [rowId, rowId + count) */ - public abstract float[] getFloats(int rowId, int count); + public float[] getFloats(int rowId, int count) { + float[] res = new float[count]; + for (int i = 0; i < count; i++) { + res[i] = getFloat(rowId + i); + } + return res; + } /** * Returns the value for rowId. @@ -133,7 +173,13 @@ public abstract class ColumnVector implements AutoCloseable { /** * Gets values from [rowId, rowId + count) */ - public abstract double[] getDoubles(int rowId, int count); + public double[] getDoubles(int rowId, int count) { + double[] res = new double[count]; + for (int i = 0; i < count; i++) { + res[i] = getDouble(rowId + i); + } + return res; + } /** * Returns the length of the array for rowId. @@ -152,14 +198,6 @@ public final ColumnarRow getStruct(int rowId) { return new ColumnarRow(this, rowId); } - /** - * A special version of {@link #getStruct(int)}, which is only used as an adapter for Spark - * codegen framework, the second parameter is totally ignored. - */ - public final ColumnarRow getStruct(int rowId, int size) { - return getStruct(rowId); - } - /** * Returns the array for rowId. */ @@ -196,9 +234,9 @@ public MapData getMap(int ordinal) { public abstract ColumnVector arrayData(); /** - * Returns the ordinal's child data column. + * Returns the ordinal's child column vector. */ - public abstract ColumnVector getChildColumn(int ordinal); + public abstract ColumnVector getChild(int ordinal); /** * Data type for this column. @@ -206,8 +244,7 @@ public MapData getMap(int ordinal) { protected DataType type; /** - * Sets up the common state and also handles creating the child columns if this is a nested - * type. + * Sets up the data type of this column vector. 
*/ protected ColumnVector(DataType type) { this.type = type; diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java index 0d89a52e7a4fe..522c39580389f 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java @@ -133,8 +133,8 @@ public byte[] getBinary(int ordinal) { @Override public CalendarInterval getInterval(int ordinal) { - int month = data.getChildColumn(0).getInt(offset + ordinal); - long microseconds = data.getChildColumn(1).getLong(offset + ordinal); + int month = data.getChild(0).getInt(offset + ordinal); + long microseconds = data.getChild(1).getLong(offset + ordinal); return new CalendarInterval(month, microseconds); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java index 3c6656dec77cd..2e59085a82768 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java @@ -28,7 +28,7 @@ */ public final class ColumnarRow extends InternalRow { // The data for this row. - // E.g. the value of 3rd int field is `data.getChildColumn(3).getInt(rowId)`. + // E.g. the value of 3rd int field is `data.getChild(3).getInt(rowId)`. private final ColumnVector data; private final int rowId; private final int numFields; @@ -53,7 +53,7 @@ public InternalRow copy() { if (isNullAt(i)) { row.setNullAt(i); } else { - DataType dt = data.getChildColumn(i).dataType(); + DataType dt = data.getChild(i).dataType(); if (dt instanceof BooleanType) { row.setBoolean(i, getBoolean(i)); } else if (dt instanceof ByteType) { @@ -93,65 +93,65 @@ public boolean anyNull() { } @Override - public boolean isNullAt(int ordinal) { return data.getChildColumn(ordinal).isNullAt(rowId); } + public boolean isNullAt(int ordinal) { return data.getChild(ordinal).isNullAt(rowId); } @Override - public boolean getBoolean(int ordinal) { return data.getChildColumn(ordinal).getBoolean(rowId); } + public boolean getBoolean(int ordinal) { return data.getChild(ordinal).getBoolean(rowId); } @Override - public byte getByte(int ordinal) { return data.getChildColumn(ordinal).getByte(rowId); } + public byte getByte(int ordinal) { return data.getChild(ordinal).getByte(rowId); } @Override - public short getShort(int ordinal) { return data.getChildColumn(ordinal).getShort(rowId); } + public short getShort(int ordinal) { return data.getChild(ordinal).getShort(rowId); } @Override - public int getInt(int ordinal) { return data.getChildColumn(ordinal).getInt(rowId); } + public int getInt(int ordinal) { return data.getChild(ordinal).getInt(rowId); } @Override - public long getLong(int ordinal) { return data.getChildColumn(ordinal).getLong(rowId); } + public long getLong(int ordinal) { return data.getChild(ordinal).getLong(rowId); } @Override - public float getFloat(int ordinal) { return data.getChildColumn(ordinal).getFloat(rowId); } + public float getFloat(int ordinal) { return data.getChild(ordinal).getFloat(rowId); } @Override - public double getDouble(int ordinal) { return data.getChildColumn(ordinal).getDouble(rowId); } + public double getDouble(int ordinal) { return data.getChild(ordinal).getDouble(rowId); } @Override public Decimal getDecimal(int ordinal, int precision, int scale) { - if 
(data.getChildColumn(ordinal).isNullAt(rowId)) return null; - return data.getChildColumn(ordinal).getDecimal(rowId, precision, scale); + if (data.getChild(ordinal).isNullAt(rowId)) return null; + return data.getChild(ordinal).getDecimal(rowId, precision, scale); } @Override public UTF8String getUTF8String(int ordinal) { - if (data.getChildColumn(ordinal).isNullAt(rowId)) return null; - return data.getChildColumn(ordinal).getUTF8String(rowId); + if (data.getChild(ordinal).isNullAt(rowId)) return null; + return data.getChild(ordinal).getUTF8String(rowId); } @Override public byte[] getBinary(int ordinal) { - if (data.getChildColumn(ordinal).isNullAt(rowId)) return null; - return data.getChildColumn(ordinal).getBinary(rowId); + if (data.getChild(ordinal).isNullAt(rowId)) return null; + return data.getChild(ordinal).getBinary(rowId); } @Override public CalendarInterval getInterval(int ordinal) { - if (data.getChildColumn(ordinal).isNullAt(rowId)) return null; - final int months = data.getChildColumn(ordinal).getChildColumn(0).getInt(rowId); - final long microseconds = data.getChildColumn(ordinal).getChildColumn(1).getLong(rowId); + if (data.getChild(ordinal).isNullAt(rowId)) return null; + final int months = data.getChild(ordinal).getChild(0).getInt(rowId); + final long microseconds = data.getChild(ordinal).getChild(1).getLong(rowId); return new CalendarInterval(months, microseconds); } @Override public ColumnarRow getStruct(int ordinal, int numFields) { - if (data.getChildColumn(ordinal).isNullAt(rowId)) return null; - return data.getChildColumn(ordinal).getStruct(rowId); + if (data.getChild(ordinal).isNullAt(rowId)) return null; + return data.getChild(ordinal).getStruct(rowId); } @Override public ColumnarArray getArray(int ordinal) { - if (data.getChildColumn(ordinal).isNullAt(rowId)) return null; - return data.getChildColumn(ordinal).getArray(rowId); + if (data.getChild(ordinal).isNullAt(rowId)) return null; + return data.getChild(ordinal).getArray(rowId); } @Override diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala index dd68df9686691..04f2619ed7541 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala @@ -50,7 +50,7 @@ private[sql] trait ColumnarBatchScan extends CodegenSupport { dataType: DataType, nullable: Boolean): ExprCode = { val javaType = ctx.javaType(dataType) - val value = ctx.getValue(columnVar, dataType, ordinal) + val value = ctx.getValueFromVector(columnVar, dataType, ordinal) val isNullVar = if (nullable) { ctx.freshName("isNull") } else { "false" } val valueVar = ctx.freshName("value") val str = s"columnVector[$columnVar, $ordinal, ${dataType.simpleString}]" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala index eb48584d0c1ee..633eeac180974 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala @@ -127,8 +127,8 @@ class VectorizedHashMapGenerator( def genEqualsForKeys(groupingKeys: Seq[Buffer]): String = { groupingKeys.zipWithIndex.map { case (key: Buffer, ordinal: Int) => - s"""(${ctx.genEqual(key.dataType, 
ctx.getValue(s"vectors[$ordinal]", "buckets[idx]", - key.dataType), key.name)})""" + val value = ctx.getValueFromVector(s"vectors[$ordinal]", key.dataType, "buckets[idx]") + s"(${ctx.genEqual(key.dataType, value, key.name)})" }.mkString(" && ") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala index c42bc60a59d67..92506032ab2e5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala @@ -217,21 +217,21 @@ class ArrowWriterSuite extends SparkFunSuite { val reader = new ArrowColumnVector(writer.root.getFieldVectors().get(0)) - val struct0 = reader.getStruct(0, 2) + val struct0 = reader.getStruct(0) assert(struct0.getInt(0) === 1) assert(struct0.getUTF8String(1) === UTF8String.fromString("str1")) - val struct1 = reader.getStruct(1, 2) + val struct1 = reader.getStruct(1) assert(struct1.isNullAt(0)) assert(struct1.isNullAt(1)) assert(reader.isNullAt(2)) - val struct3 = reader.getStruct(3, 2) + val struct3 = reader.getStruct(3) assert(struct3.getInt(0) === 4) assert(struct3.isNullAt(1)) - val struct4 = reader.getStruct(4, 2) + val struct4 = reader.getStruct(4) assert(struct4.isNullAt(0)) assert(struct4.getUTF8String(1) === UTF8String.fromString("str5")) @@ -252,15 +252,15 @@ class ArrowWriterSuite extends SparkFunSuite { val reader = new ArrowColumnVector(writer.root.getFieldVectors().get(0)) - val struct00 = reader.getStruct(0, 1).getStruct(0, 2) + val struct00 = reader.getStruct(0).getStruct(0, 2) assert(struct00.getInt(0) === 1) assert(struct00.getUTF8String(1) === UTF8String.fromString("str1")) - val struct10 = reader.getStruct(1, 1).getStruct(0, 2) + val struct10 = reader.getStruct(1).getStruct(0, 2) assert(struct10.isNullAt(0)) assert(struct10.isNullAt(1)) - val struct2 = reader.getStruct(2, 1) + val struct2 = reader.getStruct(2) assert(struct2.isNullAt(0)) assert(reader.isNullAt(3)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala index 53432669e215d..e794f50781ff2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala @@ -346,11 +346,11 @@ class ArrowColumnVectorSuite extends SparkFunSuite { assert(columnVector.dataType === schema) assert(columnVector.numNulls === 0) - val row0 = columnVector.getStruct(0, 2) + val row0 = columnVector.getStruct(0) assert(row0.getInt(0) === 1) assert(row0.getLong(1) === 1L) - val row1 = columnVector.getStruct(1, 2) + val row1 = columnVector.getStruct(1) assert(row1.getInt(0) === 2) assert(row1.isNullAt(1)) @@ -398,21 +398,21 @@ class ArrowColumnVectorSuite extends SparkFunSuite { assert(columnVector.dataType === schema) assert(columnVector.numNulls === 1) - val row0 = columnVector.getStruct(0, 2) + val row0 = columnVector.getStruct(0) assert(row0.getInt(0) === 1) assert(row0.getLong(1) === 1L) - val row1 = columnVector.getStruct(1, 2) + val row1 = columnVector.getStruct(1) assert(row1.getInt(0) === 2) assert(row1.isNullAt(1)) - val row2 = columnVector.getStruct(2, 2) + val row2 = columnVector.getStruct(2) assert(row2.isNullAt(0)) assert(row2.getLong(1) === 3L) assert(columnVector.isNullAt(3)) 
- val row4 = columnVector.getStruct(4, 2) + val row4 = columnVector.getStruct(4) assert(row4.getInt(0) === 5) assert(row4.getLong(1) === 5L) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala index 944240f3bade5..2d1ad4b456783 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala @@ -199,17 +199,17 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { val structType: StructType = new StructType().add("int", IntegerType).add("double", DoubleType) testVectors("struct", 10, structType) { testVector => - val c1 = testVector.getChildColumn(0) - val c2 = testVector.getChildColumn(1) + val c1 = testVector.getChild(0) + val c2 = testVector.getChild(1) c1.putInt(0, 123) c2.putDouble(0, 3.45) c1.putInt(1, 456) c2.putDouble(1, 5.67) - assert(testVector.getStruct(0, structType.length).get(0, IntegerType) === 123) - assert(testVector.getStruct(0, structType.length).get(1, DoubleType) === 3.45) - assert(testVector.getStruct(1, structType.length).get(0, IntegerType) === 456) - assert(testVector.getStruct(1, structType.length).get(1, DoubleType) === 5.67) + assert(testVector.getStruct(0).get(0, IntegerType) === 123) + assert(testVector.getStruct(0).get(1, DoubleType) === 3.45) + assert(testVector.getStruct(1).get(0, IntegerType) === 456) + assert(testVector.getStruct(1).get(1, DoubleType) === 5.67) } test("[SPARK-22092] off-heap column vector reallocation corrupts array data") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchBenchmark.scala index 38ea2e47fdef8..ad74fb99b0c73 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchBenchmark.scala @@ -268,17 +268,17 @@ object ColumnarBatchBenchmark { Int Read/Write: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Java Array 177 / 181 1856.4 0.5 1.0X - ByteBuffer Unsafe 318 / 322 1032.0 1.0 0.6X - ByteBuffer API 1411 / 1418 232.2 4.3 0.1X - DirectByteBuffer 467 / 474 701.8 1.4 0.4X - Unsafe Buffer 178 / 185 1843.6 0.5 1.0X - Column(on heap) 178 / 184 1840.8 0.5 1.0X - Column(off heap) 341 / 344 961.8 1.0 0.5X - Column(off heap direct) 178 / 184 1845.4 0.5 1.0X - UnsafeRow (on heap) 378 / 389 866.3 1.2 0.5X - UnsafeRow (off heap) 393 / 402 834.0 1.2 0.4X - Column On Heap Append 309 / 318 1059.1 0.9 0.6X + Java Array 177 / 183 1851.1 0.5 1.0X + ByteBuffer Unsafe 314 / 330 1043.7 1.0 0.6X + ByteBuffer API 1298 / 1307 252.4 4.0 0.1X + DirectByteBuffer 465 / 483 704.2 1.4 0.4X + Unsafe Buffer 179 / 183 1835.5 0.5 1.0X + Column(on heap) 181 / 186 1815.2 0.6 1.0X + Column(off heap) 344 / 349 951.7 1.1 0.5X + Column(off heap direct) 178 / 186 1838.6 0.5 1.0X + UnsafeRow (on heap) 388 / 394 844.8 1.2 0.5X + UnsafeRow (off heap) 400 / 403 819.4 1.2 0.4X + Column On Heap Append 315 / 325 1041.8 1.0 0.6X */ val benchmark = new Benchmark("Int Read/Write", count * iters) benchmark.addCase("Java Array")(javaArray) @@ -337,8 +337,8 @@ object ColumnarBatchBenchmark { Boolean Read/Write: Best/Avg 
Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Bitset 726 / 727 462.4 2.2 1.0X - Byte Array 530 / 542 632.7 1.6 1.4X + Bitset 741 / 747 452.6 2.2 1.0X + Byte Array 531 / 542 631.6 1.6 1.4X */ benchmark.run() } @@ -394,8 +394,8 @@ object ColumnarBatchBenchmark { String Read/Write: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - On Heap 332 / 338 49.3 20.3 1.0X - Off Heap 466 / 467 35.2 28.4 0.7X + On Heap 351 / 362 46.6 21.4 1.0X + Off Heap 456 / 466 35.9 27.8 0.8X */ val benchmark = new Benchmark("String Read/Write", count * iters) benchmark.addCase("On Heap")(column(MemoryMode.ON_HEAP)) @@ -479,10 +479,10 @@ object ColumnarBatchBenchmark { Array Vector Read: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - On Heap Read Size Only 415 / 422 394.7 2.5 1.0X - Off Heap Read Size Only 394 / 402 415.9 2.4 1.1X - On Heap Read Elements 2558 / 2593 64.0 15.6 0.2X - Off Heap Read Elements 3316 / 3317 49.4 20.2 0.1X + On Heap Read Size Only 416 / 423 393.5 2.5 1.0X + Off Heap Read Size Only 396 / 404 413.6 2.4 1.1X + On Heap Read Elements 2569 / 2590 63.8 15.7 0.2X + Off Heap Read Elements 3302 / 3333 49.6 20.2 0.1X */ benchmark.run } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index cd90681ecabc6..1873c24ab063c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -732,8 +732,8 @@ class ColumnarBatchSuite extends SparkFunSuite { "Struct Column", 10, new StructType().add("int", IntegerType).add("double", DoubleType)) { column => - val c1 = column.getChildColumn(0) - val c2 = column.getChildColumn(1) + val c1 = column.getChild(0) + val c2 = column.getChild(1) assert(c1.dataType() == IntegerType) assert(c2.dataType() == DoubleType) @@ -787,8 +787,8 @@ class ColumnarBatchSuite extends SparkFunSuite { 10, new ArrayType(structType, true)) { column => val data = column.arrayData() - val c0 = data.getChildColumn(0) - val c1 = data.getChildColumn(1) + val c0 = data.getChild(0) + val c1 = data.getChild(1) // Structs in child column: (0, 0), (1, 10), (2, 20), (3, 30), (4, 40), (5, 50) (0 until 6).foreach { i => c0.putInt(i, i) @@ -815,8 +815,8 @@ class ColumnarBatchSuite extends SparkFunSuite { new StructType() .add("int", IntegerType) .add("array", new ArrayType(IntegerType, true))) { column => - val c0 = column.getChildColumn(0) - val c1 = column.getChildColumn(1) + val c0 = column.getChild(0) + val c1 = column.getChild(1) c0.putInt(0, 0) c0.putInt(1, 1) c0.putInt(2, 2) @@ -844,13 +844,13 @@ class ColumnarBatchSuite extends SparkFunSuite { "Nest Struct in Struct", 10, new StructType().add("int", IntegerType).add("struct", subSchema)) { column => - val c0 = column.getChildColumn(0) - val c1 = column.getChildColumn(1) + val c0 = column.getChild(0) + val c1 = column.getChild(1) c0.putInt(0, 0) c0.putInt(1, 1) c0.putInt(2, 2) - val c1c0 = c1.getChildColumn(0) - val c1c1 = c1.getChildColumn(1) + val c1c0 = c1.getChild(0) + val c1c1 = c1.getChild(1) // Structs in c1: (7, 70), (8, 80), (9, 90) c1c0.putInt(0, 7) c1c0.putInt(1, 8) 
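For reference, a minimal Scala sketch of how the ColumnVector accessors renamed above are used. It is illustrative only and not part of any patch in this series: the object and method names are made up for the example, and it assumes an already-populated vector of type struct<int, double>, mirroring the test vectors in the suites above.

object ColumnVectorReadExample {
  import org.apache.spark.sql.vectorized.ColumnVector

  // Reads one row of a struct<int, double> column with the post-rename API.
  def readStructRow(col: ColumnVector, rowId: Int): (Int, Double) = {
    // The two-argument getStruct(rowId, size) adapter is gone; the row id alone is enough.
    val row = col.getStruct(rowId)
    // getChildColumn(ordinal) is now getChild(ordinal).
    val intChild = col.getChild(0)
    val doubleChild = col.getChild(1)
    // Reading through the row or through the child vectors yields the same values.
    assert(row.getInt(0) == intChild.getInt(rowId))
    (intChild.getInt(rowId), doubleChild.getDouble(rowId))
  }

  // Batch getters such as getInts now have default looping implementations in the base
  // class, so they work on any ColumnVector subclass without a per-subclass override.
  def readIntRange(col: ColumnVector, start: Int, count: Int): Array[Int] =
    col.getInts(start, count)
}

The default batch getters trade a little per-call overhead for a much smaller abstract surface, which is why OrcColumnVector and ArrowColumnVector above can simply drop their hand-rolled array variants.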
From 4e75b0cb4b575d4799c02455eed286fe971c6c50 Mon Sep 17 00:00:00 2001 From: Sandor Murakozi Date: Mon, 22 Jan 2018 10:36:28 -0800 Subject: [PATCH 160/566] [SPARK-23121][CORE] Fix for UI becoming inaccessible for long-running streaming apps ## What changes were proposed in this pull request? The allJobs and the job pages attempt to use stage attempt and DAG visualization data from the store, but for long-running jobs they are not guaranteed to be retained, leading to exceptions when these pages are rendered. To fix this, `store.lastStageAttempt(stageId)` and `store.operationGraphForJob(jobId)` are wrapped in `store.asOption`, and default values are used if the info is missing. ## How was this patch tested? Manual testing of the UI, also using the test command reported in SPARK-23121: ./bin/spark-submit --class org.apache.spark.examples.streaming.HdfsWordCount ./examples/jars/spark-examples_2.11-2.4.0-SNAPSHOT.jar /spark Closes #20287 Author: Sandor Murakozi Closes #20330 from smurakozi/SPARK-23121. (cherry picked from commit 446948af1d8dbc080a26a6eec6f743d338f1d12b) Signed-off-by: Marcelo Vanzin --- .../apache/spark/ui/jobs/AllJobsPage.scala | 24 ++++++++++--------- .../org/apache/spark/ui/jobs/JobPage.scala | 10 ++++++-- .../org/apache/spark/ui/jobs/StagePage.scala | 9 ++++--- 3 files changed, 27 insertions(+), 16 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala index ff916bb6a5759..c2668a7ff832f 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala @@ -36,6 +36,9 @@ import org.apache.spark.util.Utils /** Page showing list of all ongoing and recently finished jobs */ private[ui] class AllJobsPage(parent: JobsTab, store: AppStatusStore) extends WebUIPage("") { + + import ApiHelper._ + private val JOBS_LEGEND =
val jobId = job.jobId val status = job.status - val jobDescription = store.lastStageAttempt(job.stageIds.max).description - val displayJobDescription = jobDescription - .map(UIUtils.makeDescription(_, "", plainText = true).text) - .getOrElse("") + val (_, lastStageDescription) = lastStageNameAndDescription(store, job) + val jobDescription = UIUtils.makeDescription(lastStageDescription, "", plainText = true).text + val submissionTime = job.submissionTime.get.getTime() val completionTime = job.completionTime.map(_.getTime()).getOrElse(System.currentTimeMillis()) val classNameByStatus = status match { @@ -80,7 +82,7 @@ private[ui] class AllJobsPage(parent: JobsTab, store: AppStatusStore) extends We // The timeline library treats contents as HTML, so we have to escape them. We need to add // extra layers of escaping in order to embed this in a Javascript string literal. - val escapedDesc = Utility.escape(displayJobDescription) + val escapedDesc = Utility.escape(jobDescription) val jsEscapedDesc = StringEscapeUtils.escapeEcmaScript(escapedDesc) val jobEventJsonAsStr = s""" @@ -403,6 +405,8 @@ private[ui] class JobDataSource( sortColumn: String, desc: Boolean) extends PagedDataSource[JobTableRowData](pageSize) { + import ApiHelper._ + // Convert JobUIData to JobTableRowData which contains the final contents to show in the table // so that we can avoid creating duplicate contents during sorting the data private val data = jobs.map(jobRow).sorted(ordering(sortColumn, desc)) @@ -427,23 +431,21 @@ private[ui] class JobDataSource( val formattedDuration = duration.map(d => UIUtils.formatDuration(d)).getOrElse("Unknown") val submissionTime = jobData.submissionTime val formattedSubmissionTime = submissionTime.map(UIUtils.formatDate).getOrElse("Unknown") - val lastStageAttempt = store.lastStageAttempt(jobData.stageIds.max) - val lastStageDescription = lastStageAttempt.description.getOrElse("") + val (lastStageName, lastStageDescription) = lastStageNameAndDescription(store, jobData) - val formattedJobDescription = - UIUtils.makeDescription(lastStageDescription, basePath, plainText = false) + val jobDescription = UIUtils.makeDescription(lastStageDescription, basePath, plainText = false) val detailUrl = "%s/jobs/job?id=%s".format(basePath, jobData.jobId) new JobTableRowData( jobData, - lastStageAttempt.name, + lastStageName, lastStageDescription, duration.getOrElse(-1), formattedDuration, submissionTime.map(_.getTime()).getOrElse(-1L), formattedSubmissionTime, - formattedJobDescription, + jobDescription, detailUrl ) } diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala index bf59152c8c0cd..974e5c5ffd0a0 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala @@ -336,8 +336,14 @@ private[ui] class JobPage(parent: JobsTab, store: AppStatusStore) extends WebUIP content ++= makeTimeline(activeStages ++ completedStages ++ failedStages, store.executorList(false), appStartTime) - content ++= UIUtils.showDagVizForJob( - jobId, store.operationGraphForJob(jobId)) + val operationGraphContent = store.asOption(store.operationGraphForJob(jobId)) match { + case Some(operationGraph) => UIUtils.showDagVizForJob(jobId, operationGraph) + case None => +
        <div id="no-info">
+          <p>No DAG visualization information to display for job {jobId}</p>
+        </div>
+    }
+    content ++= operationGraphContent
 
     if (shouldShowActiveStages) {
       content ++= <h4 id="active">Active Stages ({activeStages.size})</h4>
++ diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index 38f7b352114fa..8af2537aba7be 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -23,12 +23,10 @@ import java.util.concurrent.TimeUnit import javax.servlet.http.HttpServletRequest import scala.collection.mutable.{HashMap, HashSet} -import scala.xml.{Elem, Node, Unparsed} +import scala.xml.{Node, Unparsed} import org.apache.commons.lang3.StringEscapeUtils -import org.apache.spark.SparkConf -import org.apache.spark.internal.config._ import org.apache.spark.scheduler.TaskLocality import org.apache.spark.status._ import org.apache.spark.status.api.v1._ @@ -1012,4 +1010,9 @@ private object ApiHelper { } } + def lastStageNameAndDescription(store: AppStatusStore, job: JobData): (String, String) = { + val stage = store.asOption(store.lastStageAttempt(job.stageIds.max)) + (stage.map(_.name).getOrElse(""), stage.flatMap(_.description).getOrElse(job.name)) + } + } From 489ecb0ef23e5d9b705e5e5bae4fa3d871bdac91 Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Mon, 22 Jan 2018 10:49:08 -0800 Subject: [PATCH 161/566] Preparing Spark release v2.3.0-rc2 --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 29a8a0062873f..6d46c31906260 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.1 +Version: 2.3.0 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. 
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index 5c5a8e9b8232e..2ca9ab66462a8 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 2a625da682cdc..404c744f98261 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index adb1890439094..3c0b5288862fa 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 4cdcfa2a4da99..fe3bcfd68f988 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 494137f8e9ec1..90ca4016ecc79 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 32bebf5eeca99..8bb2c38f7d097 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 8dc4ddeea3ec0..0d5d4b1661012 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 5613f5e81384c..d6136e6c29a84 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index c34c9df39063b..364aaab74e67b 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index 213579037f51a..aabf3151890f9 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.1-SNAPSHOT -SPARK_VERSION_SHORT: 2.3.1 +SPARK_VERSION: 2.3.0 +SPARK_VERSION_SHORT: 2.3.0 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index e584feed1b337..28de7a180c2e7 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 6c4031b392f65..9b74a4d327136 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 6935ade33546a..1da013353775c 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 60aa5fb1a1031..da8ce5928b8bb 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index d8ed3364c4bd4..65a9f9972d398 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 15d550d3bb445..f5a2dbbef3b6f 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 7ee7156efe22c..e9b8e4e18d6df 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 1f28798fd245a..46a5bd1611d30 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index ad3cd0059e183..72446ecb5cd82 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 53d903fc58aee..e49c8281c6ddb 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 4206a1ac4ad99..7ad92b07094f2 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 7e838812bdc2e..126624ecd8922 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index ee7eb98e3dfc8..a8bcff8cdc14c 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 2ea2e4a7adbdb..6e60fbac9ef5a 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 35ddfac1d6624..7e09e304cb1b7 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 29c1e53f480a3..d8c1792f5b0dd 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 61eb9dbcf6f7a..d9380ed2c6d4d 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index edba0a72ee6a4..9cb3ed2e7ae1d 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/pom.xml b/pom.xml index a8e448a80c13e..39525a5219192 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index b09fa9822184c..8065b058f8c0e 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.1.dev0" +__version__ = "2.3.0" diff --git a/repl/pom.xml b/repl/pom.xml index 3b445bfaf773d..c21a1cd108a8a 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index ae1c69f305589..544e08b333aaf 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index ea31b9c121433..3dc6955ce312c 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index d42bfff851572..7e1397ef9ba79 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 865d3b3f6ad4e..936267ce560e6 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index f39bffe417666..77d4cf7ec8459 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 6e04f528a1c2f..7ade122666a83 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index da50c9bd60fa6..da34cb8253272 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 6e8e034fc4de4..adbafac9bc458 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 9523085cf5513..4e005623e2b26 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml From 6facc7fb2333cc61409149e2f896bf84dd085fa3 Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Mon, 22 Jan 2018 10:49:29 -0800 Subject: [PATCH 162/566] Preparing development version 2.3.1-SNAPSHOT --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- 
external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 6d46c31906260..29a8a0062873f 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.0 +Version: 2.3.1 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index 2ca9ab66462a8..5c5a8e9b8232e 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 404c744f98261..2a625da682cdc 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 3c0b5288862fa..adb1890439094 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index fe3bcfd68f988..4cdcfa2a4da99 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 90ca4016ecc79..494137f8e9ec1 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 8bb2c38f7d097..32bebf5eeca99 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 0d5d4b1661012..8dc4ddeea3ec0 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index d6136e6c29a84..5613f5e81384c 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 364aaab74e67b..c34c9df39063b 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index aabf3151890f9..213579037f51a 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.0 -SPARK_VERSION_SHORT: 2.3.0 +SPARK_VERSION: 2.3.1-SNAPSHOT +SPARK_VERSION_SHORT: 2.3.1 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 28de7a180c2e7..e584feed1b337 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 9b74a4d327136..6c4031b392f65 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 1da013353775c..6935ade33546a 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index da8ce5928b8bb..60aa5fb1a1031 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 65a9f9972d398..d8ed3364c4bd4 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index f5a2dbbef3b6f..15d550d3bb445 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index e9b8e4e18d6df..7ee7156efe22c 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 46a5bd1611d30..1f28798fd245a 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 72446ecb5cd82..ad3cd0059e183 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index e49c8281c6ddb..53d903fc58aee 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 7ad92b07094f2..4206a1ac4ad99 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 126624ecd8922..7e838812bdc2e 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index a8bcff8cdc14c..ee7eb98e3dfc8 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 6e60fbac9ef5a..2ea2e4a7adbdb 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 7e09e304cb1b7..35ddfac1d6624 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index d8c1792f5b0dd..29c1e53f480a3 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index d9380ed2c6d4d..61eb9dbcf6f7a 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 9cb3ed2e7ae1d..edba0a72ee6a4 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 39525a5219192..a8e448a80c13e 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 8065b058f8c0e..b09fa9822184c 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.0" +__version__ = "2.3.1.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index c21a1cd108a8a..3b445bfaf773d 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 544e08b333aaf..ae1c69f305589 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 3dc6955ce312c..ea31b9c121433 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 7e1397ef9ba79..d42bfff851572 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 936267ce560e6..865d3b3f6ad4e 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 77d4cf7ec8459..f39bffe417666 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 7ade122666a83..6e04f528a1c2f 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index da34cb8253272..da50c9bd60fa6 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index adbafac9bc458..6e8e034fc4de4 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 4e005623e2b26..9523085cf5513 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml From 566ef93a672aea1803d6977883204780c2f6982d Mon Sep 17 00:00:00 2001 From: Jacek Laskowski Date: Mon, 22 Jan 2018 13:55:14 -0600 Subject: [PATCH 163/566] [MINOR] Typo fixes ## What changes were proposed in this pull request? Typo fixes ## How was this patch tested? Local build / Doc-only changes Author: Jacek Laskowski Closes #20344 from jaceklaskowski/typo-fixes. 
(cherry picked from commit 76b8b840ddc951ee6203f9cccd2c2b9671c1b5e8) Signed-off-by: Sean Owen --- .../main/scala/org/apache/spark/SparkContext.scala | 2 +- .../spark/sql/kafka010/KafkaSourceProvider.scala | 4 ++-- .../apache/spark/sql/kafka010/KafkaWriteTask.scala | 2 +- .../org/apache/spark/sql/streaming/OutputMode.java | 2 +- .../spark/sql/catalyst/analysis/Analyzer.scala | 8 ++++---- .../spark/sql/catalyst/analysis/unresolved.scala | 2 +- .../catalyst/expressions/aggregate/interfaces.scala | 12 +++++------- .../catalyst/plans/logical/LogicalPlanVisitor.scala | 2 +- .../statsEstimation/BasicStatsPlanVisitor.scala | 2 +- .../SizeInBytesOnlyStatsPlanVisitor.scala | 4 ++-- .../org/apache/spark/sql/internal/SQLConf.scala | 2 +- .../apache/spark/sql/catalyst/plans/PlanTest.scala | 2 +- .../scala/org/apache/spark/sql/DataFrameWriter.scala | 2 +- .../apache/spark/sql/execution/SparkSqlParser.scala | 2 +- .../spark/sql/execution/WholeStageCodegenExec.scala | 2 +- .../spark/sql/execution/command/SetCommand.scala | 4 ++-- .../spark/sql/execution/datasources/rules.scala | 2 +- .../sql/execution/streaming/HDFSMetadataLog.scala | 2 +- .../spark/sql/execution/streaming/OffsetSeq.scala | 2 +- .../spark/sql/execution/streaming/OffsetSeqLog.scala | 2 +- .../execution/streaming/StreamingQueryWrapper.scala | 2 +- .../sql/execution/streaming/state/StateStore.scala | 2 +- .../spark/sql/execution/ui/ExecutionPage.scala | 2 +- .../spark/sql/expressions/UserDefinedFunction.scala | 4 ++-- .../spark/sql/internal/BaseSessionStateBuilder.scala | 4 ++-- .../spark/sql/streaming/DataStreamReader.scala | 6 +++--- .../results/columnresolution-negative.sql.out | 2 +- .../sql-tests/results/columnresolution-views.sql.out | 2 +- .../sql-tests/results/columnresolution.sql.out | 6 +++--- .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 4 ++-- .../apache/spark/sql/execution/SQLViewSuite.scala | 2 +- .../apache/spark/sql/hive/HiveExternalCatalog.scala | 4 ++-- 32 files changed, 50 insertions(+), 52 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 31f3cb9dfa0ae..3828d4f703247 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -2276,7 +2276,7 @@ class SparkContext(config: SparkConf) extends Logging { } /** - * Clean a closure to make it ready to be serialized and send to tasks + * Clean a closure to make it ready to be serialized and sent to tasks * (removes unreferenced variables in $outer's, updates REPL variables) * If checkSerializable is set, clean will also proactively * check to see if f is serializable and throw a SparkException diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala index 3914370a96595..62a998fbfb30b 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala @@ -307,7 +307,7 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister if (caseInsensitiveParams.contains(s"kafka.${ConsumerConfig.GROUP_ID_CONFIG}")) { throw new IllegalArgumentException( s"Kafka option '${ConsumerConfig.GROUP_ID_CONFIG}' is not supported as " + - s"user-specified consumer groups is not used to track offsets.") + s"user-specified consumer 
groups are not used to track offsets.") } if (caseInsensitiveParams.contains(s"kafka.${ConsumerConfig.AUTO_OFFSET_RESET_CONFIG}")) { @@ -335,7 +335,7 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister { throw new IllegalArgumentException( s"Kafka option '${ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG}' is not supported as " - + "value are deserialized as byte arrays with ByteArrayDeserializer. Use DataFrame " + + "values are deserialized as byte arrays with ByteArrayDeserializer. Use DataFrame " + "operations to explicitly deserialize the values.") } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala index baa60febf661d..d90630a8adc93 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal, Unsa import org.apache.spark.sql.types.{BinaryType, StringType} /** - * A simple trait for writing out data in a single Spark task, without any concerns about how + * Writes out data in a single Spark task, without any concerns about how * to commit or abort tasks. Exceptions thrown by the implementation of this class will * automatically trigger task aborts. */ diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/streaming/OutputMode.java b/sql/catalyst/src/main/java/org/apache/spark/sql/streaming/OutputMode.java index 2800b3068f87b..470c128ee6c3d 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/streaming/OutputMode.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/streaming/OutputMode.java @@ -21,7 +21,7 @@ import org.apache.spark.sql.catalyst.streaming.InternalOutputModes; /** - * OutputMode is used to what data will be written to a streaming sink when there is + * OutputMode describes what data will be written to a streaming sink when there is * new data available in a streaming DataFrame/Dataset. * * @since 2.0.0 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 35b35110e491f..2b14c8220d43b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -611,8 +611,8 @@ class Analyzer( if (AnalysisContext.get.nestedViewDepth > conf.maxNestedViewDepth) { view.failAnalysis(s"The depth of view ${view.desc.identifier} exceeds the maximum " + s"view resolution depth (${conf.maxNestedViewDepth}). Analysis is aborted to " + - "avoid errors. Increase the value of spark.sql.view.maxNestedViewDepth to work " + - "aroud this.") + s"avoid errors. Increase the value of ${SQLConf.MAX_NESTED_VIEW_DEPTH.key} to work " + + "around this.") } executeSameContext(child) } @@ -653,7 +653,7 @@ class Analyzer( // Note that if the database is not defined, it is possible we are looking up a temp view. 
case e: NoSuchDatabaseException => u.failAnalysis(s"Table or view not found: ${tableIdentWithDb.unquotedString}, the " + - s"database ${e.db} doesn't exsits.") + s"database ${e.db} doesn't exist.") } } @@ -1524,7 +1524,7 @@ class Analyzer( } /** - * Extracts [[Generator]] from the projectList of a [[Project]] operator and create [[Generate]] + * Extracts [[Generator]] from the projectList of a [[Project]] operator and creates [[Generate]] * operator under [[Project]]. * * This rule will throw [[AnalysisException]] for following cases: diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index d336f801d0770..a65f58fa61ff4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -294,7 +294,7 @@ case class UnresolvedStar(target: Option[Seq[String]]) extends Star with Unevalu } else { val from = input.inputSet.map(_.name).mkString(", ") val targetString = target.get.mkString(".") - throw new AnalysisException(s"cannot resolve '$targetString.*' give input columns '$from'") + throw new AnalysisException(s"cannot resolve '$targetString.*' given input columns '$from'") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala index 19abce01a26cf..e1d16a2cd38b0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala @@ -190,17 +190,15 @@ abstract class AggregateFunction extends Expression { def defaultResult: Option[Literal] = None /** - * Wraps this [[AggregateFunction]] in an [[AggregateExpression]] because - * [[AggregateExpression]] is the container of an [[AggregateFunction]], aggregation mode, - * and the flag indicating if this aggregation is distinct aggregation or not. - * An [[AggregateFunction]] should not be used without being wrapped in - * an [[AggregateExpression]]. + * Creates [[AggregateExpression]] with `isDistinct` flag disabled. + * + * @see `toAggregateExpression(isDistinct: Boolean)` for detailed description */ def toAggregateExpression(): AggregateExpression = toAggregateExpression(isDistinct = false) /** - * Wraps this [[AggregateFunction]] in an [[AggregateExpression]] and set isDistinct - * field of the [[AggregateExpression]] to the given value because + * Wraps this [[AggregateFunction]] in an [[AggregateExpression]] and sets `isDistinct` + * flag of the [[AggregateExpression]] to the given value because * [[AggregateExpression]] is the container of an [[AggregateFunction]], aggregation mode, * and the flag indicating if this aggregation is distinct aggregation or not. 
* An [[AggregateFunction]] should not be used without being wrapped in diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanVisitor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanVisitor.scala index e0748043c46e2..2c248d74869ce 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanVisitor.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanVisitor.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.plans.logical /** - * A visitor pattern for traversing a [[LogicalPlan]] tree and compute some properties. + * A visitor pattern for traversing a [[LogicalPlan]] tree and computing some properties. */ trait LogicalPlanVisitor[T] { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/BasicStatsPlanVisitor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/BasicStatsPlanVisitor.scala index ca0775a2e8408..b6c16079d1984 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/BasicStatsPlanVisitor.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/BasicStatsPlanVisitor.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.plans.logical.statsEstimation import org.apache.spark.sql.catalyst.plans.logical._ /** - * An [[LogicalPlanVisitor]] that computes a the statistics used in a cost-based optimizer. + * A [[LogicalPlanVisitor]] that computes the statistics for the cost-based optimizer. */ object BasicStatsPlanVisitor extends LogicalPlanVisitor[Statistics] { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/SizeInBytesOnlyStatsPlanVisitor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/SizeInBytesOnlyStatsPlanVisitor.scala index 5e1c4e0bd6069..85f67c7d66075 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/SizeInBytesOnlyStatsPlanVisitor.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/SizeInBytesOnlyStatsPlanVisitor.scala @@ -48,8 +48,8 @@ object SizeInBytesOnlyStatsPlanVisitor extends LogicalPlanVisitor[Statistics] { } /** - * For leaf nodes, use its computeStats. For other nodes, we assume the size in bytes is the - * sum of all of the children's. + * For leaf nodes, use its `computeStats`. For other nodes, we assume the size in bytes is the + * product of all of the children's `computeStats`. 
*/ override def default(p: LogicalPlan): Statistics = p match { case p: LeafNode => p.computeStats() diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index cc4f4bf332459..1cef09a5bf053 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -894,7 +894,7 @@ object SQLConf { .internal() .doc("The number of bins when generating histograms.") .intConf - .checkValue(num => num > 1, "The number of bins must be large than 1.") + .checkValue(num => num > 1, "The number of bins must be larger than 1.") .createWithDefault(254) val PERCENTILE_ACCURACY = diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala index 82c5307d54360..6241d5cbb1d25 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala @@ -154,7 +154,7 @@ trait PlanTestBase extends PredicateHelper { self: Suite => } /** - * Sets all SQL configurations specified in `pairs`, calls `f`, and then restore all SQL + * Sets all SQL configurations specified in `pairs`, calls `f`, and then restores all SQL * configurations. */ protected def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 97f12ff625c42..5f3d4448e4e54 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -311,7 +311,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { if (partitioningColumns.isDefined) { throw new AnalysisException( "insertInto() can't be used together with partitionBy(). " + - "Partition columns have already be defined for the table. " + + "Partition columns have already been defined for the table. " + "It is not necessary to use partitionBy()." ) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index d3cfd2a1ffbf2..4828fa60a7b58 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -327,7 +327,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { } /** - * Create a [[DescribeTableCommand]] logical plan. + * Create a [[DescribeColumnCommand]] or [[DescribeTableCommand]] logical commands. */ override def visitDescribeTable(ctx: DescribeTableContext): LogicalPlan = withOrigin(ctx) { val isExtended = ctx.EXTENDED != null || ctx.FORMATTED != null diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala index 065954559e487..6102937852347 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala @@ -58,7 +58,7 @@ trait CodegenSupport extends SparkPlan { } /** - * Whether this SparkPlan support whole stage codegen or not. 
+ * Whether this SparkPlan supports whole stage codegen or not. */ def supportCodegen: Boolean = true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala index 7477d025dfe89..3c900be839aa9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala @@ -91,8 +91,8 @@ case class SetCommand(kv: Option[(String, Option[String])]) extends RunnableComm if (sparkSession.conf.get(CATALOG_IMPLEMENTATION.key).equals("hive") && key.startsWith("hive.")) { logWarning(s"'SET $key=$value' might not work, since Spark doesn't support changing " + - "the Hive config dynamically. Please passing the Hive-specific config by adding the " + - s"prefix spark.hadoop (e.g., spark.hadoop.$key) when starting a Spark application. " + + "the Hive config dynamically. Please pass the Hive-specific config by adding the " + + s"prefix spark.hadoop (e.g. spark.hadoop.$key) when starting a Spark application. " + "For details, see the link: https://spark.apache.org/docs/latest/configuration.html#" + "dynamically-loading-spark-properties.") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index f64e079539c4f..5dbcf4a915cbf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.types.{AtomicType, StructType} import org.apache.spark.sql.util.SchemaUtils /** - * Try to replaces [[UnresolvedRelation]]s if the plan is for direct query on files. + * Replaces [[UnresolvedRelation]]s if the plan is for direct query on files. */ class ResolveSQLOnFile(sparkSession: SparkSession) extends Rule[LogicalPlan] { private def maybeSQLFile(u: UnresolvedRelation): Boolean = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala index 6e8154d58d4c6..00bc215a5dc8c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala @@ -330,7 +330,7 @@ object HDFSMetadataLog { /** A simple trait to abstract out the file management operations needed by HDFSMetadataLog. */ trait FileManager { - /** List the files in a path that matches a filter. */ + /** List the files in a path that match a filter. */ def list(path: Path, filter: PathFilter): Array[FileStatus] /** Make directory at the give path and all its parent directories as needed. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala index a1b63a6de3823..73945b39b8967 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.internal.SQLConf.{SHUFFLE_PARTITIONS, STATE_STORE_PR case class OffsetSeq(offsets: Seq[Option[Offset]], metadata: Option[OffsetSeqMetadata] = None) { /** - * Unpacks an offset into [[StreamProgress]] by associating each offset with the order list of + * Unpacks an offset into [[StreamProgress]] by associating each offset with the ordered list of * sources. * * This method is typically used to associate a serialized offset with actual sources (which diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala index e3f4abcf9f1dc..2c8d7c7b0f3c5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.SparkSession /** * This class is used to log offsets to persistent files in HDFS. * Each file corresponds to a specific batch of offsets. The file - * format contain a version string in the first line, followed + * format contains a version string in the first line, followed * by a the JSON string representation of the offsets separated * by a newline character. If a source offset is missing, then * that line will contain a string value defined in the diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryWrapper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryWrapper.scala index 020c9cb4a7304..3f2cdadfbaeee 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryWrapper.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryWrapper.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.streaming.{StreamingQuery, StreamingQueryException, StreamingQueryProgress, StreamingQueryStatus} /** - * Wrap non-serializable StreamExecution to make the query serializable as it's easy to for it to + * Wrap non-serializable StreamExecution to make the query serializable as it's easy for it to * get captured with normal usage. It's safe to capture the query but not use it in executors. * However, if the user tries to call its methods, it will throw `IllegalStateException`. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala index 6fe632f958ffc..d1d9f95cb0977 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala @@ -94,7 +94,7 @@ trait StateStore { def abort(): Unit /** - * Return an iterator containing all the key-value pairs in the SateStore. Implementations must + * Return an iterator containing all the key-value pairs in the StateStore. 
Implementations must * ensure that updates (puts, removes) can be made while iterating over this iterator. */ def iterator(): Iterator[UnsafeRowPair] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala index f29e135ac357f..e0554f0c4d337 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala @@ -80,7 +80,7 @@ class ExecutionPage(parent: SQLTab) extends WebUIPage("execution") with Logging planVisualization(metrics, graph) ++ physicalPlanDescription(executionUIData.physicalPlanDescription) }.getOrElse { -
No information to display for Plan {executionId}</div>
+      <div>No information to display for query {executionId}</div>
} UIUtils.headerSparkPage(s"Details for Query $executionId", content, parent, Some(5000)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala index 40a058d2cadd2..bdc4bb4422ae7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala @@ -27,8 +27,8 @@ import org.apache.spark.sql.types.DataType * * As an example: * {{{ - * // Defined a UDF that returns true or false based on some numeric score. - * val predict = udf((score: Double) => if (score > 0.5) true else false) + * // Define a UDF that returns true or false based on some numeric score. + * val predict = udf((score: Double) => score > 0.5) * * // Projects a column that adds a prediction column based on the score column. * df.select( predict(df("score")) ) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala index 2867b4cd7da5e..007f8760edf82 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala @@ -206,7 +206,7 @@ abstract class BaseSessionStateBuilder( /** * Logical query plan optimizer. * - * Note: this depends on the `conf`, `catalog` and `experimentalMethods` fields. + * Note: this depends on `catalog` and `experimentalMethods` fields. */ protected def optimizer: Optimizer = { new SparkOptimizer(catalog, experimentalMethods) { @@ -263,7 +263,7 @@ abstract class BaseSessionStateBuilder( * An interface to register custom [[org.apache.spark.sql.util.QueryExecutionListener]]s * that listen for execution metrics. * - * This gets cloned from parent if available, otherwise is a new instance is created. + * This gets cloned from parent if available, otherwise a new instance is created. */ protected def listenerManager: ExecutionListenerManager = { parentState.map(_.listenerManager.clone()).getOrElse( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 52f2e2639cd86..9f5ca9f914284 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -118,7 +118,7 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * You can set the following option(s): *
* <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
- * to be used to parse timestamps in the JSON/CSV datasources or partition values.</li>
+ * to be used to parse timestamps in the JSON/CSV data sources or partition values.</li>
* </ul>
* * @since 2.0.0 @@ -129,12 +129,12 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo } /** - * Adds input options for the underlying data source. + * (Java-specific) Adds input options for the underlying data source. * * You can set the following option(s): *
* <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
- * to be used to parse timestamps in the JSON/CSV datasources or partition values.</li>
+ * to be used to parse timestamps in the JSON/CSV data sources or partition values.</li>
* </ul>
* * @since 2.0.0 diff --git a/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out b/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out index b5a4f5c2bf654..539f673c9d679 100644 --- a/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out @@ -195,7 +195,7 @@ SELECT t1.x.y.* FROM t1 struct<> -- !query 22 output org.apache.spark.sql.AnalysisException -cannot resolve 't1.x.y.*' give input columns 'i1'; +cannot resolve 't1.x.y.*' given input columns 'i1'; -- !query 23 diff --git a/sql/core/src/test/resources/sql-tests/results/columnresolution-views.sql.out b/sql/core/src/test/resources/sql-tests/results/columnresolution-views.sql.out index 7c451c2aa5b5c..2092119600954 100644 --- a/sql/core/src/test/resources/sql-tests/results/columnresolution-views.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/columnresolution-views.sql.out @@ -88,7 +88,7 @@ SELECT global_temp.view1.* FROM global_temp.view1 struct<> -- !query 10 output org.apache.spark.sql.AnalysisException -cannot resolve 'global_temp.view1.*' give input columns 'i1'; +cannot resolve 'global_temp.view1.*' given input columns 'i1'; -- !query 11 diff --git a/sql/core/src/test/resources/sql-tests/results/columnresolution.sql.out b/sql/core/src/test/resources/sql-tests/results/columnresolution.sql.out index d3ca4443cce55..e10f516ad6e5b 100644 --- a/sql/core/src/test/resources/sql-tests/results/columnresolution.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/columnresolution.sql.out @@ -179,7 +179,7 @@ SELECT mydb1.t1.* FROM mydb1.t1 struct<> -- !query 21 output org.apache.spark.sql.AnalysisException -cannot resolve 'mydb1.t1.*' give input columns 'i1'; +cannot resolve 'mydb1.t1.*' given input columns 'i1'; -- !query 22 @@ -212,7 +212,7 @@ SELECT mydb1.t1.* FROM mydb1.t1 struct<> -- !query 25 output org.apache.spark.sql.AnalysisException -cannot resolve 'mydb1.t1.*' give input columns 'i1'; +cannot resolve 'mydb1.t1.*' given input columns 'i1'; -- !query 26 @@ -420,7 +420,7 @@ SELECT mydb1.t5.* FROM mydb1.t5 struct<> -- !query 50 output org.apache.spark.sql.AnalysisException -cannot resolve 'mydb1.t5.*' give input columns 'i1, t5'; +cannot resolve 'mydb1.t5.*' given input columns 'i1, t5'; -- !query 51 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 083a0c0b1b9a0..a79ab47f0197e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1896,12 +1896,12 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { var e = intercept[AnalysisException] { sql("SELECT a.* FROM temp_table_no_cols a") }.getMessage - assert(e.contains("cannot resolve 'a.*' give input columns ''")) + assert(e.contains("cannot resolve 'a.*' given input columns ''")) e = intercept[AnalysisException] { dfNoCols.select($"b.*") }.getMessage - assert(e.contains("cannot resolve 'b.*' give input columns ''")) + assert(e.contains("cannot resolve 'b.*' given input columns ''")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala index 14082197ba0bd..ce8fde28a941c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala @@ -663,7 +663,7 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { }.getMessage assert(e.contains("The depth of view `default`.`view0` exceeds the maximum view " + "resolution depth (10). Analysis is aborted to avoid errors. Increase the value " + - "of spark.sql.view.maxNestedViewDepth to work aroud this.")) + "of spark.sql.view.maxNestedViewDepth to work around this.")) } val e = intercept[IllegalArgumentException] { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 632e3e0c4c3f9..3b8a8ca301c27 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -109,8 +109,8 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat } /** - * Get the raw table metadata from hive metastore directly. The raw table metadata may contains - * special data source properties and should not be exposed outside of `HiveExternalCatalog`. We + * Get the raw table metadata from hive metastore directly. The raw table metadata may contain + * special data source properties that should not be exposed outside of `HiveExternalCatalog`. We * should interpret these special data source properties and restore the original table metadata * before returning it. */ From 7241556d8b550e22eed2341287812ea373dc1cb2 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Mon, 22 Jan 2018 15:21:09 -0800 Subject: [PATCH 164/566] [SPARK-22389][SQL] data source v2 partitioning reporting interface ## What changes were proposed in this pull request? a new interface which allows data source to report partitioning and avoid shuffle at Spark side. The design is pretty like the internal distribution/partitioing framework. Spark defines a `Distribution` interfaces and several concrete implementations, and ask the data source to report a `Partitioning`, the `Partitioning` should tell Spark if it can satisfy a `Distribution` or not. ## How was this patch tested? new test Author: Wenchen Fan Closes #20201 from cloud-fan/partition-reporting. 
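For illustration only (not part of this patch): a condensed Scala sketch of a reader that reports its partitioning, adapted from the `PartitionAwareDataSource` test source added later in this commit. The class name `MyPartitionAwareSource` is invented for the example; `SpecificReadTask` is the small `ReadTask`/`DataReader` helper defined in `DataSourceV2Suite` below.

```scala
// Condensed, illustrative sketch of the new partitioning-reporting interface.
import java.util.{List => JList}

import org.apache.spark.sql.Row
import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options, ReadSupport}
import org.apache.spark.sql.sources.v2.reader._
import org.apache.spark.sql.types.StructType

class MyPartitionAwareSource extends DataSourceV2 with ReadSupport {

  class Reader extends DataSourceV2Reader with SupportsReportPartitioning {
    override def readSchema(): StructType = new StructType().add("a", "int").add("b", "int")

    // Each ReadTask keeps all records sharing the same value of column `a` together,
    // otherwise the partitioning reported below would not actually hold.
    override def createReadTasks(): JList[ReadTask[Row]] = java.util.Arrays.asList(
      new SpecificReadTask(Array(1, 1, 3), Array(4, 4, 6)),
      new SpecificReadTask(Array(2, 4, 4), Array(6, 2, 2)))

    // Spark asks the reported Partitioning whether it satisfies a required Distribution.
    // Recognizing ClusteredDistribution over `a` lets Spark skip the shuffle for
    // aggregations that cluster by `a`.
    override def outputPartitioning(): Partitioning = new Partitioning {
      override def numPartitions(): Int = 2
      override def satisfy(distribution: Distribution): Boolean = distribution match {
        case c: ClusteredDistribution => c.clusteredColumns.contains("a")
        case _ => false
      }
    }
  }

  override def createReader(options: DataSourceV2Options): DataSourceV2Reader = new Reader
}
```

With such a reader, the new `DataSourceV2Suite` test below verifies that `df.groupBy('a).agg(sum('b))` and `df.groupBy('a, 'b).agg(count("*"))` plan without a `ShuffleExchangeExec`, while grouping on `'b` alone or on `'a + 'b` still shuffles.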
(cherry picked from commit 51eb750263dd710434ddb60311571fa3dcec66eb) Signed-off-by: gatorsmile --- .../plans/physical/partitioning.scala | 2 +- .../v2/reader/ClusteredDistribution.java | 38 ++++++ .../sql/sources/v2/reader/Distribution.java | 39 +++++++ .../sql/sources/v2/reader/Partitioning.java | 46 ++++++++ .../v2/reader/SupportsReportPartitioning.java | 33 ++++++ .../v2/DataSourcePartitioning.scala | 56 +++++++++ .../datasources/v2/DataSourceV2ScanExec.scala | 9 ++ .../v2/JavaPartitionAwareDataSource.java | 110 ++++++++++++++++++ .../sql/sources/v2/DataSourceV2Suite.scala | 79 +++++++++++++ 9 files changed, 411 insertions(+), 1 deletion(-) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ClusteredDistribution.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Distribution.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Partitioning.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourcePartitioning.scala create mode 100644 sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaPartitionAwareDataSource.java diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala index 0189bd73c56bf..4d9a9925fe3ff 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala @@ -153,7 +153,7 @@ case class BroadcastDistribution(mode: BroadcastMode) extends Distribution { * 1. number of partitions. * 2. if it can satisfy a given distribution. */ -sealed trait Partitioning { +trait Partitioning { /** Returns the number of partitions that the data is split across */ val numPartitions: Int diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ClusteredDistribution.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ClusteredDistribution.java new file mode 100644 index 0000000000000..7346500de45b6 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ClusteredDistribution.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2.reader; + +import org.apache.spark.annotation.InterfaceStability; + +/** + * A concrete implementation of {@link Distribution}. 
Represents a distribution where records that + * share the same values for the {@link #clusteredColumns} will be produced by the same + * {@link ReadTask}. + */ +@InterfaceStability.Evolving +public class ClusteredDistribution implements Distribution { + + /** + * The names of the clustered columns. Note that they are order insensitive. + */ + public final String[] clusteredColumns; + + public ClusteredDistribution(String[] clusteredColumns) { + this.clusteredColumns = clusteredColumns; + } +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Distribution.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Distribution.java new file mode 100644 index 0000000000000..a6201a222f541 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Distribution.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2.reader; + +import org.apache.spark.annotation.InterfaceStability; + +/** + * An interface to represent data distribution requirement, which specifies how the records should + * be distributed among the {@link ReadTask}s that are returned by + * {@link DataSourceV2Reader#createReadTasks()}. Note that this interface has nothing to do with + * the data ordering inside one partition(the output records of a single {@link ReadTask}). + * + * The instance of this interface is created and provided by Spark, then consumed by + * {@link Partitioning#satisfy(Distribution)}. This means data source developers don't need to + * implement this interface, but need to catch as more concrete implementations of this interface + * as possible in {@link Partitioning#satisfy(Distribution)}. + * + * Concrete implementations until now: + *
+ *   <li>{@link ClusteredDistribution}</li>
+ * </ul>
+ */ +@InterfaceStability.Evolving +public interface Distribution {} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Partitioning.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Partitioning.java new file mode 100644 index 0000000000000..199e45d4a02ab --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Partitioning.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2.reader; + +import org.apache.spark.annotation.InterfaceStability; + +/** + * An interface to represent the output data partitioning for a data source, which is returned by + * {@link SupportsReportPartitioning#outputPartitioning()}. Note that this should work like a + * snapshot. Once created, it should be deterministic and always report the same number of + * partitions and the same "satisfy" result for a certain distribution. + */ +@InterfaceStability.Evolving +public interface Partitioning { + + /** + * Returns the number of partitions(i.e., {@link ReadTask}s) the data source outputs. + */ + int numPartitions(); + + /** + * Returns true if this partitioning can satisfy the given distribution, which means Spark does + * not need to shuffle the output data of this data source for some certain operations. + * + * Note that, Spark may add new concrete implementations of {@link Distribution} in new releases. + * This method should be aware of it and always return false for unrecognized distributions. It's + * recommended to check every Spark new release and support new distributions if possible, to + * avoid shuffle at Spark side for more cases. + */ + boolean satisfy(Distribution distribution); +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java new file mode 100644 index 0000000000000..f786472ccf345 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2.reader; + +import org.apache.spark.annotation.InterfaceStability; + +/** + * A mix in interface for {@link DataSourceV2Reader}. Data source readers can implement this + * interface to report data partitioning and try to avoid shuffle at Spark side. + */ +@InterfaceStability.Evolving +public interface SupportsReportPartitioning { + + /** + * Returns the output data partitioning that this reader guarantees. + */ + Partitioning outputPartitioning(); +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourcePartitioning.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourcePartitioning.scala new file mode 100644 index 0000000000000..943d0100aca56 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourcePartitioning.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.v2 + +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression} +import org.apache.spark.sql.catalyst.plans.physical +import org.apache.spark.sql.sources.v2.reader.{ClusteredDistribution, Partitioning} + +/** + * An adapter from public data source partitioning to catalyst internal `Partitioning`. 
+ */ +class DataSourcePartitioning( + partitioning: Partitioning, + colNames: AttributeMap[String]) extends physical.Partitioning { + + override val numPartitions: Int = partitioning.numPartitions() + + override def satisfies(required: physical.Distribution): Boolean = { + super.satisfies(required) || { + required match { + case d: physical.ClusteredDistribution if isCandidate(d.clustering) => + val attrs = d.clustering.map(_.asInstanceOf[Attribute]) + partitioning.satisfy( + new ClusteredDistribution(attrs.map { a => + val name = colNames.get(a) + assert(name.isDefined, s"Attribute ${a.name} is not found in the data source output") + name.get + }.toArray)) + + case _ => false + } + } + } + + private def isCandidate(clustering: Seq[Expression]): Boolean = { + clustering.forall { + case a: Attribute => colNames.contains(a) + case _ => false + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala index beb66738732be..69d871df3e1dd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.physical import org.apache.spark.sql.execution.{ColumnarBatchScan, LeafExecNode, WholeStageCodegenExec} import org.apache.spark.sql.execution.streaming.continuous._ import org.apache.spark.sql.sources.v2.reader._ @@ -42,6 +43,14 @@ case class DataSourceV2ScanExec( override def producedAttributes: AttributeSet = AttributeSet(fullOutput) + override def outputPartitioning: physical.Partitioning = reader match { + case s: SupportsReportPartitioning => + new DataSourcePartitioning( + s.outputPartitioning(), AttributeMap(output.map(a => a -> a.name))) + + case _ => super.outputPartitioning + } + private lazy val readTasks: java.util.List[ReadTask[UnsafeRow]] = reader match { case r: SupportsScanUnsafeRow => r.createUnsafeRowReadTasks() case _ => diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaPartitionAwareDataSource.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaPartitionAwareDataSource.java new file mode 100644 index 0000000000000..806d0bcd93f18 --- /dev/null +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaPartitionAwareDataSource.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package test.org.apache.spark.sql.sources.v2; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.expressions.GenericRow; +import org.apache.spark.sql.sources.v2.DataSourceV2; +import org.apache.spark.sql.sources.v2.DataSourceV2Options; +import org.apache.spark.sql.sources.v2.ReadSupport; +import org.apache.spark.sql.sources.v2.reader.*; +import org.apache.spark.sql.types.StructType; + +public class JavaPartitionAwareDataSource implements DataSourceV2, ReadSupport { + + class Reader implements DataSourceV2Reader, SupportsReportPartitioning { + private final StructType schema = new StructType().add("a", "int").add("b", "int"); + + @Override + public StructType readSchema() { + return schema; + } + + @Override + public List> createReadTasks() { + return java.util.Arrays.asList( + new SpecificReadTask(new int[]{1, 1, 3}, new int[]{4, 4, 6}), + new SpecificReadTask(new int[]{2, 4, 4}, new int[]{6, 2, 2})); + } + + @Override + public Partitioning outputPartitioning() { + return new MyPartitioning(); + } + } + + static class MyPartitioning implements Partitioning { + + @Override + public int numPartitions() { + return 2; + } + + @Override + public boolean satisfy(Distribution distribution) { + if (distribution instanceof ClusteredDistribution) { + String[] clusteredCols = ((ClusteredDistribution) distribution).clusteredColumns; + return Arrays.asList(clusteredCols).contains("a"); + } + + return false; + } + } + + static class SpecificReadTask implements ReadTask, DataReader { + private int[] i; + private int[] j; + private int current = -1; + + SpecificReadTask(int[] i, int[] j) { + assert i.length == j.length; + this.i = i; + this.j = j; + } + + @Override + public boolean next() throws IOException { + current += 1; + return current < i.length; + } + + @Override + public Row get() { + return new GenericRow(new Object[] {i[current], j[current]}); + } + + @Override + public void close() throws IOException { + + } + + @Override + public DataReader createDataReader() { + return this; + } + } + + @Override + public DataSourceV2Reader createReader(DataSourceV2Options options) { + return new Reader(); + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index 0ca29524c6d05..0620693b35d16 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala @@ -24,6 +24,7 @@ import test.org.apache.spark.sql.sources.v2._ import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector import org.apache.spark.sql.sources.{Filter, GreaterThan} import org.apache.spark.sql.sources.v2.reader._ @@ -95,6 +96,40 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { } } + test("partitioning reporting") { + import org.apache.spark.sql.functions.{count, sum} + Seq(classOf[PartitionAwareDataSource], classOf[JavaPartitionAwareDataSource]).foreach { cls => + withClue(cls.getName) { + val df = spark.read.format(cls.getName).load() + checkAnswer(df, Seq(Row(1, 4), Row(1, 4), Row(3, 6), Row(2, 6), Row(4, 2), Row(4, 2))) + + val 
groupByColA = df.groupBy('a).agg(sum('b)) + checkAnswer(groupByColA, Seq(Row(1, 8), Row(2, 6), Row(3, 6), Row(4, 4))) + assert(groupByColA.queryExecution.executedPlan.collectFirst { + case e: ShuffleExchangeExec => e + }.isEmpty) + + val groupByColAB = df.groupBy('a, 'b).agg(count("*")) + checkAnswer(groupByColAB, Seq(Row(1, 4, 2), Row(2, 6, 1), Row(3, 6, 1), Row(4, 2, 2))) + assert(groupByColAB.queryExecution.executedPlan.collectFirst { + case e: ShuffleExchangeExec => e + }.isEmpty) + + val groupByColB = df.groupBy('b).agg(sum('a)) + checkAnswer(groupByColB, Seq(Row(2, 8), Row(4, 2), Row(6, 5))) + assert(groupByColB.queryExecution.executedPlan.collectFirst { + case e: ShuffleExchangeExec => e + }.isDefined) + + val groupByAPlusB = df.groupBy('a + 'b).agg(count("*")) + checkAnswer(groupByAPlusB, Seq(Row(5, 2), Row(6, 2), Row(8, 1), Row(9, 1))) + assert(groupByAPlusB.queryExecution.executedPlan.collectFirst { + case e: ShuffleExchangeExec => e + }.isDefined) + } + } + } + test("simple writable data source") { // TODO: java implementation. Seq(classOf[SimpleWritableDataSource]).foreach { cls => @@ -365,3 +400,47 @@ class BatchReadTask(start: Int, end: Int) override def close(): Unit = batch.close() } + +class PartitionAwareDataSource extends DataSourceV2 with ReadSupport { + + class Reader extends DataSourceV2Reader with SupportsReportPartitioning { + override def readSchema(): StructType = new StructType().add("a", "int").add("b", "int") + + override def createReadTasks(): JList[ReadTask[Row]] = { + // Note that we don't have same value of column `a` across partitions. + java.util.Arrays.asList( + new SpecificReadTask(Array(1, 1, 3), Array(4, 4, 6)), + new SpecificReadTask(Array(2, 4, 4), Array(6, 2, 2))) + } + + override def outputPartitioning(): Partitioning = new MyPartitioning + } + + class MyPartitioning extends Partitioning { + override def numPartitions(): Int = 2 + + override def satisfy(distribution: Distribution): Boolean = distribution match { + case c: ClusteredDistribution => c.clusteredColumns.contains("a") + case _ => false + } + } + + override def createReader(options: DataSourceV2Options): DataSourceV2Reader = new Reader +} + +class SpecificReadTask(i: Array[Int], j: Array[Int]) extends ReadTask[Row] with DataReader[Row] { + assert(i.length == j.length) + + private var current = -1 + + override def createDataReader(): DataReader[Row] = this + + override def next(): Boolean = { + current += 1 + current < i.length + } + + override def get(): Row = Row(i(current), j(current)) + + override def close(): Unit = {} +} From 832d69817c29e8c44fcab0d6a476f36d4ee0c837 Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Tue, 23 Jan 2018 04:08:32 -0800 Subject: [PATCH 165/566] [SPARK-22465][FOLLOWUP] Update the number of partitions of default partitioner when defaultParallelism is set ## What changes were proposed in this pull request? #20002 purposed a way to safe check the default partitioner, however, if `spark.default.parallelism` is set, the defaultParallelism still could be smaller than the proper number of partitions for upstreams RDDs. This PR tries to extend the approach to address the condition when `spark.default.parallelism` is set. 
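For illustration only (not part of this patch): a minimal sketch of the new behaviour, assuming a `SparkContext` named `sc` as in the updated `PartitioningSuite`; the RDD names and sizes are invented but mirror the test cases added below.

```scala
import org.apache.spark.{HashPartitioner, Partitioner}

// User explicitly sets a (small) default parallelism, as PartitioningSuite does.
sc.conf.set("spark.default.parallelism", "4")

val big   = sc.parallelize((1 to 1000).map(x => (x, x)), 150)      // 150 partitions, no partitioner
val small = sc.parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4)))
  .partitionBy(new HashPartitioner(10))                            // existing partitioner, 10 partitions

// HashPartitioner(10) is not "eligible" (10 is more than an order of magnitude below 150),
// but it still has more partitions than the default parallelism of 4, so after this change
// it is kept instead of falling back to new HashPartitioner(4).
val chosen = Partitioner.defaultPartitioner(big, small)            // HashPartitioner with 10 partitions
```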
The requirements where the PR helps with are : - Max partitioner is not eligible since it is atleast an order smaller, and - User has explicitly set 'spark.default.parallelism', and - Value of 'spark.default.parallelism' is lower than max partitioner - Since max partitioner was discarded due to being at least an order smaller, default parallelism is worse - even though user specified. Under the rest cases, the changes should be no-op. ## How was this patch tested? Add corresponding test cases in `PairRDDFunctionsSuite` and `PartitioningSuite`. Author: Xingbo Jiang Closes #20091 from jiangxb1987/partitioner. (cherry picked from commit 96cb60bc33936c1aaf728a1738781073891480ff) Signed-off-by: Mridul Muralidharan --- .../scala/org/apache/spark/Partitioner.scala | 51 ++++++++++--------- .../org/apache/spark/PartitioningSuite.scala | 44 +++++++++++++--- .../spark/rdd/PairRDDFunctionsSuite.scala | 45 +++++++++++++++- 3 files changed, 108 insertions(+), 32 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/Partitioner.scala b/core/src/main/scala/org/apache/spark/Partitioner.scala index 437bbaae1968b..c940cb25d478b 100644 --- a/core/src/main/scala/org/apache/spark/Partitioner.scala +++ b/core/src/main/scala/org/apache/spark/Partitioner.scala @@ -43,17 +43,19 @@ object Partitioner { /** * Choose a partitioner to use for a cogroup-like operation between a number of RDDs. * - * If any of the RDDs already has a partitioner, and the number of partitions of the - * partitioner is either greater than or is less than and within a single order of - * magnitude of the max number of upstream partitions, choose that one. + * If spark.default.parallelism is set, we'll use the value of SparkContext defaultParallelism + * as the default partitions number, otherwise we'll use the max number of upstream partitions. * - * Otherwise, we use a default HashPartitioner. For the number of partitions, if - * spark.default.parallelism is set, then we'll use the value from SparkContext - * defaultParallelism, otherwise we'll use the max number of upstream partitions. + * When available, we choose the partitioner from rdds with maximum number of partitions. If this + * partitioner is eligible (number of partitions within an order of maximum number of partitions + * in rdds), or has partition number higher than default partitions number - we use this + * partitioner. * - * Unless spark.default.parallelism is set, the number of partitions will be the - * same as the number of partitions in the largest upstream RDD, as this should - * be least likely to cause out-of-memory errors. + * Otherwise, we'll use a new HashPartitioner with the default partitions number. + * + * Unless spark.default.parallelism is set, the number of partitions will be the same as the + * number of partitions in the largest upstream RDD, as this should be least likely to cause + * out-of-memory errors. * * We use two method parameters (rdd, others) to enforce callers passing at least 1 RDD. */ @@ -67,31 +69,32 @@ object Partitioner { None } - if (isEligiblePartitioner(hasMaxPartitioner, rdds)) { + val defaultNumPartitions = if (rdd.context.conf.contains("spark.default.parallelism")) { + rdd.context.defaultParallelism + } else { + rdds.map(_.partitions.length).max + } + + // If the existing max partitioner is an eligible one, or its partitions number is larger + // than the default number of partitions, use the existing partitioner. 
+ if (hasMaxPartitioner.nonEmpty && (isEligiblePartitioner(hasMaxPartitioner.get, rdds) || + defaultNumPartitions < hasMaxPartitioner.get.getNumPartitions)) { hasMaxPartitioner.get.partitioner.get } else { - if (rdd.context.conf.contains("spark.default.parallelism")) { - new HashPartitioner(rdd.context.defaultParallelism) - } else { - new HashPartitioner(rdds.map(_.partitions.length).max) - } + new HashPartitioner(defaultNumPartitions) } } /** - * Returns true if the number of partitions of the RDD is either greater - * than or is less than and within a single order of magnitude of the - * max number of upstream partitions; - * otherwise, returns false + * Returns true if the number of partitions of the RDD is either greater than or is less than and + * within a single order of magnitude of the max number of upstream partitions, otherwise returns + * false. */ private def isEligiblePartitioner( - hasMaxPartitioner: Option[RDD[_]], + hasMaxPartitioner: RDD[_], rdds: Seq[RDD[_]]): Boolean = { - if (hasMaxPartitioner.isEmpty) { - return false - } val maxPartitions = rdds.map(_.partitions.length).max - log10(maxPartitions) - log10(hasMaxPartitioner.get.getNumPartitions) < 1 + log10(maxPartitions) - log10(hasMaxPartitioner.getNumPartitions) < 1 } } diff --git a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala index 155ca17db726b..9206b5debf4f3 100644 --- a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala +++ b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala @@ -262,14 +262,11 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva test("defaultPartitioner") { val rdd1 = sc.parallelize((1 to 1000).map(x => (x, x)), 150) - val rdd2 = sc - .parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4))) + val rdd2 = sc.parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4))) .partitionBy(new HashPartitioner(10)) - val rdd3 = sc - .parallelize(Array((1, 6), (7, 8), (3, 10), (5, 12), (13, 14))) + val rdd3 = sc.parallelize(Array((1, 6), (7, 8), (3, 10), (5, 12), (13, 14))) .partitionBy(new HashPartitioner(100)) - val rdd4 = sc - .parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4))) + val rdd4 = sc.parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4))) .partitionBy(new HashPartitioner(9)) val rdd5 = sc.parallelize((1 to 10).map(x => (x, x)), 11) @@ -284,7 +281,42 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva assert(partitioner3.numPartitions == rdd3.getNumPartitions) assert(partitioner4.numPartitions == rdd3.getNumPartitions) assert(partitioner5.numPartitions == rdd4.getNumPartitions) + } + test("defaultPartitioner when defaultParallelism is set") { + assert(!sc.conf.contains("spark.default.parallelism")) + try { + sc.conf.set("spark.default.parallelism", "4") + + val rdd1 = sc.parallelize((1 to 1000).map(x => (x, x)), 150) + val rdd2 = sc.parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4))) + .partitionBy(new HashPartitioner(10)) + val rdd3 = sc.parallelize(Array((1, 6), (7, 8), (3, 10), (5, 12), (13, 14))) + .partitionBy(new HashPartitioner(100)) + val rdd4 = sc.parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4))) + .partitionBy(new HashPartitioner(9)) + val rdd5 = sc.parallelize((1 to 10).map(x => (x, x)), 11) + val rdd6 = sc.parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4))) + .partitionBy(new HashPartitioner(3)) + + val partitioner1 = Partitioner.defaultPartitioner(rdd1, rdd2) + val partitioner2 = Partitioner.defaultPartitioner(rdd2, rdd3) + val partitioner3 = 
Partitioner.defaultPartitioner(rdd3, rdd1) + val partitioner4 = Partitioner.defaultPartitioner(rdd1, rdd2, rdd3) + val partitioner5 = Partitioner.defaultPartitioner(rdd4, rdd5) + val partitioner6 = Partitioner.defaultPartitioner(rdd5, rdd5) + val partitioner7 = Partitioner.defaultPartitioner(rdd1, rdd6) + + assert(partitioner1.numPartitions == rdd2.getNumPartitions) + assert(partitioner2.numPartitions == rdd3.getNumPartitions) + assert(partitioner3.numPartitions == rdd3.getNumPartitions) + assert(partitioner4.numPartitions == rdd3.getNumPartitions) + assert(partitioner5.numPartitions == rdd4.getNumPartitions) + assert(partitioner6.numPartitions == sc.defaultParallelism) + assert(partitioner7.numPartitions == sc.defaultParallelism) + } finally { + sc.conf.remove("spark.default.parallelism") + } } } diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index a39e0469272fe..47af5c3320dd9 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -322,8 +322,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } // See SPARK-22465 - test("cogroup between multiple RDD" + - " with number of partitions similar in order of magnitude") { + test("cogroup between multiple RDD with number of partitions similar in order of magnitude") { val rdd1 = sc.parallelize((1 to 1000).map(x => (x, x)), 20) val rdd2 = sc .parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) @@ -332,6 +331,48 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { assert(joined.getNumPartitions == rdd2.getNumPartitions) } + test("cogroup between multiple RDD when defaultParallelism is set without proper partitioner") { + assert(!sc.conf.contains("spark.default.parallelism")) + try { + sc.conf.set("spark.default.parallelism", "4") + val rdd1 = sc.parallelize((1 to 1000).map(x => (x, x)), 20) + val rdd2 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1)), 10) + val joined = rdd1.cogroup(rdd2) + assert(joined.getNumPartitions == sc.defaultParallelism) + } finally { + sc.conf.remove("spark.default.parallelism") + } + } + + test("cogroup between multiple RDD when defaultParallelism is set with proper partitioner") { + assert(!sc.conf.contains("spark.default.parallelism")) + try { + sc.conf.set("spark.default.parallelism", "4") + val rdd1 = sc.parallelize((1 to 1000).map(x => (x, x)), 20) + val rdd2 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) + .partitionBy(new HashPartitioner(10)) + val joined = rdd1.cogroup(rdd2) + assert(joined.getNumPartitions == rdd2.getNumPartitions) + } finally { + sc.conf.remove("spark.default.parallelism") + } + } + + test("cogroup between multiple RDD when defaultParallelism is set; with huge number of " + + "partitions in upstream RDDs") { + assert(!sc.conf.contains("spark.default.parallelism")) + try { + sc.conf.set("spark.default.parallelism", "4") + val rdd1 = sc.parallelize((1 to 1000).map(x => (x, x)), 1000) + val rdd2 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) + .partitionBy(new HashPartitioner(10)) + val joined = rdd1.cogroup(rdd2) + assert(joined.getNumPartitions == rdd2.getNumPartitions) + } finally { + sc.conf.remove("spark.default.parallelism") + } + } + test("rightOuterJoin") { val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) From 
29ed718732de40e956e37f0673743ae375cd30c5 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 23 Jan 2018 21:36:20 +0900 Subject: [PATCH 166/566] [SPARK-20749][SQL][FOLLOW-UP] Override prettyName for bit_length and octet_length ## What changes were proposed in this pull request? We need to override the prettyName for bit_length and octet_length for getting the expected auto-generated alias name. ## How was this patch tested? The existing tests Author: gatorsmile Closes #20358 from gatorsmile/test2.3More. (cherry picked from commit ee572ba8c1339d21c592001ec4f7f270005ff1cf) Signed-off-by: hyukjinkwon --- .../spark/sql/catalyst/parser/SqlBase.g4 | 2 +- .../expressions/stringExpressions.scala | 4 ++ .../sql-tests/results/operators.sql.out | 4 +- .../scalar-subquery-predicate.sql.out | 45 ++++++++++--------- 4 files changed, 30 insertions(+), 25 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 39d5e4ed56628..5fa75fe348e68 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -141,7 +141,7 @@ statement (LIKE? pattern=STRING)? #showTables | SHOW TABLE EXTENDED ((FROM | IN) db=identifier)? LIKE pattern=STRING partitionSpec? #showTable - | SHOW DATABASES (LIKE? pattern=STRING)? #showDatabases + | SHOW DATABASES (LIKE? pattern=STRING)? #showDatabases | SHOW TBLPROPERTIES table=tableIdentifier ('(' key=tablePropertyKey ')')? #showTblProperties | SHOW COLUMNS (FROM | IN) tableIdentifier diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index e004bfc6af473..5cf783f1a5979 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1708,6 +1708,8 @@ case class BitLength(child: Expression) extends UnaryExpression with ImplicitCas case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length * 8") } } + + override def prettyName: String = "bit_length" } /** @@ -1735,6 +1737,8 @@ case class OctetLength(child: Expression) extends UnaryExpression with ImplicitC case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length") } } + + override def prettyName: String = "octet_length" } /** diff --git a/sql/core/src/test/resources/sql-tests/results/operators.sql.out b/sql/core/src/test/resources/sql-tests/results/operators.sql.out index 237b618a8b904..840655b7a6447 100644 --- a/sql/core/src/test/resources/sql-tests/results/operators.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/operators.sql.out @@ -425,7 +425,7 @@ struct<(7 % 2):int,(7 % 0):int,(0 % 2):int,(7 % CAST(NULL AS INT)):int,(CAST(NUL -- !query 51 select BIT_LENGTH('abc') -- !query 51 schema -struct +struct -- !query 51 output 24 @@ -449,7 +449,7 @@ struct -- !query 54 select OCTET_LENGTH('abc') -- !query 54 schema -struct +struct -- !query 54 output 3 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out index a2b86db3e4f4c..dd82efba0dde1 100644 --- 
a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 29 +-- Number of queries: 27 -- !query 0 @@ -307,7 +307,8 @@ struct val1c val1d --- !query 22 + +-- !query 20 SELECT count(t1a) FROM t1 RIGHT JOIN t2 ON t1d = t2d @@ -315,13 +316,13 @@ WHERE t1a < (SELECT max(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 22 schema +-- !query 20 schema struct --- !query 22 output +-- !query 20 output 7 --- !query 23 +-- !query 21 SELECT t1a FROM t1 WHERE t1b <= (SELECT max(t2b) @@ -332,14 +333,14 @@ AND t1b >= (SELECT min(t2b) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 23 schema +-- !query 21 schema struct --- !query 23 output +-- !query 21 output val1b val1c --- !query 24 +-- !query 22 SELECT t1a FROM t1 WHERE t1a <= (SELECT max(t2a) @@ -353,14 +354,14 @@ WHERE t1a >= (SELECT min(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 24 schema +-- !query 22 schema struct --- !query 24 output +-- !query 22 output val1b val1c --- !query 25 +-- !query 23 SELECT t1a FROM t1 WHERE t1a <= (SELECT max(t2a) @@ -374,9 +375,9 @@ WHERE t1a >= (SELECT min(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 25 schema +-- !query 23 schema struct --- !query 25 output +-- !query 23 output val1a val1a val1b @@ -387,7 +388,7 @@ val1d val1d --- !query 26 +-- !query 24 SELECT t1a FROM t1 WHERE t1a <= (SELECT max(t2a) @@ -401,16 +402,16 @@ WHERE t1a >= (SELECT min(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 26 schema +-- !query 24 schema struct --- !query 26 output +-- !query 24 output val1a val1b val1c val1d --- !query 27 +-- !query 25 SELECT t1a FROM t1 WHERE t1a <= (SELECT max(t2a) @@ -424,13 +425,13 @@ WHERE t1a >= (SELECT min(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 27 schema +-- !query 25 schema struct --- !query 27 output +-- !query 25 output val1a --- !query 28 +-- !query 26 SELECT t1a FROM t1 GROUP BY t1a, t1c @@ -438,8 +439,8 @@ HAVING max(t1b) <= (SELECT max(t2b) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 28 schema +-- !query 26 schema struct --- !query 28 output +-- !query 26 output val1b val1c From f8f522c01025e78eca1724c909c749374f855039 Mon Sep 17 00:00:00 2001 From: Bago Amirbekian Date: Tue, 23 Jan 2018 14:11:23 -0800 Subject: [PATCH 167/566] [SPARK-22735][ML][DOC] Added VectorSizeHint docs and examples. ## What changes were proposed in this pull request? Added documentation for new transformer. Author: Bago Amirbekian Closes #20285 from MrBago/sizeHintDocs. (cherry picked from commit 05839d164836e544af79c13de25802552eadd636) Signed-off-by: Xiangrui Meng --- docs/ml-features.md | 51 ++++++++++++ .../ml/JavaVectorSizeHintExample.java | 79 +++++++++++++++++++ .../python/ml/vector_size_hint_example.py | 57 +++++++++++++ .../examples/ml/VectorSizeHintExample.scala | 63 +++++++++++++++ 4 files changed, 250 insertions(+) create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java create mode 100644 examples/src/main/python/ml/vector_size_hint_example.py create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala diff --git a/docs/ml-features.md b/docs/ml-features.md index 466a8fbe99cf6..3370eb3893272 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1291,6 +1291,57 @@ for more details on the API.
+## VectorSizeHint
+
+It can sometimes be useful to explicitly specify the size of the vectors for a column of
+`VectorType`. For example, `VectorAssembler` uses size information from its input columns to
+produce size information and metadata for its output column. While in some cases this information
+can be obtained by inspecting the contents of the column, in a streaming dataframe the contents are
+not available until the stream is started. `VectorSizeHint` allows a user to explicitly specify the
+vector size for a column so that `VectorAssembler`, or other transformers that might
+need to know vector size, can use that column as an input.
+
+To use `VectorSizeHint` a user must set the `inputCol` and `size` parameters. Applying this
+transformer to a dataframe produces a new dataframe with updated metadata for `inputCol` specifying
+the vector size. Downstream operations on the resulting dataframe can get this size using the
+metadata.
+
+`VectorSizeHint` can also take an optional `handleInvalid` parameter which controls its
+behaviour when the vector column contains nulls or vectors of the wrong size. By default
+`handleInvalid` is set to "error", indicating an exception should be thrown. This parameter can
+also be set to "skip", indicating that rows containing invalid values should be filtered out from
+the resulting dataframe, or "optimistic", indicating that the column should not be checked for
+invalid values and all rows should be kept. Note that the use of "optimistic" can cause the
+resulting dataframe to be in an inconsistent state, meaning the metadata for the column
+`VectorSizeHint` was applied to does not match the contents of that column. Users should take care
+to avoid this kind of inconsistent state.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+
+Refer to the [VectorSizeHint Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorSizeHint)
+for more details on the API.
+
+{% include_example scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala %}
+</div>
+
+<div data-lang="java" markdown="1">
+
+Refer to the [VectorSizeHint Java docs](api/java/org/apache/spark/ml/feature/VectorSizeHint.html)
+for more details on the API.
+
+{% include_example java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java %}
+</div>
+
+<div data-lang="python" markdown="1">
+
+Refer to the [VectorSizeHint Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.VectorSizeHint)
+for more details on the API.
+
+{% include_example python/ml/vector_size_hint_example.py %}
+</div>
+</div>
+ ## QuantileDiscretizer `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java new file mode 100644 index 0000000000000..d649a2ccbaa72 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.ml; + +import org.apache.spark.sql.SparkSession; + +// $example on$ +import java.util.Arrays; + +import org.apache.spark.ml.feature.VectorAssembler; +import org.apache.spark.ml.feature.VectorSizeHint; +import org.apache.spark.ml.linalg.VectorUDT; +import org.apache.spark.ml.linalg.Vectors; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import static org.apache.spark.sql.types.DataTypes.*; +// $example off$ + +public class JavaVectorSizeHintExample { + public static void main(String[] args) { + SparkSession spark = SparkSession + .builder() + .appName("JavaVectorSizeHintExample") + .getOrCreate(); + + // $example on$ + StructType schema = createStructType(new StructField[]{ + createStructField("id", IntegerType, false), + createStructField("hour", IntegerType, false), + createStructField("mobile", DoubleType, false), + createStructField("userFeatures", new VectorUDT(), false), + createStructField("clicked", DoubleType, false) + }); + Row row0 = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0); + Row row1 = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0), 0.0); + Dataset dataset = spark.createDataFrame(Arrays.asList(row0, row1), schema); + + VectorSizeHint sizeHint = new VectorSizeHint() + .setInputCol("userFeatures") + .setHandleInvalid("skip") + .setSize(3); + + Dataset datasetWithSize = sizeHint.transform(dataset); + System.out.println("Rows where 'userFeatures' is not the right size are filtered out"); + datasetWithSize.show(false); + + VectorAssembler assembler = new VectorAssembler() + .setInputCols(new String[]{"hour", "mobile", "userFeatures"}) + .setOutputCol("features"); + + // This dataframe can be used by downstream transformers as before + Dataset output = assembler.transform(datasetWithSize); + System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " + + "'features'"); + output.select("features", "clicked").show(false); + // $example off$ + + spark.stop(); + } +} + diff --git a/examples/src/main/python/ml/vector_size_hint_example.py 
b/examples/src/main/python/ml/vector_size_hint_example.py new file mode 100644 index 0000000000000..fb77dacec629d --- /dev/null +++ b/examples/src/main/python/ml/vector_size_hint_example.py @@ -0,0 +1,57 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +# $example on$ +from pyspark.ml.linalg import Vectors +from pyspark.ml.feature import (VectorSizeHint, VectorAssembler) +# $example off$ +from pyspark.sql import SparkSession + +if __name__ == "__main__": + spark = SparkSession\ + .builder\ + .appName("VectorSizeHintExample")\ + .getOrCreate() + + # $example on$ + dataset = spark.createDataFrame( + [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0), + (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)], + ["id", "hour", "mobile", "userFeatures", "clicked"]) + + sizeHint = VectorSizeHint( + inputCol="userFeatures", + handleInvalid="skip", + size=3) + + datasetWithSize = sizeHint.transform(dataset) + print("Rows where 'userFeatures' is not the right size are filtered out") + datasetWithSize.show(truncate=False) + + assembler = VectorAssembler( + inputCols=["hour", "mobile", "userFeatures"], + outputCol="features") + + # This dataframe can be used by downstream transformers as before + output = assembler.transform(datasetWithSize) + print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'") + output.select("features", "clicked").show(truncate=False) + # $example off$ + + spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala new file mode 100644 index 0000000000000..688731a791f35 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.ml.feature.{VectorAssembler, VectorSizeHint} +import org.apache.spark.ml.linalg.Vectors +// $example off$ +import org.apache.spark.sql.SparkSession + +object VectorSizeHintExample { + def main(args: Array[String]): Unit = { + val spark = SparkSession + .builder + .appName("VectorSizeHintExample") + .getOrCreate() + + // $example on$ + val dataset = spark.createDataFrame( + Seq( + (0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0), + (0, 18, 1.0, Vectors.dense(0.0, 10.0), 0.0)) + ).toDF("id", "hour", "mobile", "userFeatures", "clicked") + + val sizeHint = new VectorSizeHint() + .setInputCol("userFeatures") + .setHandleInvalid("skip") + .setSize(3) + + val datasetWithSize = sizeHint.transform(dataset) + println("Rows where 'userFeatures' is not the right size are filtered out") + datasetWithSize.show(false) + + val assembler = new VectorAssembler() + .setInputCols(Array("hour", "mobile", "userFeatures")) + .setOutputCol("features") + + // This dataframe can be used by downstream transformers as before + val output = assembler.transform(datasetWithSize) + println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'") + output.select("features", "clicked").show(false) + // $example off$ + + spark.stop() + } +} +// scalastyle:on println From 851c303867eb54405f6508919619debe84708933 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 23 Jan 2018 14:56:28 -0800 Subject: [PATCH 168/566] [SPARK-23192][SQL] Keep the Hint after Using Cached Data ## What changes were proposed in this pull request? The hint of the plan segment is lost, if the plan segment is replaced by the cached data. ```Scala val df1 = spark.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value") val df2 = spark.createDataFrame(Seq((1, "1"), (2, "2"))).toDF("key", "value") df2.cache() val df3 = df1.join(broadcast(df2), Seq("key"), "inner") ``` This PR is to fix it. ## How was this patch tested? Added a test Author: gatorsmile Closes #20365 from gatorsmile/fixBroadcastHintloss. 
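For illustration, a hedged way to check the fixed behaviour from a spark-shell session, continuing the snippet above and mirroring the test added in `BroadcastJoinSuite` (it assumes `spark.sql.autoBroadcastJoinThreshold` is set to `-1`, so only the hint can trigger the broadcast):

```scala
import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec

// With the fix, swapping df2 for its cached representation no longer drops the hint,
// so the physical plan still contains exactly one broadcast hash join.
val numBroadcastJoins = df3.queryExecution.executedPlan.collect {
  case b: BroadcastHashJoinExec => b
}.size
assert(numBroadcastJoins == 1)
```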
(cherry picked from commit 613c290336e3826111164c24319f66774b1f65a3) Signed-off-by: gatorsmile --- .../apache/spark/sql/execution/CacheManager.scala | 12 ++++++++---- .../sql/execution/joins/BroadcastJoinSuite.scala | 13 +++++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala index b05fe49a6ac3b..432eb59d6fe57 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala @@ -26,7 +26,7 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.internal.Logging import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.catalyst.expressions.SubqueryExpression -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, ResolvedHint} import org.apache.spark.sql.execution.columnar.InMemoryRelation import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.storage.StorageLevel @@ -170,9 +170,13 @@ class CacheManager extends Logging { def useCachedData(plan: LogicalPlan): LogicalPlan = { val newPlan = plan transformDown { case currentFragment => - lookupCachedData(currentFragment) - .map(_.cachedRepresentation.withOutput(currentFragment.output)) - .getOrElse(currentFragment) + lookupCachedData(currentFragment).map { cached => + val cachedPlan = cached.cachedRepresentation.withOutput(currentFragment.output) + currentFragment match { + case hint: ResolvedHint => ResolvedHint(cachedPlan, hint.hints) + case _ => cachedPlan + } + }.getOrElse(currentFragment) } newPlan transformAllExpressions { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala index 0bcd54e1fceab..1704bc8376f0d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala @@ -109,6 +109,19 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils { } } + test("broadcast hint is retained after using the cached data") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + val df1 = spark.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value") + val df2 = spark.createDataFrame(Seq((1, "1"), (2, "2"))).toDF("key", "value") + df2.cache() + val df3 = df1.join(broadcast(df2), Seq("key"), "inner") + val numBroadCastHashJoin = df3.queryExecution.executedPlan.collect { + case b: BroadcastHashJoinExec => b + }.size + assert(numBroadCastHashJoin === 1) + } + } + test("broadcast hint isn't propagated after a join") { withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { val df1 = spark.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value") From a23f6b13e8a4f0471ee33879a14746786bbf0435 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 23 Jan 2018 16:17:09 -0800 Subject: [PATCH 169/566] [SPARK-23195][SQL] Keep the Hint of Cached Data ## What changes were proposed in this pull request? The broadcast hint of the cached plan is lost if we cache the plan. This PR is to correct it. 
```Scala val df1 = spark.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value") val df2 = spark.createDataFrame(Seq((1, "1"), (2, "2"))).toDF("key", "value") broadcast(df2).cache() df2.collect() val df3 = df1.join(df2, Seq("key"), "inner") ``` ## How was this patch tested? Added a test. Author: gatorsmile Closes #20368 from gatorsmile/cachedBroadcastHint. (cherry picked from commit 44cc4daf3a03f1a220eef8ce3c86867745db9ab7) Signed-off-by: gatorsmile --- .../execution/columnar/InMemoryRelation.scala | 4 ++-- .../sql/execution/joins/BroadcastJoinSuite.scala | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala index 51928d914841e..5945808c4abfb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala @@ -63,7 +63,7 @@ case class InMemoryRelation( tableName: Option[String])( @transient var _cachedColumnBuffers: RDD[CachedBatch] = null, val batchStats: LongAccumulator = child.sqlContext.sparkContext.longAccumulator, - statsOfPlanToCache: Statistics = null) + statsOfPlanToCache: Statistics) extends logical.LeafNode with MultiInstanceRelation { override protected def innerChildren: Seq[SparkPlan] = Seq(child) @@ -77,7 +77,7 @@ case class InMemoryRelation( // Underlying columnar RDD hasn't been materialized, use the stats from the plan to cache statsOfPlanToCache } else { - Statistics(sizeInBytes = batchStats.value.longValue) + Statistics(sizeInBytes = batchStats.value.longValue, hints = statsOfPlanToCache.hints) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala index 1704bc8376f0d..889cab0489534 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala @@ -139,6 +139,22 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils { } } + test("broadcast hint is retained in a cached plan") { + Seq(true, false).foreach { materialized => + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + val df1 = spark.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value") + val df2 = spark.createDataFrame(Seq((1, "1"), (2, "2"))).toDF("key", "value") + broadcast(df2).cache() + if (materialized) df2.collect() + val df3 = df1.join(df2, Seq("key"), "inner") + val numBroadCastHashJoin = df3.queryExecution.executedPlan.collect { + case b: BroadcastHashJoinExec => b + }.size + assert(numBroadCastHashJoin === 1) + } + } + } + private def assertBroadcastJoin(df : Dataset[Row]) : Unit = { val df1 = spark.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value") val joined = df1.join(df, Seq("key"), "inner") From 3316a9d7104aece977384974cf61e5ec635ad350 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Tue, 23 Jan 2018 16:24:20 -0800 Subject: [PATCH 170/566] [SPARK-23197][DSTREAMS] Increased timeouts to resolve flakiness ## What changes were proposed in this pull request? Increased timeout from 50 ms to 300 ms (50 ms was really too low). ## How was this patch tested? Multiple rounds of tests. Author: Tathagata Das Closes #20371 from tdas/SPARK-23197. 
(cherry picked from commit 15adcc8273e73352e5e1c3fc9915c0b004ec4836) Signed-off-by: Tathagata Das --- .../scala/org/apache/spark/streaming/ReceiverSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala index 145c48e5a9a72..fc6218a33f741 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala @@ -105,13 +105,13 @@ class ReceiverSuite extends TestSuiteBase with TimeLimits with Serializable { assert(executor.errors.head.eq(exception)) // Verify restarting actually stops and starts the receiver - receiver.restart("restarting", null, 100) - eventually(timeout(50 millis), interval(10 millis)) { + receiver.restart("restarting", null, 600) + eventually(timeout(300 millis), interval(10 millis)) { // receiver will be stopped async assert(receiver.isStopped) assert(receiver.onStopCalled) } - eventually(timeout(1000 millis), interval(100 millis)) { + eventually(timeout(1000 millis), interval(10 millis)) { // receiver will be started async assert(receiver.onStartCalled) assert(executor.isReceiverStarted) From 9cfe90e5abb21086711e0efb7ed08026dba96ffc Mon Sep 17 00:00:00 2001 From: neilalex Date: Tue, 23 Jan 2018 22:31:14 -0800 Subject: [PATCH 171/566] [SPARK-21727][R] Allow multi-element atomic vector as column type in SparkR DataFrame MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? A fix to https://issues.apache.org/jira/browse/SPARK-21727, "Operating on an ArrayType in a SparkR DataFrame throws error" ## How was this patch tested? - Ran tests at R\pkg\tests\run-all.R (see below attached results) - Tested the following lines in SparkR, which now seem to execute without error: ``` indices <- 1:4 myDf <- data.frame(indices) myDf$data <- list(rep(0, 20)) mySparkDf <- as.DataFrame(myDf) collect(mySparkDf) ``` [2018-01-22 SPARK-21727 Test Results.txt](https://github.com/apache/spark/files/1653535/2018-01-22.SPARK-21727.Test.Results.txt) felixcheung yanboliang sun-rui shivaram _The contribution is my original work and I license the work to the project under the project’s open source license_ Author: neilalex Closes #20352 from neilalex/neilalex-sparkr-arraytype. 
(cherry picked from commit f54b65c15a732540f7a41a9083eeb7a08feca125) Signed-off-by: Felix Cheung --- R/pkg/R/serialize.R | 11 +++---- R/pkg/tests/fulltests/test_Serde.R | 47 ++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R index 3bbf60d9b668c..263b9b576c0c5 100644 --- a/R/pkg/R/serialize.R +++ b/R/pkg/R/serialize.R @@ -30,14 +30,17 @@ # POSIXct,POSIXlt -> Time # # list[T] -> Array[T], where T is one of above mentioned types +# Multi-element vector of any of the above (except raw) -> Array[T] # environment -> Map[String, T], where T is a native type # jobj -> Object, where jobj is an object created in the backend # nolint end getSerdeType <- function(object) { type <- class(object)[[1]] - if (type != "list") { - type + if (is.atomic(object) & !is.raw(object) & length(object) > 1) { + "array" + } else if (type != "list") { + type } else { # Check if all elements are of same type elemType <- unique(sapply(object, function(elem) { getSerdeType(elem) })) @@ -50,9 +53,7 @@ getSerdeType <- function(object) { } writeObject <- function(con, object, writeType = TRUE) { - # NOTE: In R vectors have same type as objects. So we don't support - # passing in vectors as arrays and instead require arrays to be passed - # as lists. + # NOTE: In R vectors have same type as objects type <- class(object)[[1]] # class of POSIXlt is c("POSIXlt", "POSIXt") # Checking types is needed here, since 'is.na' only handles atomic vectors, # lists and pairlists diff --git a/R/pkg/tests/fulltests/test_Serde.R b/R/pkg/tests/fulltests/test_Serde.R index 6bbd201bf1d82..3577929323b8b 100644 --- a/R/pkg/tests/fulltests/test_Serde.R +++ b/R/pkg/tests/fulltests/test_Serde.R @@ -37,6 +37,53 @@ test_that("SerDe of primitive types", { expect_equal(class(x), "character") }) +test_that("SerDe of multi-element primitive vectors inside R data.frame", { + # vector of integers embedded in R data.frame + indices <- 1L:3L + myDf <- data.frame(indices) + myDf$data <- list(rep(0L, 3L)) + mySparkDf <- as.DataFrame(myDf) + myResultingDf <- collect(mySparkDf) + myDfListedData <- data.frame(indices) + myDfListedData$data <- list(as.list(rep(0L, 3L))) + expect_equal(myResultingDf, myDfListedData) + expect_equal(class(myResultingDf[["data"]][[1]]), "list") + expect_equal(class(myResultingDf[["data"]][[1]][[1]]), "integer") + + # vector of numeric embedded in R data.frame + myDf <- data.frame(indices) + myDf$data <- list(rep(0, 3L)) + mySparkDf <- as.DataFrame(myDf) + myResultingDf <- collect(mySparkDf) + myDfListedData <- data.frame(indices) + myDfListedData$data <- list(as.list(rep(0, 3L))) + expect_equal(myResultingDf, myDfListedData) + expect_equal(class(myResultingDf[["data"]][[1]]), "list") + expect_equal(class(myResultingDf[["data"]][[1]][[1]]), "numeric") + + # vector of logical embedded in R data.frame + myDf <- data.frame(indices) + myDf$data <- list(rep(TRUE, 3L)) + mySparkDf <- as.DataFrame(myDf) + myResultingDf <- collect(mySparkDf) + myDfListedData <- data.frame(indices) + myDfListedData$data <- list(as.list(rep(TRUE, 3L))) + expect_equal(myResultingDf, myDfListedData) + expect_equal(class(myResultingDf[["data"]][[1]]), "list") + expect_equal(class(myResultingDf[["data"]][[1]][[1]]), "logical") + + # vector of character embedded in R data.frame + myDf <- data.frame(indices) + myDf$data <- list(rep("abc", 3L)) + mySparkDf <- as.DataFrame(myDf) + myResultingDf <- collect(mySparkDf) + myDfListedData <- data.frame(indices) + myDfListedData$data <- 
list(as.list(rep("abc", 3L))) + expect_equal(myResultingDf, myDfListedData) + expect_equal(class(myResultingDf[["data"]][[1]]), "list") + expect_equal(class(myResultingDf[["data"]][[1]][[1]]), "character") +}) + test_that("SerDe of list of primitive types", { x <- list(1L, 2L, 3L) y <- callJStatic("SparkRHandler", "echo", x) From d656be74b87746efc020d5cae3bfa294f8f98594 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 23 Jan 2018 22:39:31 -0800 Subject: [PATCH 172/566] Revert "[SPARK-23195][SQL] Keep the Hint of Cached Data" This reverts commit a23f6b13e8a4f0471ee33879a14746786bbf0435. --- .../execution/columnar/InMemoryRelation.scala | 4 ++-- .../sql/execution/joins/BroadcastJoinSuite.scala | 16 ---------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala index 5945808c4abfb..51928d914841e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala @@ -63,7 +63,7 @@ case class InMemoryRelation( tableName: Option[String])( @transient var _cachedColumnBuffers: RDD[CachedBatch] = null, val batchStats: LongAccumulator = child.sqlContext.sparkContext.longAccumulator, - statsOfPlanToCache: Statistics) + statsOfPlanToCache: Statistics = null) extends logical.LeafNode with MultiInstanceRelation { override protected def innerChildren: Seq[SparkPlan] = Seq(child) @@ -77,7 +77,7 @@ case class InMemoryRelation( // Underlying columnar RDD hasn't been materialized, use the stats from the plan to cache statsOfPlanToCache } else { - Statistics(sizeInBytes = batchStats.value.longValue, hints = statsOfPlanToCache.hints) + Statistics(sizeInBytes = batchStats.value.longValue) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala index 889cab0489534..1704bc8376f0d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala @@ -139,22 +139,6 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils { } } - test("broadcast hint is retained in a cached plan") { - Seq(true, false).foreach { materialized => - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { - val df1 = spark.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value") - val df2 = spark.createDataFrame(Seq((1, "1"), (2, "2"))).toDF("key", "value") - broadcast(df2).cache() - if (materialized) df2.collect() - val df3 = df1.join(df2, Seq("key"), "inner") - val numBroadCastHashJoin = df3.queryExecution.executedPlan.collect { - case b: BroadcastHashJoinExec => b - }.size - assert(numBroadCastHashJoin === 1) - } - } - } - private def assertBroadcastJoin(df : Dataset[Row]) : Unit = { val df1 = spark.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value") val joined = df1.join(df, Seq("key"), "inner") From 84a189a3429c64eeabc7b4e8fc0488ec16742002 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 24 Jan 2018 20:29:47 +0900 Subject: [PATCH 173/566] [SPARK-23177][SQL][PYSPARK][BACKPORT-2.3] Extract zero-parameter UDFs from aggregate ## What changes were proposed in this pull request? 
We extract Python UDFs in logical aggregate which depends on aggregate expression or grouping key in ExtractPythonUDFFromAggregate rule. But Python UDFs which don't depend on above expressions should also be extracted to avoid the issue reported in the JIRA. A small code snippet to reproduce that issue looks like: ```python import pyspark.sql.functions as f df = spark.createDataFrame([(1,2), (3,4)]) f_udf = f.udf(lambda: str("const_str")) df2 = df.distinct().withColumn("a", f_udf()) df2.show() ``` Error exception is raised as: ``` : org.apache.spark.sql.catalyst.errors.package$TreeNodeException: Binding attribute, tree: pythonUDF0#50 at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56) at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:91) at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:90) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267) at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70) at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:266) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306) at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187) at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304) at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:272) at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:256) at org.apache.spark.sql.catalyst.expressions.BindReferences$.bindReference(BoundAttribute.scala:90) at org.apache.spark.sql.execution.aggregate.HashAggregateExec$$anonfun$38.apply(HashAggregateExec.scala:514) at org.apache.spark.sql.execution.aggregate.HashAggregateExec$$anonfun$38.apply(HashAggregateExec.scala:513) ``` This exception raises because `HashAggregateExec` tries to bind the aliased Python UDF expression (e.g., `pythonUDF0#50 AS a#44`) to grouping key. ## How was this patch tested? Added test. Author: Liang-Chi Hsieh Closes #20379 from viirya/SPARK-23177-backport-2.3. 
--- python/pyspark/sql/tests.py | 8 ++++++++ .../spark/sql/execution/python/ExtractPythonUDFs.scala | 5 +++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 4fee2ecde391b..fbb18c494f899 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -1100,6 +1100,14 @@ def myudf(x): rows = [r[0] for r in df.selectExpr("udf(id)").take(2)] self.assertEqual(rows, [None, PythonOnlyPoint(1, 1)]) + def test_nonparam_udf_with_aggregate(self): + import pyspark.sql.functions as f + + df = self.spark.createDataFrame([(1, 2), (1, 2)]) + f_udf = f.udf(lambda: "const_str") + rows = df.distinct().withColumn("a", f_udf()).collect() + self.assertEqual(rows, [Row(_1=1, _2=2, a=u'const_str')]) + def test_infer_schema_with_udt(self): from pyspark.sql.tests import ExamplePoint, ExamplePointUDT row = Row(label=1.0, point=ExamplePoint(1.0, 2.0)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala index 2f53fe788c7d0..e6616d013cba6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SparkPlan} /** * Extracts all the Python UDFs in logical aggregate, which depends on aggregate expression or - * grouping key, evaluate them after aggregate. + * grouping key, or doesn't depend on any above expressions, evaluate them after aggregate. */ object ExtractPythonUDFFromAggregate extends Rule[LogicalPlan] { @@ -44,7 +44,8 @@ object ExtractPythonUDFFromAggregate extends Rule[LogicalPlan] { private def hasPythonUdfOverAggregate(expr: Expression, agg: Aggregate): Boolean = { expr.find { - e => e.isInstanceOf[PythonUDF] && e.find(belongAggregate(_, agg)).isDefined + e => e.isInstanceOf[PythonUDF] && + (e.references.isEmpty || e.find(belongAggregate(_, agg)).isDefined) }.isDefined } From 17317c8fb99715836fcebc39ffb04648ab7fb762 Mon Sep 17 00:00:00 2001 From: Henry Robinson Date: Wed, 24 Jan 2018 21:19:09 +0900 Subject: [PATCH 174/566] [SPARK-23148][SQL] Allow pathnames with special characters for CSV / JSON / text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …JSON / text ## What changes were proposed in this pull request? Fix for JSON and CSV data sources when file names include characters that would be changed by URL encoding. ## How was this patch tested? New unit tests for JSON, CSV and text suites Author: Henry Robinson Closes #20355 from henryr/spark-23148. 
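For a rough sense of the scenario those tests cover, a hedged spark-shell sketch (the temp directory and special-character file name are only illustrative, following the name used in `FileBasedDataSourceSuite`):

```scala
import spark.implicits._

val dir = java.nio.file.Files.createTempDirectory("spark-23148").toString
val path = s"$dir/sp&cial%c hars"

spark.createDataset(Seq("a", "b")).write.json(path)

// Before this change, the multiLine code path handed the URL-encoded file name straight to a
// Hadoop Path, so reads of paths like the one above could fail to resolve the underlying files.
spark.read.option("multiLine", true).json(path).show()
```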
(cherry picked from commit de36f65d3a819c00d6bf6979deef46c824203669) Signed-off-by: hyukjinkwon --- .../execution/datasources/CodecStreams.scala | 6 +++--- .../datasources/csv/CSVDataSource.scala | 11 ++++++----- .../datasources/json/JsonDataSource.scala | 10 ++++++---- .../spark/sql/FileBasedDataSourceSuite.scala | 18 ++++++++++++++++-- 4 files changed, 31 insertions(+), 14 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CodecStreams.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CodecStreams.scala index 54549f698aca5..c0df6c779d7bd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CodecStreams.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CodecStreams.scala @@ -45,11 +45,11 @@ object CodecStreams { } /** - * Creates an input stream from the string path and add a closure for the input stream to be + * Creates an input stream from the given path and add a closure for the input stream to be * closed on task completion. */ - def createInputStreamWithCloseResource(config: Configuration, path: String): InputStream = { - val inputStream = createInputStream(config, new Path(path)) + def createInputStreamWithCloseResource(config: Configuration, path: Path): InputStream = { + val inputStream = createInputStream(config, path) Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => inputStream.close())) inputStream } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 2031381dd2e10..4870d75fc5f08 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -17,11 +17,12 @@ package org.apache.spark.sql.execution.datasources.csv +import java.net.URI import java.nio.charset.{Charset, StandardCharsets} import com.univocity.parsers.csv.CsvParser import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.FileStatus +import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.io.{LongWritable, Text} import org.apache.hadoop.mapred.TextInputFormat import org.apache.hadoop.mapreduce.Job @@ -32,7 +33,6 @@ import org.apache.spark.input.{PortableDataStream, StreamInputFormat} import org.apache.spark.rdd.{BinaryFileRDD, RDD} import org.apache.spark.sql.{Dataset, Encoders, SparkSession} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.text.TextFileFormat import org.apache.spark.sql.types.StructType @@ -206,7 +206,7 @@ object MultiLineCSVDataSource extends CSVDataSource { parser: UnivocityParser, schema: StructType): Iterator[InternalRow] = { UnivocityParser.parseStream( - CodecStreams.createInputStreamWithCloseResource(conf, file.filePath), + CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))), parser.options.headerFlag, parser, schema) @@ -218,8 +218,9 @@ object MultiLineCSVDataSource extends CSVDataSource { parsedOptions: CSVOptions): StructType = { val csv = createBaseRdd(sparkSession, inputPaths, parsedOptions) csv.flatMap { lines => + val path = new Path(lines.getPath()) UnivocityParser.tokenizeStream( - 
CodecStreams.createInputStreamWithCloseResource(lines.getConfiguration, lines.getPath()), + CodecStreams.createInputStreamWithCloseResource(lines.getConfiguration, path), shouldDropHeader = false, new CsvParser(parsedOptions.asParserSettings)) }.take(1).headOption match { @@ -230,7 +231,7 @@ object MultiLineCSVDataSource extends CSVDataSource { UnivocityParser.tokenizeStream( CodecStreams.createInputStreamWithCloseResource( lines.getConfiguration, - lines.getPath()), + new Path(lines.getPath())), parsedOptions.headerFlag, new CsvParser(parsedOptions.asParserSettings)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala index 8b7c2709afde1..77e7edc8e7a20 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala @@ -18,11 +18,12 @@ package org.apache.spark.sql.execution.datasources.json import java.io.InputStream +import java.net.URI import com.fasterxml.jackson.core.{JsonFactory, JsonParser} import com.google.common.io.ByteStreams import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.FileStatus +import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce.Job import org.apache.hadoop.mapreduce.lib.input.FileInputFormat @@ -168,9 +169,10 @@ object MultiLineJsonDataSource extends JsonDataSource { } private def createParser(jsonFactory: JsonFactory, record: PortableDataStream): JsonParser = { + val path = new Path(record.getPath()) CreateJacksonParser.inputStream( jsonFactory, - CodecStreams.createInputStreamWithCloseResource(record.getConfiguration, record.getPath())) + CodecStreams.createInputStreamWithCloseResource(record.getConfiguration, path)) } override def readFile( @@ -180,7 +182,7 @@ object MultiLineJsonDataSource extends JsonDataSource { schema: StructType): Iterator[InternalRow] = { def partitionedFileString(ignored: Any): UTF8String = { Utils.tryWithResource { - CodecStreams.createInputStreamWithCloseResource(conf, file.filePath) + CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))) } { inputStream => UTF8String.fromBytes(ByteStreams.toByteArray(inputStream)) } @@ -193,6 +195,6 @@ object MultiLineJsonDataSource extends JsonDataSource { parser.options.columnNameOfCorruptRecord) safeParser.parse( - CodecStreams.createInputStreamWithCloseResource(conf, file.filePath)) + CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath)))) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala index 22fb496bc838e..c272c99ae45a8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala @@ -23,6 +23,7 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { import testImplicits._ private val allFileBasedDataSources = Seq("orc", "parquet", "csv", "json", "text") + private val nameWithSpecialChars = "sp&cial%c hars" allFileBasedDataSources.foreach { format => test(s"Writing empty datasets should not fail - $format") { @@ -54,7 +55,7 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { // Only ORC/Parquet 
support this. `CSV` and `JSON` returns an empty schema. // `TEXT` data source always has a single column whose name is `value`. Seq("orc", "parquet").foreach { format => - test(s"SPARK-15474 Write and read back non-emtpy schema with empty dataframe - $format") { + test(s"SPARK-15474 Write and read back non-empty schema with empty dataframe - $format") { withTempPath { file => val path = file.getCanonicalPath val emptyDf = Seq((true, 1, "str")).toDF().limit(0) @@ -69,7 +70,6 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { allFileBasedDataSources.foreach { format => test(s"SPARK-22146 read files containing special characters using $format") { - val nameWithSpecialChars = s"sp&cial%chars" withTempDir { dir => val tmpFile = s"$dir/$nameWithSpecialChars" spark.createDataset(Seq("a", "b")).write.format(format).save(tmpFile) @@ -78,4 +78,18 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { } } } + + // Separate test case for formats that support multiLine as an option. + Seq("json", "csv").foreach { format => + test("SPARK-23148 read files containing special characters " + + s"using $format with multiline enabled") { + withTempDir { dir => + val tmpFile = s"$dir/$nameWithSpecialChars" + spark.createDataset(Seq("a", "b")).write.format(format).save(tmpFile) + val reader = spark.read.format(format).option("multiLine", true) + val fileContent = reader.load(tmpFile) + checkAnswer(fileContent, Seq(Row("a"), Row("b"))) + } + } + } } From 4336e67f41344fd587808182741ae4ef9fb2b76a Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Wed, 24 Jan 2018 09:37:54 -0800 Subject: [PATCH 175/566] [SPARK-20906][SPARKR] Add API doc example for Constrained Logistic Regression ## What changes were proposed in this pull request? doc only changes ## How was this patch tested? manual Author: Felix Cheung Closes #20380 from felixcheung/rclrdoc. 
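For reference, the Scala API exposes the same coefficient/intercept bounds. A minimal sketch of the binary case mirroring the new R example (illustrative only, not taken from this patch; `training` is assumed to be a DataFrame with the usual `label` and `features` columns):

```
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.{Matrices, Vectors}

// 1 x 4 upper-bound matrix, mirroring the `ubc` matrix in the R example above.
val upperBounds = Matrices.dense(1, 4, Array(1.0, 0.0, 1.0, 0.0))

val boundedLr = new LogisticRegression()
  .setUpperBoundsOnCoefficients(upperBounds)
  .setUpperBoundsOnIntercepts(Vectors.dense(1.0))

// val model = boundedLr.fit(training)  // `training` is a hypothetical labeled DataFrame
```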
(cherry picked from commit e18d6f5326e0d9ea03d31de5ce04cb84d3b8ab37) Signed-off-by: Felix Cheung --- R/pkg/R/mllib_classification.R | 15 ++++++++++++++- R/pkg/tests/fulltests/test_mllib_classification.R | 10 +++++----- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index 7cd072a1d6f89..f6e9b1357561b 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -279,11 +279,24 @@ function(object, path, overwrite = FALSE) { #' savedModel <- read.ml(path) #' summary(savedModel) #' -#' # multinomial logistic regression +#' # binary logistic regression against two classes with +#' # upperBoundsOnCoefficients and upperBoundsOnIntercepts +#' ubc <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4) +#' model <- spark.logit(training, Species ~ ., +#' upperBoundsOnCoefficients = ubc, +#' upperBoundsOnIntercepts = 1.0) #' +#' # multinomial logistic regression #' model <- spark.logit(training, Class ~ ., regParam = 0.5) #' summary <- summary(model) #' +#' # multinomial logistic regression with +#' # lowerBoundsOnCoefficients and lowerBoundsOnIntercepts +#' lbc <- matrix(c(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0), nrow = 2, ncol = 4) +#' lbi <- as.array(c(0.0, 0.0)) +#' model <- spark.logit(training, Species ~ ., family = "multinomial", +#' lowerBoundsOnCoefficients = lbc, +#' lowerBoundsOnIntercepts = lbi) #' } #' @note spark.logit since 2.1.0 setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"), diff --git a/R/pkg/tests/fulltests/test_mllib_classification.R b/R/pkg/tests/fulltests/test_mllib_classification.R index ad47717ddc12f..a46c47dccd02e 100644 --- a/R/pkg/tests/fulltests/test_mllib_classification.R +++ b/R/pkg/tests/fulltests/test_mllib_classification.R @@ -124,7 +124,7 @@ test_that("spark.logit", { # Petal.Width 0.42122607 # nolint end - # Test multinomial logistic regression againt three classes + # Test multinomial logistic regression against three classes df <- suppressWarnings(createDataFrame(iris)) model <- spark.logit(df, Species ~ ., regParam = 0.5) summary <- summary(model) @@ -196,7 +196,7 @@ test_that("spark.logit", { # # nolint end - # Test multinomial logistic regression againt two classes + # Test multinomial logistic regression against two classes df <- suppressWarnings(createDataFrame(iris)) training <- df[df$Species %in% c("versicolor", "virginica"), ] model <- spark.logit(training, Species ~ ., regParam = 0.5, family = "multinomial") @@ -208,7 +208,7 @@ test_that("spark.logit", { expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1)) expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1)) - # Test binomial logistic regression againt two classes + # Test binomial logistic regression against two classes model <- spark.logit(training, Species ~ ., regParam = 0.5) summary <- summary(model) coefsR <- c(-6.08, 0.25, 0.16, 0.48, 1.04) @@ -239,7 +239,7 @@ test_that("spark.logit", { prediction2 <- collect(select(predict(model2, df2), "prediction")) expect_equal(sort(prediction2$prediction), c("0.0", "0.0", "0.0", "0.0", "0.0")) - # Test binomial logistic regression againt two classes with upperBoundsOnCoefficients + # Test binomial logistic regression against two classes with upperBoundsOnCoefficients # and upperBoundsOnIntercepts u <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4) model <- spark.logit(training, Species ~ ., upperBoundsOnCoefficients = u, @@ -252,7 +252,7 @@ test_that("spark.logit", { expect_error(spark.logit(training, 
Species ~ ., upperBoundsOnCoefficients = as.array(c(1, 2)), upperBoundsOnIntercepts = 1.0)) - # Test binomial logistic regression againt two classes with lowerBoundsOnCoefficients + # Test binomial logistic regression against two classes with lowerBoundsOnCoefficients # and lowerBoundsOnIntercepts l <- matrix(c(0.0, -1.0, 0.0, -1.0), nrow = 1, ncol = 4) model <- spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = l, From 2221a30352f1c0f5483c91301f32e66672a43644 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Wed, 24 Jan 2018 10:00:42 -0800 Subject: [PATCH 176/566] [SPARK-23020][CORE][FOLLOWUP] Fix Java style check issues. ## What changes were proposed in this pull request? This is a follow-up of #20297 which broke lint-java checks. This pr fixes the lint-java issues. ``` [ERROR] src/test/java/org/apache/spark/launcher/BaseSuite.java:[21,8] (imports) UnusedImports: Unused import - java.util.concurrent.TimeUnit. [ERROR] src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java:[27,8] (imports) UnusedImports: Unused import - java.util.concurrent.TimeUnit. ``` ## How was this patch tested? Checked manually in my local environment. Author: Takuya UESHIN Closes #20376 from ueshin/issues/SPARK-23020/fup1. (cherry picked from commit 8c273b4162b6138c4abba64f595c2750d1ef8bcb) Signed-off-by: Marcelo Vanzin --- .../test/java/org/apache/spark/launcher/SparkLauncherSuite.java | 1 - launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java | 1 - 2 files changed, 2 deletions(-) diff --git a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java index a042375c6ae91..1543f4fdb0162 100644 --- a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java +++ b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java @@ -24,7 +24,6 @@ import java.util.List; import java.util.Map; import java.util.Properties; -import java.util.concurrent.TimeUnit; import org.junit.Test; import static org.junit.Assert.*; diff --git a/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java b/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java index 3722a59d9438e..438349e027a24 100644 --- a/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java +++ b/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java @@ -18,7 +18,6 @@ package org.apache.spark.launcher; import java.time.Duration; -import java.util.concurrent.TimeUnit; import org.junit.After; import org.slf4j.bridge.SLF4JBridgeHandler; From 30272c668b2cd8c0b0ee78c600bc3feb17bd6647 Mon Sep 17 00:00:00 2001 From: zuotingbing Date: Wed, 24 Jan 2018 10:07:24 -0800 Subject: [PATCH 177/566] [SPARK-22837][SQL] Session timeout checker does not work in SessionManager. ## What changes were proposed in this pull request? Currently we do not call the `super.init(hiveConf)` in `SparkSQLSessionManager.init`. So we do not load the config `HIVE_SERVER2_SESSION_CHECK_INTERVAL HIVE_SERVER2_IDLE_SESSION_TIMEOUT HIVE_SERVER2_IDLE_SESSION_CHECK_OPERATION` , which cause the session timeout checker does not work. ## How was this patch tested? manual tests Author: zuotingbing Closes #20025 from zuotingbing/SPARK-22837. 
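To make the affected settings concrete, here is a rough sketch of the idle-session configuration that the restored `super.init(hiveConf)` call makes effective. The property keys are assumed to be the standard names behind the ConfVars listed above, and the values are arbitrary examples rather than anything taken from this patch:

```
import org.apache.hadoop.hive.conf.HiveConf

val hiveConf = new HiveConf()
// Assumed keys for HIVE_SERVER2_SESSION_CHECK_INTERVAL, HIVE_SERVER2_IDLE_SESSION_TIMEOUT and
// HIVE_SERVER2_IDLE_SESSION_CHECK_OPERATION; the values here are illustrative only.
hiveConf.set("hive.server2.session.check.interval", "1h")
hiveConf.set("hive.server2.idle.session.timeout", "8h")
hiveConf.set("hive.server2.idle.session.check.operation", "true")
// With the fix, SparkSQLSessionManager.init delegates to SessionManager.init, which reads these
// settings and schedules the session timeout checker.
```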
(cherry picked from commit bbb87b350d9d0d393db3fb7ca61dcbae538553bb) Signed-off-by: gatorsmile --- .../thriftserver/SparkSQLSessionManager.scala | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala index 48c0ebef3e0ce..2958b771f3648 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala @@ -40,22 +40,8 @@ private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, sqlContext: private lazy val sparkSqlOperationManager = new SparkSQLOperationManager() override def init(hiveConf: HiveConf) { - setSuperField(this, "hiveConf", hiveConf) - - // Create operation log root directory, if operation logging is enabled - if (hiveConf.getBoolVar(ConfVars.HIVE_SERVER2_LOGGING_OPERATION_ENABLED)) { - invoke(classOf[SessionManager], this, "initOperationLogRootDir") - } - - val backgroundPoolSize = hiveConf.getIntVar(ConfVars.HIVE_SERVER2_ASYNC_EXEC_THREADS) - setSuperField(this, "backgroundOperationPool", Executors.newFixedThreadPool(backgroundPoolSize)) - getAncestorField[Log](this, 3, "LOG").info( - s"HiveServer2: Async execution pool size $backgroundPoolSize") - setSuperField(this, "operationManager", sparkSqlOperationManager) - addService(sparkSqlOperationManager) - - initCompositeService(hiveConf) + super.init(hiveConf) } override def openSession( From 500c94434d8f5267b1488accd176cf54b69e6ba4 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Wed, 24 Jan 2018 12:58:44 -0800 Subject: [PATCH 178/566] [SPARK-23198][SS][TEST] Fix KafkaContinuousSourceStressForDontFailOnDataLossSuite to test ContinuousExecution ## What changes were proposed in this pull request? Currently, `KafkaContinuousSourceStressForDontFailOnDataLossSuite` runs on `MicroBatchExecution`. It should test `ContinuousExecution`. ## How was this patch tested? Pass the updated test suite. Author: Dongjoon Hyun Closes #20374 from dongjoon-hyun/SPARK-23198. (cherry picked from commit bc9641d9026aeae3571915b003ac971f6245d53c) Signed-off-by: Shixiong Zhu --- .../apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala index b3dade414f625..a7083fa4e3417 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala @@ -91,6 +91,7 @@ class KafkaContinuousSourceStressForDontFailOnDataLossSuite ds.writeStream .format("memory") .queryName("memory") + .trigger(Trigger.Continuous("1 second")) .start() } } From a857ad56621f644a26b9d27079b76ab21f3726ae Mon Sep 17 00:00:00 2001 From: caoxuewen Date: Wed, 24 Jan 2018 13:06:09 -0800 Subject: [PATCH 179/566] [MINOR][SQL] add new unit test to LimitPushdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? 
This PR makes the following changes: 1. Update y -> x in the "left outer join" test case, which appears to have been a mistake. 2. Add a new test case: "left outer join and left sides are limited". 3. Add a new test case: "left outer join and right sides are limited". 4. Add a new test case: "right outer join and right sides are limited". 5. Add a new test case: "right outer join and left sides are limited". 6. Remove a comment that describes behavior which is not implemented. ## How was this patch tested? Added new unit test cases. Author: caoxuewen Closes #20381 from heary-cao/LimitPushdownSuite. (cherry picked from commit 6f0ba8472d1128551fa8090deebcecde0daebc53) Signed-off-by: gatorsmile --- .../sql/catalyst/optimizer/Optimizer.scala | 1 - .../optimizer/LimitPushdownSuite.scala | 30 ++++++++++++++++++- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 0f9daa5f04c76..8d207708c12ad 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -352,7 +352,6 @@ object LimitPushDown extends Rule[LogicalPlan] { // on both sides if it is applied multiple times. Therefore: // - If one side is already limited, stack another limit on top if the new limit is smaller. // The redundant limit will be collapsed by the CombineLimits rule. - // - If neither side is limited, limit the side that is estimated to be bigger. case LocalLimit(exp, join @ Join(left, right, joinType, _)) => val newJoin = joinType match { case RightOuter => join.copy(right = maybePushLocalLimit(exp, right)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala index cc98d2350c777..17fb9fc5d11e3 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala @@ -93,7 +93,21 @@ class LimitPushdownSuite extends PlanTest { test("left outer join") { val originalQuery = x.join(y, LeftOuter).limit(1) val optimized = Optimize.execute(originalQuery.analyze) - val correctAnswer = Limit(1, LocalLimit(1, y).join(y, LeftOuter)).analyze + val correctAnswer = Limit(1, LocalLimit(1, x).join(y, LeftOuter)).analyze comparePlans(optimized, correctAnswer) + } + + test("left outer join and left sides are limited") { + val originalQuery = x.limit(2).join(y, LeftOuter).limit(1) + val optimized = Optimize.execute(originalQuery.analyze) + val correctAnswer = Limit(1, LocalLimit(1, x).join(y, LeftOuter)).analyze + comparePlans(optimized, correctAnswer) + } + + test("left outer join and right sides are limited") { + val originalQuery = x.join(y.limit(2), LeftOuter).limit(1) + val optimized = Optimize.execute(originalQuery.analyze) + val correctAnswer = Limit(1, LocalLimit(1, x).join(Limit(2, y), LeftOuter)).analyze comparePlans(optimized, correctAnswer) } @@ -104,6 +118,20 @@ comparePlans(optimized, correctAnswer) } + test("right outer join and right sides are limited") { + val originalQuery = x.join(y.limit(2), RightOuter).limit(1) + val optimized = Optimize.execute(originalQuery.analyze) + val correctAnswer = Limit(1, x.join(LocalLimit(1, y), RightOuter)).analyze +
comparePlans(optimized, correctAnswer) + } + + test("right outer join and left sides are limited") { + val originalQuery = x.limit(2).join(y, RightOuter).limit(1) + val optimized = Optimize.execute(originalQuery.analyze) + val correctAnswer = Limit(1, Limit(2, x).join(LocalLimit(1, y), RightOuter)).analyze + comparePlans(optimized, correctAnswer) + } + test("larger limits are not pushed on top of smaller ones in right outer join") { val originalQuery = x.join(y.limit(5), RightOuter).limit(10) val optimized = Optimize.execute(originalQuery.analyze) From 012695256a61f1830ff02780611d4aada00a88a0 Mon Sep 17 00:00:00 2001 From: zhoukang Date: Thu, 25 Jan 2018 15:24:52 +0800 Subject: [PATCH 180/566] [SPARK-23129][CORE] Make deserializeStream of DiskMapIterator init lazily ## What changes were proposed in this pull request? Currently,the deserializeStream in ExternalAppendOnlyMap#DiskMapIterator init when DiskMapIterator instance created.This will cause memory use overhead when ExternalAppendOnlyMap spill too much times. We can avoid this by making deserializeStream init when it is used the first time. This patch make deserializeStream init lazily. ## How was this patch tested? Exist tests Author: zhoukang Closes #20292 from caneGuy/zhoukang/lay-diskmapiterator. (cherry picked from commit 45b4bbfddc18a77011c3bc1bfd71b2cd3466443c) Signed-off-by: Wenchen Fan --- .../util/collection/ExternalAppendOnlyMap.scala | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala index 375f4a6921225..5c6dd45ec58e3 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala @@ -463,7 +463,7 @@ class ExternalAppendOnlyMap[K, V, C]( // An intermediate stream that reads from exactly one batch // This guards against pre-fetching and other arbitrary behavior of higher level streams - private var deserializeStream = nextBatchStream() + private var deserializeStream: DeserializationStream = null private var nextItem: (K, C) = null private var objectsRead = 0 @@ -528,7 +528,11 @@ class ExternalAppendOnlyMap[K, V, C]( override def hasNext: Boolean = { if (nextItem == null) { if (deserializeStream == null) { - return false + // In case of deserializeStream has not been initialized + deserializeStream = nextBatchStream() + if (deserializeStream == null) { + return false + } } nextItem = readNextItem() } @@ -536,19 +540,18 @@ class ExternalAppendOnlyMap[K, V, C]( } override def next(): (K, C) = { - val item = if (nextItem == null) readNextItem() else nextItem - if (item == null) { + if (!hasNext) { throw new NoSuchElementException } + val item = nextItem nextItem = null item } private def cleanup() { batchIndex = batchOffsets.length // Prevent reading any other batch - val ds = deserializeStream - if (ds != null) { - ds.close() + if (deserializeStream != null) { + deserializeStream.close() deserializeStream = null } if (fileStream != null) { From abd3e1bb5fc11a3074ee64a5ed0dc97020dca61c Mon Sep 17 00:00:00 2001 From: Herman van Hovell Date: Thu, 25 Jan 2018 16:40:41 +0800 Subject: [PATCH 181/566] [SPARK-23208][SQL] Fix code generation for complex create array (related) expressions ## What changes were proposed in this pull request? 
The `GenArrayData.genCodeToCreateArrayData` produces illegal java code when code splitting is enabled. This is used in `CreateArray` and `CreateMap` expressions for complex object arrays. This issue is caused by a typo. ## How was this patch tested? Added a regression test in `complexTypesSuite`. Author: Herman van Hovell Closes #20391 from hvanhovell/SPARK-23208. (cherry picked from commit e29b08add92462a6505fef966629e74ba30e994e) Signed-off-by: Wenchen Fan --- .../sql/catalyst/expressions/complexTypeCreator.scala | 2 +- .../spark/sql/catalyst/optimizer/complexTypesSuite.scala | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 3dc2ee03a86e3..047b80ac5289c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -111,7 +111,7 @@ private [sql] object GenArrayData { val assignmentString = ctx.splitExpressionsWithCurrentInputs( expressions = assignments, funcName = "apply", - extraArguments = ("Object[]", arrayDataName) :: Nil) + extraArguments = ("Object[]", arrayName) :: Nil) (s"Object[] $arrayName = new Object[$numElements];", assignmentString, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala index 0d11958876ce9..de544ac314789 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Range} import org.apache.spark.sql.catalyst.rules.RuleExecutor +import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.types._ /** @@ -31,7 +32,7 @@ import org.apache.spark.sql.types._ * i.e. {{{create_named_struct(square, `x` * `x`).square}}} can be simplified to {{{`x` * `x`}}}. * sam applies to create_array and create_map */ -class ComplexTypesSuite extends PlanTest{ +class ComplexTypesSuite extends PlanTest with ExpressionEvalHelper { object Optimizer extends RuleExecutor[LogicalPlan] { val batches = @@ -171,6 +172,11 @@ class ComplexTypesSuite extends PlanTest{ assert(ctx.inlinedMutableStates.length == 0) } + test("SPARK-23208: Test code splitting for create array related methods") { + val inputs = (1 to 2500).map(x => Literal(s"l_$x")) + checkEvaluation(CreateArray(inputs), new GenericArrayData(inputs.map(_.eval()))) + } + test("simplify map ops") { val rel = relation .select( From e66c66cd2d4f0cd67cbc2aa6f95135176f1165e4 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 25 Jan 2018 01:48:11 -0800 Subject: [PATCH 182/566] [SPARK-23163][DOC][PYTHON] Sync ML Python API with Scala ## What changes were proposed in this pull request? This syncs the ML Python API with Scala for differences found after the 2.3 QA audit. ## How was this patch tested? NA Author: Bryan Cutler Closes #20354 from BryanCutler/pyspark-ml-doc-sync-23163. 
(cherry picked from commit 39ee2acf96f1e1496cff8e4d2614d27fca76d43b) Signed-off-by: Felix Cheung --- python/pyspark/ml/evaluation.py | 8 +++++++- python/pyspark/ml/feature.py | 2 +- python/pyspark/ml/fpm.py | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index aa8dbe708a115..0cbce9b40048f 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -334,7 +334,13 @@ class ClusteringEvaluator(JavaEvaluator, HasPredictionCol, HasFeaturesCol, .. note:: Experimental Evaluator for Clustering results, which expects two input - columns: prediction and features. + columns: prediction and features. The metric computes the Silhouette + measure using the squared Euclidean distance. + + The Silhouette is a measure for the validation of the consistency + within clusters. It ranges between 1 and -1, where a value close to + 1 means that the points in a cluster are close to the other points + in the same cluster and far from the points of the other clusters. >>> from pyspark.ml.linalg import Vectors >>> featureAndPredictions = map(lambda x: (Vectors.dense(x[0]), x[1]), diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index eb79b193103e2..da85ba761a145 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -3440,7 +3440,7 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja selectorType = Param(Params._dummy(), "selectorType", "The selector type of the ChisqSelector. " + - "Supported options: numTopFeatures (default), percentile and fpr.", + "Supported options: numTopFeatures (default), percentile, fpr, fdr, fwe.", typeConverter=TypeConverters.toString) numTopFeatures = \ diff --git a/python/pyspark/ml/fpm.py b/python/pyspark/ml/fpm.py index dd7dda5f03124..b8dafd49d354d 100644 --- a/python/pyspark/ml/fpm.py +++ b/python/pyspark/ml/fpm.py @@ -144,7 +144,7 @@ def freqItemsets(self): @since("2.2.0") def associationRules(self): """ - Data with three columns: + DataFrame with three columns: * `antecedent` - Array of the same type as the input column. * `consequent` - Array of the same type as the input column. * `confidence` - Confidence for the rule (`DoubleType`). From c79e771f8952e6773c3a84cc617145216feddbcf Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 25 Jan 2018 19:49:58 +0800 Subject: [PATCH 183/566] [SPARK-21717][SQL] Decouple consume functions of physical operators in whole-stage codegen ## What changes were proposed in this pull request? It has been observed in SPARK-21603 that whole-stage codegen suffers performance degradation, if the generated functions are too long to be optimized by JIT. We basically produce a single function to incorporate generated codes from all physical operators in whole-stage. Thus, it is possibly to grow the size of generated function over a threshold that we can't have JIT optimization for it anymore. This patch is trying to decouple the logic of consuming rows in physical operators to avoid a giant function processing rows. ## How was this patch tested? Added tests. Author: Liang-Chi Hsieh Closes #18931 from viirya/SPARK-21717. 
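A quick way to observe the new behavior, sketched against the flag introduced by this patch (the exact shape of the generated code can vary between plans and Spark builds):

```
import org.apache.spark.sql.execution.debug._

// With splitting enabled, each operator's consume logic is emitted as a separate
// *_doConsume method instead of being inlined into one oversized method body.
spark.conf.set("spark.sql.codegen.splitConsumeFuncByOperator", "true")
val df = spark.range(100).selectExpr("id + 1 AS c0", "id * 2 AS c1").filter("c0 > 10")
df.debugCodegen()  // dump the generated code and look for the per-operator doConsume methods
```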
(cherry picked from commit d20bbc2d87ae6bd56d236a7c3d036b52c5f20ff5) Signed-off-by: Wenchen Fan --- .../expressions/codegen/CodeGenerator.scala | 38 ++++- .../apache/spark/sql/internal/SQLConf.scala | 12 ++ .../sql/execution/WholeStageCodegenExec.scala | 135 +++++++++++++++--- .../execution/WholeStageCodegenSuite.scala | 47 +++++- 4 files changed, 203 insertions(+), 29 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index f96ed7628fda1..4dcbb702893da 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -1245,6 +1245,31 @@ class CodegenContext { "" } } + + /** + * Returns the length of parameters for a Java method descriptor. `this` contributes one unit + * and a parameter of type long or double contributes two units. Besides, for nullable parameter, + * we also need to pass a boolean parameter for the null status. + */ + def calculateParamLength(params: Seq[Expression]): Int = { + def paramLengthForExpr(input: Expression): Int = { + // For a nullable expression, we need to pass in an extra boolean parameter. + (if (input.nullable) 1 else 0) + javaType(input.dataType) match { + case JAVA_LONG | JAVA_DOUBLE => 2 + case _ => 1 + } + } + // Initial value is 1 for `this`. + 1 + params.map(paramLengthForExpr(_)).sum + } + + /** + * In Java, a method descriptor is valid only if it represents method parameters with a total + * length less than a pre-defined constant. + */ + def isValidParamLength(paramLength: Int): Boolean = { + paramLength <= CodeGenerator.MAX_JVM_METHOD_PARAMS_LENGTH + } } /** @@ -1311,26 +1336,29 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin object CodeGenerator extends Logging { // This is the value of HugeMethodLimit in the OpenJDK JVM settings - val DEFAULT_JVM_HUGE_METHOD_LIMIT = 8000 + final val DEFAULT_JVM_HUGE_METHOD_LIMIT = 8000 + + // The max valid length of method parameters in JVM. + final val MAX_JVM_METHOD_PARAMS_LENGTH = 255 // This is the threshold over which the methods in an inner class are grouped in a single // method which is going to be called by the outer class instead of the many small ones - val MERGE_SPLIT_METHODS_THRESHOLD = 3 + final val MERGE_SPLIT_METHODS_THRESHOLD = 3 // The number of named constants that can exist in the class is limited by the Constant Pool // limit, 65,536. We cannot know how many constants will be inserted for a class, so we use a // threshold of 1000k bytes to determine when a function should be inlined to a private, inner // class. - val GENERATED_CLASS_SIZE_THRESHOLD = 1000000 + final val GENERATED_CLASS_SIZE_THRESHOLD = 1000000 // This is the threshold for the number of global variables, whose types are primitive type or // complex type (e.g. 
more than one-dimensional array), that will be placed at the outer class - val OUTER_CLASS_VARIABLES_THRESHOLD = 10000 + final val OUTER_CLASS_VARIABLES_THRESHOLD = 10000 // This is the maximum number of array elements to keep global variables in one Java array // 32767 is the maximum integer value that does not require a constant pool entry in a Java // bytecode instruction - val MUTABLESTATEARRAY_SIZE_LIMIT = 32768 + final val MUTABLESTATEARRAY_SIZE_LIMIT = 32768 /** * Compile the Java source code into a Java class, using Janino. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 1cef09a5bf053..470f88c213561 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -661,6 +661,15 @@ object SQLConf { .intConf .createWithDefault(CodeGenerator.DEFAULT_JVM_HUGE_METHOD_LIMIT) + val WHOLESTAGE_SPLIT_CONSUME_FUNC_BY_OPERATOR = + buildConf("spark.sql.codegen.splitConsumeFuncByOperator") + .internal() + .doc("When true, whole stage codegen would put the logic of consuming rows of each " + + "physical operator into individual methods, instead of a single big method. This can be " + + "used to avoid oversized function that can miss the opportunity of JIT optimization.") + .booleanConf + .createWithDefault(true) + val FILES_MAX_PARTITION_BYTES = buildConf("spark.sql.files.maxPartitionBytes") .doc("The maximum number of bytes to pack into a single partition when reading files.") .longConf @@ -1263,6 +1272,9 @@ class SQLConf extends Serializable with Logging { def hugeMethodLimit: Int = getConf(WHOLESTAGE_HUGE_METHOD_LIMIT) + def wholeStageSplitConsumeFuncByOperator: Boolean = + getConf(WHOLESTAGE_SPLIT_CONSUME_FUNC_BY_OPERATOR) + def tableRelationCacheSize: Int = getConf(StaticSQLConf.FILESOURCE_TABLE_RELATION_CACHE_SIZE) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala index 6102937852347..8ea9e81b2e53b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.execution import java.util.Locale +import scala.collection.mutable + import org.apache.spark.broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow @@ -106,6 +108,31 @@ trait CodegenSupport extends SparkPlan { */ protected def doProduce(ctx: CodegenContext): String + private def prepareRowVar(ctx: CodegenContext, row: String, colVars: Seq[ExprCode]): ExprCode = { + if (row != null) { + ExprCode("", "false", row) + } else { + if (colVars.nonEmpty) { + val colExprs = output.zipWithIndex.map { case (attr, i) => + BoundReference(i, attr.dataType, attr.nullable) + } + val evaluateInputs = evaluateVariables(colVars) + // generate the code to create a UnsafeRow + ctx.INPUT_ROW = row + ctx.currentVars = colVars + val ev = GenerateUnsafeProjection.createCode(ctx, colExprs, false) + val code = s""" + |$evaluateInputs + |${ev.code.trim} + """.stripMargin.trim + ExprCode(code, "false", ev.value) + } else { + // There is no columns + ExprCode("", "false", "unsafeRow") + } + } + } + /** * Consume the generated columns or row from current SparkPlan, call its parent's `doConsume()`. 
* @@ -126,28 +153,7 @@ trait CodegenSupport extends SparkPlan { } } - val rowVar = if (row != null) { - ExprCode("", "false", row) - } else { - if (outputVars.nonEmpty) { - val colExprs = output.zipWithIndex.map { case (attr, i) => - BoundReference(i, attr.dataType, attr.nullable) - } - val evaluateInputs = evaluateVariables(outputVars) - // generate the code to create a UnsafeRow - ctx.INPUT_ROW = row - ctx.currentVars = outputVars - val ev = GenerateUnsafeProjection.createCode(ctx, colExprs, false) - val code = s""" - |$evaluateInputs - |${ev.code.trim} - """.stripMargin.trim - ExprCode(code, "false", ev.value) - } else { - // There is no columns - ExprCode("", "false", "unsafeRow") - } - } + val rowVar = prepareRowVar(ctx, row, outputVars) // Set up the `currentVars` in the codegen context, as we generate the code of `inputVars` // before calling `parent.doConsume`. We can't set up `INPUT_ROW`, because parent needs to @@ -156,13 +162,96 @@ trait CodegenSupport extends SparkPlan { ctx.INPUT_ROW = null ctx.freshNamePrefix = parent.variablePrefix val evaluated = evaluateRequiredVariables(output, inputVars, parent.usedInputs) + + // Under certain conditions, we can put the logic to consume the rows of this operator into + // another function. So we can prevent a generated function too long to be optimized by JIT. + // The conditions: + // 1. The config "spark.sql.codegen.splitConsumeFuncByOperator" is enabled. + // 2. `inputVars` are all materialized. That is guaranteed to be true if the parent plan uses + // all variables in output (see `requireAllOutput`). + // 3. The number of output variables must less than maximum number of parameters in Java method + // declaration. + val confEnabled = SQLConf.get.wholeStageSplitConsumeFuncByOperator + val requireAllOutput = output.forall(parent.usedInputs.contains(_)) + val paramLength = ctx.calculateParamLength(output) + (if (row != null) 1 else 0) + val consumeFunc = if (confEnabled && requireAllOutput && ctx.isValidParamLength(paramLength)) { + constructDoConsumeFunction(ctx, inputVars, row) + } else { + parent.doConsume(ctx, inputVars, rowVar) + } s""" |${ctx.registerComment(s"CONSUME: ${parent.simpleString}")} |$evaluated - |${parent.doConsume(ctx, inputVars, rowVar)} + |$consumeFunc + """.stripMargin + } + + /** + * To prevent concatenated function growing too long to be optimized by JIT. We can separate the + * parent's `doConsume` codes of a `CodegenSupport` operator into a function to call. + */ + private def constructDoConsumeFunction( + ctx: CodegenContext, + inputVars: Seq[ExprCode], + row: String): String = { + val (args, params, inputVarsInFunc) = constructConsumeParameters(ctx, output, inputVars, row) + val rowVar = prepareRowVar(ctx, row, inputVarsInFunc) + + val doConsume = ctx.freshName("doConsume") + ctx.currentVars = inputVarsInFunc + ctx.INPUT_ROW = null + + val doConsumeFuncName = ctx.addNewFunction(doConsume, + s""" + | private void $doConsume(${params.mkString(", ")}) throws java.io.IOException { + | ${parent.doConsume(ctx, inputVarsInFunc, rowVar)} + | } + """.stripMargin) + + s""" + | $doConsumeFuncName(${args.mkString(", ")}); """.stripMargin } + /** + * Returns arguments for calling method and method definition parameters of the consume function. + * And also returns the list of `ExprCode` for the parameters. 
+ */ + private def constructConsumeParameters( + ctx: CodegenContext, + attributes: Seq[Attribute], + variables: Seq[ExprCode], + row: String): (Seq[String], Seq[String], Seq[ExprCode]) = { + val arguments = mutable.ArrayBuffer[String]() + val parameters = mutable.ArrayBuffer[String]() + val paramVars = mutable.ArrayBuffer[ExprCode]() + + if (row != null) { + arguments += row + parameters += s"InternalRow $row" + } + + variables.zipWithIndex.foreach { case (ev, i) => + val paramName = ctx.freshName(s"expr_$i") + val paramType = ctx.javaType(attributes(i).dataType) + + arguments += ev.value + parameters += s"$paramType $paramName" + val paramIsNull = if (!attributes(i).nullable) { + // Use constant `false` without passing `isNull` for non-nullable variable. + "false" + } else { + val isNull = ctx.freshName(s"exprIsNull_$i") + arguments += ev.isNull + parameters += s"boolean $isNull" + isNull + } + + paramVars += ExprCode("", paramIsNull, paramName) + } + (arguments, parameters, paramVars) + } + /** * Returns source code to evaluate all the variables, and clear the code of them, to prevent * them to be evaluated twice. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index 22ca128c27768..242bb48c22942 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -205,7 +205,7 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext { val codeWithShortFunctions = genGroupByCode(3) val (_, maxCodeSize1) = CodeGenerator.compile(codeWithShortFunctions) assert(maxCodeSize1 < SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.defaultValue.get) - val codeWithLongFunctions = genGroupByCode(20) + val codeWithLongFunctions = genGroupByCode(50) val (_, maxCodeSize2) = CodeGenerator.compile(codeWithLongFunctions) assert(maxCodeSize2 > SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.defaultValue.get) } @@ -228,4 +228,49 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext { } } } + + test("Control splitting consume function by operators with config") { + import testImplicits._ + val df = spark.range(10).select(Seq.tabulate(2) {i => ('id + i).as(s"c$i")} : _*) + + Seq(true, false).foreach { config => + withSQLConf(SQLConf.WHOLESTAGE_SPLIT_CONSUME_FUNC_BY_OPERATOR.key -> s"$config") { + val plan = df.queryExecution.executedPlan + val wholeStageCodeGenExec = plan.find(p => p match { + case wp: WholeStageCodegenExec => true + case _ => false + }) + assert(wholeStageCodeGenExec.isDefined) + val code = wholeStageCodeGenExec.get.asInstanceOf[WholeStageCodegenExec].doCodeGen()._2 + assert(code.body.contains("project_doConsume") == config) + } + } + } + + test("Skip splitting consume function when parameter number exceeds JVM limit") { + import testImplicits._ + + Seq((255, false), (254, true)).foreach { case (columnNum, hasSplit) => + withTempPath { dir => + val path = dir.getCanonicalPath + spark.range(10).select(Seq.tabulate(columnNum) {i => ('id + i).as(s"c$i")} : _*) + .write.mode(SaveMode.Overwrite).parquet(path) + + withSQLConf(SQLConf.WHOLESTAGE_MAX_NUM_FIELDS.key -> "255", + SQLConf.WHOLESTAGE_SPLIT_CONSUME_FUNC_BY_OPERATOR.key -> "true") { + val projection = Seq.tabulate(columnNum)(i => s"c$i + c$i as newC$i") + val df = spark.read.parquet(path).selectExpr(projection: _*) + + val plan = df.queryExecution.executedPlan + val 
wholeStageCodeGenExec = plan.find(p => p match { + case wp: WholeStageCodegenExec => true + case _ => false + }) + assert(wholeStageCodeGenExec.isDefined) + val code = wholeStageCodeGenExec.get.asInstanceOf[WholeStageCodegenExec].doCodeGen()._2 + assert(code.body.contains("project_doConsume") == hasSplit) + } + } + } + } } From 8866f9c24673e739ee87c7341d75dd5c133a744f Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Thu, 25 Jan 2018 15:01:22 +0200 Subject: [PATCH 184/566] [SPARK-23112][DOC] Add highlights and migration guide for 2.3 Update ML user guide with highlights and migration guide for `2.3`. ## How was this patch tested? Doc only. Author: Nick Pentreath Closes #20363 from MLnick/SPARK-23112-ml-guide. (cherry picked from commit 8532e26f335b67b74c976712ad82c20ea6dbbf80) Signed-off-by: Nick Pentreath --- docs/ml-guide.md | 78 ++++++++++++++----------------------- docs/ml-migration-guides.md | 23 +++++++++++ 2 files changed, 52 insertions(+), 49 deletions(-) diff --git a/docs/ml-guide.md b/docs/ml-guide.md index f6288e7c32d97..b957445579ffd 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -72,32 +72,31 @@ To use MLlib in Python, you will need [NumPy](http://www.numpy.org) version 1.4 [^1]: To learn more about the benefits and background of system optimised natives, you may wish to watch Sam Halliday's ScalaX talk on [High Performance Linear Algebra in Scala](http://fommil.github.io/scalax14/#/). -# Highlights in 2.2 +# Highlights in 2.3 -The list below highlights some of the new features and enhancements added to MLlib in the `2.2` +The list below highlights some of the new features and enhancements added to MLlib in the `2.3` release of Spark: -* [`ALS`](ml-collaborative-filtering.html) methods for _top-k_ recommendations for all - users or items, matching the functionality in `mllib` - ([SPARK-19535](https://issues.apache.org/jira/browse/SPARK-19535)). - Performance was also improved for both `ml` and `mllib` - ([SPARK-11968](https://issues.apache.org/jira/browse/SPARK-11968) and - [SPARK-20587](https://issues.apache.org/jira/browse/SPARK-20587)) -* [`Correlation`](ml-statistics.html#correlation) and - [`ChiSquareTest`](ml-statistics.html#hypothesis-testing) stats functions for `DataFrames` - ([SPARK-19636](https://issues.apache.org/jira/browse/SPARK-19636) and - [SPARK-19635](https://issues.apache.org/jira/browse/SPARK-19635)) -* [`FPGrowth`](ml-frequent-pattern-mining.html#fp-growth) algorithm for frequent pattern mining - ([SPARK-14503](https://issues.apache.org/jira/browse/SPARK-14503)) -* `GLM` now supports the full `Tweedie` family - ([SPARK-18929](https://issues.apache.org/jira/browse/SPARK-18929)) -* [`Imputer`](ml-features.html#imputer) feature transformer to impute missing values in a dataset - ([SPARK-13568](https://issues.apache.org/jira/browse/SPARK-13568)) -* [`LinearSVC`](ml-classification-regression.html#linear-support-vector-machine) - for linear Support Vector Machine classification - ([SPARK-14709](https://issues.apache.org/jira/browse/SPARK-14709)) -* Logistic regression now supports constraints on the coefficients during training - ([SPARK-20047](https://issues.apache.org/jira/browse/SPARK-20047)) +* Built-in support for reading images into a `DataFrame` was added +([SPARK-21866](https://issues.apache.org/jira/browse/SPARK-21866)). +* [`OneHotEncoderEstimator`](ml-features.html#onehotencoderestimator) was added, and should be +used instead of the existing `OneHotEncoder` transformer. The new estimator supports +transforming multiple columns. 
+* Multiple column support was also added to `QuantileDiscretizer` and `Bucketizer` +([SPARK-22397](https://issues.apache.org/jira/browse/SPARK-22397) and +[SPARK-20542](https://issues.apache.org/jira/browse/SPARK-20542)) +* A new [`FeatureHasher`](ml-features.html#featurehasher) transformer was added + ([SPARK-13969](https://issues.apache.org/jira/browse/SPARK-13969)). +* Added support for evaluating multiple models in parallel when performing cross-validation using +[`TrainValidationSplit` or `CrossValidator`](ml-tuning.html) +([SPARK-19357](https://issues.apache.org/jira/browse/SPARK-19357)). +* Improved support for custom pipeline components in Python (see +[SPARK-21633](https://issues.apache.org/jira/browse/SPARK-21633) and +[SPARK-21542](https://issues.apache.org/jira/browse/SPARK-21542)). +* `DataFrame` functions for descriptive summary statistics over vector columns +([SPARK-19634](https://issues.apache.org/jira/browse/SPARK-19634)). +* Robust linear regression with Huber loss +([SPARK-3181](https://issues.apache.org/jira/browse/SPARK-3181)). # Migration guide @@ -115,36 +114,17 @@ There are no breaking changes. **Deprecations** -There are no deprecations. +* `OneHotEncoder` has been deprecated and will be removed in `3.0`. It has been replaced by the +new [`OneHotEncoderEstimator`](ml-features.html#onehotencoderestimator) +(see [SPARK-13030](https://issues.apache.org/jira/browse/SPARK-13030)). **Note** that +`OneHotEncoderEstimator` will be renamed to `OneHotEncoder` in `3.0` (but +`OneHotEncoderEstimator` will be kept as an alias). **Changes of behavior** * [SPARK-21027](https://issues.apache.org/jira/browse/SPARK-21027): - We are now setting the default parallelism used in `OneVsRest` to be 1 (i.e. serial), in 2.2 and earlier version, - the `OneVsRest` parallelism would be parallelism of the default threadpool in scala. - -## From 2.1 to 2.2 - -### Breaking changes - -There are no breaking changes. - -### Deprecations and changes of behavior - -**Deprecations** - -There are no deprecations. - -**Changes of behavior** - -* [SPARK-19787](https://issues.apache.org/jira/browse/SPARK-19787): - Default value of `regParam` changed from `1.0` to `0.1` for `ALS.train` method (marked `DeveloperApi`). - **Note** this does _not affect_ the `ALS` Estimator or Model, nor MLlib's `ALS` class. -* [SPARK-14772](https://issues.apache.org/jira/browse/SPARK-14772): - Fixed inconsistency between Python and Scala APIs for `Param.copy` method. -* [SPARK-11569](https://issues.apache.org/jira/browse/SPARK-11569): - `StringIndexer` now handles `NULL` values in the same way as unseen values. Previously an exception - would always be thrown regardless of the setting of the `handleInvalid` parameter. + We are now setting the default parallelism used in `OneVsRest` to be 1 (i.e. serial). In 2.2 and + earlier versions, the level of parallelism was set to the default threadpool size in Scala. ## Previous Spark versions diff --git a/docs/ml-migration-guides.md b/docs/ml-migration-guides.md index 687d7c8930362..f4b0df58cf63b 100644 --- a/docs/ml-migration-guides.md +++ b/docs/ml-migration-guides.md @@ -7,6 +7,29 @@ description: MLlib migration guides from before Spark SPARK_VERSION_SHORT The migration guide for the current Spark version is kept on the [MLlib Guide main page](ml-guide.html#migration-guide). +## From 2.1 to 2.2 + +### Breaking changes + +There are no breaking changes. + +### Deprecations and changes of behavior + +**Deprecations** + +There are no deprecations. 
+ +**Changes of behavior** + +* [SPARK-19787](https://issues.apache.org/jira/browse/SPARK-19787): + Default value of `regParam` changed from `1.0` to `0.1` for `ALS.train` method (marked `DeveloperApi`). + **Note** this does _not affect_ the `ALS` Estimator or Model, nor MLlib's `ALS` class. +* [SPARK-14772](https://issues.apache.org/jira/browse/SPARK-14772): + Fixed inconsistency between Python and Scala APIs for `Param.copy` method. +* [SPARK-11569](https://issues.apache.org/jira/browse/SPARK-11569): + `StringIndexer` now handles `NULL` values in the same way as unseen values. Previously an exception + would always be thrown regardless of the setting of the `handleInvalid` parameter. + ## From 2.0 to 2.1 ### Breaking changes From 2f65c20ea74a87729eaf3c9b2aebcfb10c0ecf4b Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Fri, 26 Jan 2018 07:50:48 +0900 Subject: [PATCH 185/566] [SPARK-23081][PYTHON] Add colRegex API to PySpark ## What changes were proposed in this pull request? Add colRegex API to PySpark ## How was this patch tested? add a test in sql/tests.py Author: Huaxin Gao Closes #20390 from huaxingao/spark-23081. (cherry picked from commit 8480c0c57698b7dcccec5483d67b17cf2c7527ed) Signed-off-by: hyukjinkwon --- python/pyspark/sql/dataframe.py | 23 +++++++++++++++++++ .../scala/org/apache/spark/sql/Dataset.scala | 8 +++---- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 2d5e9b91468cf..ac403080acfdf 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -819,6 +819,29 @@ def columns(self): """ return [f.name for f in self.schema.fields] + @since(2.3) + def colRegex(self, colName): + """ + Selects column based on the column name specified as a regex and returns it + as :class:`Column`. + + :param colName: string, column name specified as a regex. + + >>> df = spark.createDataFrame([("a", 1), ("b", 2), ("c", 3)], ["Col1", "Col2"]) + >>> df.select(df.colRegex("`(Col1)?+.+`")).show() + +----+ + |Col2| + +----+ + | 1| + | 2| + | 3| + +----+ + """ + if not isinstance(colName, basestring): + raise ValueError("colName should be provided as string") + jc = self._jdf.colRegex(colName) + return Column(jc) + @ignore_unicode_prefix @since(1.3) def alias(self, alias): diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 912f411fa3845..edb6644ed5ac0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1194,7 +1194,7 @@ class Dataset[T] private[sql]( def orderBy(sortExprs: Column*): Dataset[T] = sort(sortExprs : _*) /** - * Selects column based on the column name and return it as a [[Column]]. + * Selects column based on the column name and returns it as a [[Column]]. * * @note The column name can also reference to a nested column like `a.b`. * @@ -1220,7 +1220,7 @@ class Dataset[T] private[sql]( } /** - * Selects column based on the column name and return it as a [[Column]]. + * Selects column based on the column name and returns it as a [[Column]]. * * @note The column name can also reference to a nested column like `a.b`. * @@ -1240,7 +1240,7 @@ class Dataset[T] private[sql]( } /** - * Selects column based on the column name specified as a regex and return it as [[Column]]. + * Selects column based on the column name specified as a regex and returns it as [[Column]]. 
* @group untypedrel * @since 2.3.0 */ @@ -2729,7 +2729,7 @@ class Dataset[T] private[sql]( } /** - * Return an iterator that contains all rows in this Dataset. + * Returns an iterator that contains all rows in this Dataset. * * The iterator will consume as much memory as the largest partition in this Dataset. * From 26a8b4e398ee6d1de06a5f3ac1d6d342c9b67d78 Mon Sep 17 00:00:00 2001 From: Kris Mok Date: Thu, 25 Jan 2018 16:11:33 -0800 Subject: [PATCH 186/566] [SPARK-23032][SQL] Add a per-query codegenStageId to WholeStageCodegenExec ## What changes were proposed in this pull request? **Proposal** Add a per-query ID to the codegen stages as represented by `WholeStageCodegenExec` operators. This ID will be used in - the explain output of the physical plan, and in - the generated class name. Specifically, this ID will be stable within a query, counting up from 1 in depth-first post-order for all the `WholeStageCodegenExec` inserted into a plan. The ID value 0 is reserved for "free-floating" `WholeStageCodegenExec` objects, which may have been created for one-off purposes, e.g. for fallback handling of codegen stages that failed to codegen the whole stage and wishes to codegen a subset of the children operators (as seen in `org.apache.spark.sql.execution.FileSourceScanExec#doExecute`). Example: for the following query: ```scala scala> spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 1) scala> val df1 = spark.range(10).select('id as 'x, 'id + 1 as 'y).orderBy('x).select('x + 1 as 'z, 'y) df1: org.apache.spark.sql.DataFrame = [z: bigint, y: bigint] scala> val df2 = spark.range(5) df2: org.apache.spark.sql.Dataset[Long] = [id: bigint] scala> val query = df1.join(df2, 'z === 'id) query: org.apache.spark.sql.DataFrame = [z: bigint, y: bigint ... 1 more field] ``` The explain output before the change is: ```scala scala> query.explain == Physical Plan == *SortMergeJoin [z#9L], [id#13L], Inner :- *Sort [z#9L ASC NULLS FIRST], false, 0 : +- Exchange hashpartitioning(z#9L, 200) : +- *Project [(x#3L + 1) AS z#9L, y#4L] : +- *Sort [x#3L ASC NULLS FIRST], true, 0 : +- Exchange rangepartitioning(x#3L ASC NULLS FIRST, 200) : +- *Project [id#0L AS x#3L, (id#0L + 1) AS y#4L] : +- *Range (0, 10, step=1, splits=8) +- *Sort [id#13L ASC NULLS FIRST], false, 0 +- Exchange hashpartitioning(id#13L, 200) +- *Range (0, 5, step=1, splits=8) ``` Note how codegen'd operators are annotated with a prefix `"*"`. See how the `SortMergeJoin` operator and its direct children `Sort` operators are adjacent and all annotated with the `"*"`, so it's hard to tell they're actually in separate codegen stages. and after this change it'll be: ```scala scala> query.explain == Physical Plan == *(6) SortMergeJoin [z#9L], [id#13L], Inner :- *(3) Sort [z#9L ASC NULLS FIRST], false, 0 : +- Exchange hashpartitioning(z#9L, 200) : +- *(2) Project [(x#3L + 1) AS z#9L, y#4L] : +- *(2) Sort [x#3L ASC NULLS FIRST], true, 0 : +- Exchange rangepartitioning(x#3L ASC NULLS FIRST, 200) : +- *(1) Project [id#0L AS x#3L, (id#0L + 1) AS y#4L] : +- *(1) Range (0, 10, step=1, splits=8) +- *(5) Sort [id#13L ASC NULLS FIRST], false, 0 +- Exchange hashpartitioning(id#13L, 200) +- *(4) Range (0, 5, step=1, splits=8) ``` Note that the annotated prefix becomes `"*(id) "`. See how the `SortMergeJoin` operator and its direct children `Sort` operators have different codegen stage IDs. It'll also show up in the name of the generated class, as a suffix in the format of `GeneratedClass$GeneratedIterator$id`. 
For example, note how `GeneratedClass$GeneratedIteratorForCodegenStage3` and `GeneratedClass$GeneratedIteratorForCodegenStage6` in the following stack trace corresponds to the IDs shown in the explain output above: ``` "Executor task launch worker for task 42412957" daemon prio=5 tid=0x58 nid=NA runnable java.lang.Thread.State: RUNNABLE at org.apache.spark.sql.execution.UnsafeExternalRowSorter.insertRow(UnsafeExternalRowSorter.java:109) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.sort_addToSorter$(generated.java:32) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(generated.java:41) at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$9$$anon$1.hasNext(WholeStageCodegenExec.scala:494) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage6.findNextInnerJoinRows$(generated.java:42) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage6.processNext(generated.java:101) at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11$$anon$2.hasNext(WholeStageCodegenExec.scala:513) at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:253) at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247) at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:828) at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:828) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) at org.apache.spark.scheduler.Task.run(Task.scala:109) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:748) ``` **Rationale** Right now, the codegen from Spark SQL lacks the means to differentiate between a couple of things: 1. It's hard to tell which physical operators are in the same WholeStageCodegen stage. Note that this "stage" is a separate notion from Spark's RDD execution stages; this one is only to delineate codegen units. There can be adjacent physical operators that are both codegen'd but are in separate codegen stages. Some of this is due to hacky implementation details, such as the case with `SortMergeJoin` and its `Sort` inputs -- they're hard coded to be split into separate stages although both are codegen'd. When printing out the explain output of the physical plan, you'd only see the codegen'd physical operators annotated with a preceding star (`'*'`) but would have no way to figure out if they're in the same stage. 2. Performance/error diagnosis The generated code has class/method names that are hard to differentiate between queries or even between codegen stages within the same query. If we use a Java-level profiler to collect profiles, or if we encounter a Java-level exception with a stack trace in it, it's really hard to tell which part of a query it's at. 
By introducing a per-query codegen stage ID, we'd at least be able to know which codegen stage (and in turn, which group of physical operators) was a profile tick or an exception happened. The reason why this proposal uses a per-query ID is because it's stable within a query, so that multiple runs of the same query will see the same resulting IDs. This both benefits understandability for users, and also it plays well with the codegen cache in Spark SQL which uses the generated source code as the key. The downside to using per-query IDs as opposed to a per-session or globally incrementing ID is of course we can't tell apart different query runs with this ID alone. But for now I believe this is a good enough tradeoff. ## How was this patch tested? Existing tests. This PR does not involve any runtime behavior changes other than some name changes. The SQL query test suites that compares explain outputs have been updates to ignore the newly added `codegenStageId`. Author: Kris Mok Closes #20224 from rednaxelafx/wsc-codegenstageid. (cherry picked from commit e57f394818b0a62f99609e1032fede7e981f306f) Signed-off-by: gatorsmile --- .../apache/spark/sql/internal/SQLConf.scala | 10 +++ .../sql/execution/DataSourceScanExec.scala | 2 +- .../sql/execution/WholeStageCodegenExec.scala | 85 +++++++++++++++++-- .../columnar/InMemoryTableScanExec.scala | 2 +- .../datasources/v2/DataSourceV2ScanExec.scala | 2 +- .../apache/spark/sql/SQLQueryTestSuite.scala | 3 +- .../execution/WholeStageCodegenSuite.scala | 34 ++++++++ .../columnar/InMemoryColumnarQuerySuite.scala | 2 +- .../sql/hive/execution/HiveExplainSuite.scala | 39 +++++++-- 9 files changed, 158 insertions(+), 21 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 470f88c213561..b0d18b6dced76 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -629,6 +629,14 @@ object SQLConf { .booleanConf .createWithDefault(true) + val WHOLESTAGE_CODEGEN_USE_ID_IN_CLASS_NAME = + buildConf("spark.sql.codegen.useIdInClassName") + .internal() + .doc("When true, embed the (whole-stage) codegen stage ID into " + + "the class name of the generated class as a suffix") + .booleanConf + .createWithDefault(true) + val WHOLESTAGE_MAX_NUM_FIELDS = buildConf("spark.sql.codegen.maxFields") .internal() .doc("The maximum number of fields (including nested fields) that will be supported before" + @@ -1264,6 +1272,8 @@ class SQLConf extends Serializable with Logging { def wholeStageEnabled: Boolean = getConf(WHOLESTAGE_CODEGEN_ENABLED) + def wholeStageUseIdInClassName: Boolean = getConf(WHOLESTAGE_CODEGEN_USE_ID_IN_CLASS_NAME) + def wholeStageMaxNumFields: Int = getConf(WHOLESTAGE_MAX_NUM_FIELDS) def codegenFallback: Boolean = getConf(CODEGEN_FALLBACK) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 7c7d79c2bbd7c..aa66ee7e948ea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -324,7 +324,7 @@ case class FileSourceScanExec( // in the case of fallback, this batched scan should never fail because of: // 1) only primitive types are supported // 2) the number of columns should be smaller than 
spark.sql.codegen.maxFields - WholeStageCodegenExec(this).execute() + WholeStageCodegenExec(this)(codegenStageId = 0).execute() } else { val unsafeRows = { val scan = inputRDD diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala index 8ea9e81b2e53b..0e525b1e22eb9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution import java.util.Locale +import java.util.function.Supplier import scala.collection.mutable @@ -414,6 +415,58 @@ object WholeStageCodegenExec { } } +object WholeStageCodegenId { + // codegenStageId: ID for codegen stages within a query plan. + // It does not affect equality, nor does it participate in destructuring pattern matching + // of WholeStageCodegenExec. + // + // This ID is used to help differentiate between codegen stages. It is included as a part + // of the explain output for physical plans, e.g. + // + // == Physical Plan == + // *(5) SortMergeJoin [x#3L], [y#9L], Inner + // :- *(2) Sort [x#3L ASC NULLS FIRST], false, 0 + // : +- Exchange hashpartitioning(x#3L, 200) + // : +- *(1) Project [(id#0L % 2) AS x#3L] + // : +- *(1) Filter isnotnull((id#0L % 2)) + // : +- *(1) Range (0, 5, step=1, splits=8) + // +- *(4) Sort [y#9L ASC NULLS FIRST], false, 0 + // +- Exchange hashpartitioning(y#9L, 200) + // +- *(3) Project [(id#6L % 2) AS y#9L] + // +- *(3) Filter isnotnull((id#6L % 2)) + // +- *(3) Range (0, 5, step=1, splits=8) + // + // where the ID makes it obvious that not all adjacent codegen'd plan operators are of the + // same codegen stage. + // + // The codegen stage ID is also optionally included in the name of the generated classes as + // a suffix, so that it's easier to associate a generated class back to the physical operator. + // This is controlled by SQLConf: spark.sql.codegen.useIdInClassName + // + // The ID is also included in various log messages. + // + // Within a query, a codegen stage in a plan starts counting from 1, in "insertion order". + // WholeStageCodegenExec operators are inserted into a plan in depth-first post-order. + // See CollapseCodegenStages.insertWholeStageCodegen for the definition of insertion order. + // + // 0 is reserved as a special ID value to indicate a temporary WholeStageCodegenExec object + // is created, e.g. for special fallback handling when an existing WholeStageCodegenExec + // failed to generate/compile code. + + private val codegenStageCounter = ThreadLocal.withInitial(new Supplier[Integer] { + override def get() = 1 // TODO: change to Scala lambda syntax when upgraded to Scala 2.12+ + }) + + def resetPerQuery(): Unit = codegenStageCounter.set(1) + + def getNextStageId(): Int = { + val counter = codegenStageCounter + val id = counter.get() + counter.set(id + 1) + id + } +} + /** * WholeStageCodegen compiles a subtree of plans that support codegen together into single Java * function. @@ -442,7 +495,8 @@ object WholeStageCodegenExec { * `doCodeGen()` will create a `CodeGenContext`, which will hold a list of variables for input, * used to generated code for [[BoundReference]]. 
*/ -case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with CodegenSupport { +case class WholeStageCodegenExec(child: SparkPlan)(val codegenStageId: Int) + extends UnaryExecNode with CodegenSupport { override def output: Seq[Attribute] = child.output @@ -454,6 +508,12 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co "pipelineTime" -> SQLMetrics.createTimingMetric(sparkContext, WholeStageCodegenExec.PIPELINE_DURATION_METRIC)) + def generatedClassName(): String = if (conf.wholeStageUseIdInClassName) { + s"GeneratedIteratorForCodegenStage$codegenStageId" + } else { + "GeneratedIterator" + } + /** * Generates code for this subtree. * @@ -471,19 +531,23 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co } """, inlineToOuterClass = true) + val className = generatedClassName() + val source = s""" public Object generate(Object[] references) { - return new GeneratedIterator(references); + return new $className(references); } - ${ctx.registerComment(s"""Codegend pipeline for\n${child.treeString.trim}""")} - final class GeneratedIterator extends org.apache.spark.sql.execution.BufferedRowIterator { + ${ctx.registerComment( + s"""Codegend pipeline for stage (id=$codegenStageId) + |${this.treeString.trim}""".stripMargin)} + final class $className extends ${classOf[BufferedRowIterator].getName} { private Object[] references; private scala.collection.Iterator[] inputs; ${ctx.declareMutableStates()} - public GeneratedIterator(Object[] references) { + public $className(Object[] references) { this.references = references; } @@ -516,7 +580,7 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co } catch { case _: Exception if !Utils.isTesting && sqlContext.conf.codegenFallback => // We should already saw the error message - logWarning(s"Whole-stage codegen disabled for this plan:\n $treeString") + logWarning(s"Whole-stage codegen disabled for plan (id=$codegenStageId):\n $treeString") return child.execute() } @@ -525,7 +589,7 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co logInfo(s"Found too long generated codes and JIT optimization might not work: " + s"the bytecode size ($maxCodeSize) is above the limit " + s"${sqlContext.conf.hugeMethodLimit}, and the whole-stage codegen was disabled " + - s"for this plan. To avoid this, you can raise the limit " + + s"for this plan (id=$codegenStageId). 
To avoid this, you can raise the limit " + s"`${SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key}`:\n$treeString") child match { // The fallback solution of batch file source scan still uses WholeStageCodegenExec @@ -603,10 +667,12 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co verbose: Boolean, prefix: String = "", addSuffix: Boolean = false): StringBuilder = { - child.generateTreeString(depth, lastChildren, builder, verbose, "*") + child.generateTreeString(depth, lastChildren, builder, verbose, s"*($codegenStageId) ") } override def needStopCheck: Boolean = true + + override protected def otherCopyArgs: Seq[AnyRef] = Seq(codegenStageId.asInstanceOf[Integer]) } @@ -657,13 +723,14 @@ case class CollapseCodegenStages(conf: SQLConf) extends Rule[SparkPlan] { case plan if plan.output.length == 1 && plan.output.head.dataType.isInstanceOf[ObjectType] => plan.withNewChildren(plan.children.map(insertWholeStageCodegen)) case plan: CodegenSupport if supportCodegen(plan) => - WholeStageCodegenExec(insertInputAdapter(plan)) + WholeStageCodegenExec(insertInputAdapter(plan))(WholeStageCodegenId.getNextStageId()) case other => other.withNewChildren(other.children.map(insertWholeStageCodegen)) } def apply(plan: SparkPlan): SparkPlan = { if (conf.wholeStageEnabled) { + WholeStageCodegenId.resetPerQuery() insertWholeStageCodegen(plan) } else { plan diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index 28b3875505cd2..c167f1e7dc621 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -274,7 +274,7 @@ case class InMemoryTableScanExec( protected override def doExecute(): RDD[InternalRow] = { if (supportsBatch) { - WholeStageCodegenExec(this).execute() + WholeStageCodegenExec(this)(codegenStageId = 0).execute() } else { inputRDD } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala index 69d871df3e1dd..2c22239e81869 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala @@ -88,7 +88,7 @@ case class DataSourceV2ScanExec( override protected def doExecute(): RDD[InternalRow] = { if (supportsBatch) { - WholeStageCodegenExec(this).execute() + WholeStageCodegenExec(this)(codegenStageId = 0).execute() } else { val numOutputRows = longMetric("numOutputRows") inputRDD.map { r => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index 054ada56d99ad..beac9699585d5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -230,7 +230,8 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext { .replaceAll("Location.*/sql/core/", s"Location ${notIncludedMsg}sql/core/") .replaceAll("Created By.*", s"Created By $notIncludedMsg") .replaceAll("Created Time.*", s"Created Time $notIncludedMsg") - .replaceAll("Last Access.*", s"Last Access $notIncludedMsg")) + .replaceAll("Last 
Access.*", s"Last Access $notIncludedMsg") + .replaceAll("\\*\\(\\d+\\) ", "*")) // remove the WholeStageCodegen codegenStageIds // If the output is not pre-sorted, sort it. if (isSorted(df.queryExecution.analyzed)) (schema, answer) else (schema, answer.sorted) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index 242bb48c22942..28ad712feaae6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution +import org.apache.spark.metrics.source.CodegenMetrics import org.apache.spark.sql.{QueryTest, Row, SaveMode} import org.apache.spark.sql.catalyst.expressions.codegen.{CodeAndComment, CodeGenerator} import org.apache.spark.sql.execution.aggregate.HashAggregateExec @@ -273,4 +274,37 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext { } } } + + test("codegen stage IDs should be preserved in transformations after CollapseCodegenStages") { + // test case adapted from DataFrameSuite to trigger ReuseExchange + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "2") { + val df = spark.range(100) + val join = df.join(df, "id") + val plan = join.queryExecution.executedPlan + assert(!plan.find(p => + p.isInstanceOf[WholeStageCodegenExec] && + p.asInstanceOf[WholeStageCodegenExec].codegenStageId == 0).isDefined, + "codegen stage IDs should be preserved through ReuseExchange") + checkAnswer(join, df.toDF) + } + } + + test("including codegen stage ID in generated class name should not regress codegen caching") { + import testImplicits._ + + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_USE_ID_IN_CLASS_NAME.key -> "true") { + val bytecodeSizeHisto = CodegenMetrics.METRIC_GENERATED_METHOD_BYTECODE_SIZE + + // the same query run twice should hit the codegen cache + spark.range(3).select('id + 2).collect + val after1 = bytecodeSizeHisto.getCount + spark.range(3).select('id + 2).collect + val after2 = bytecodeSizeHisto.getCount // same query shape as above, deliberately + // bytecodeSizeHisto's count is always monotonically increasing if new compilation to + // bytecode had occurred. If the count stayed the same that means we've got a cache hit. + assert(after1 == after2, "Should hit codegen cache. 
No new compilation to bytecode expected") + + // a different query can result in codegen cache miss, that's by design + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index ff7c5e58e9863..2280da927cf70 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -477,7 +477,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { assert(planBeforeFilter.head.isInstanceOf[InMemoryTableScanExec]) val execPlan = if (enabled == "true") { - WholeStageCodegenExec(planBeforeFilter.head) + WholeStageCodegenExec(planBeforeFilter.head)(codegenStageId = 0) } else { planBeforeFilter.head } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala index a4273de5fe260..f84d188075b72 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala @@ -154,14 +154,39 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } } - test("EXPLAIN CODEGEN command") { - checkKeywordsExist(sql("EXPLAIN CODEGEN SELECT 1"), - "WholeStageCodegen", - "Generated code:", - "/* 001 */ public Object generate(Object[] references) {", - "/* 002 */ return new GeneratedIterator(references);", - "/* 003 */ }" + test("explain output of physical plan should contain proper codegen stage ID") { + checkKeywordsExist(sql( + """ + |EXPLAIN SELECT t1.id AS a, t2.id AS b FROM + |(SELECT * FROM range(3)) t1 JOIN + |(SELECT * FROM range(10)) t2 ON t1.id == t2.id % 3 + """.stripMargin), + "== Physical Plan ==", + "*(2) Project ", + "+- *(2) BroadcastHashJoin ", + " :- BroadcastExchange ", + " : +- *(1) Range ", + " +- *(2) Range " ) + } + + test("EXPLAIN CODEGEN command") { + // the generated class name in this test should stay in sync with + // org.apache.spark.sql.execution.WholeStageCodegenExec.generatedClassName() + for ((useIdInClassName, expectedClassName) <- Seq( + ("true", "GeneratedIteratorForCodegenStage1"), + ("false", "GeneratedIterator"))) { + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_USE_ID_IN_CLASS_NAME.key -> useIdInClassName) { + checkKeywordsExist(sql("EXPLAIN CODEGEN SELECT 1"), + "WholeStageCodegen", + "Generated code:", + "/* 001 */ public Object generate(Object[] references) {", + s"/* 002 */ return new $expectedClassName(references);", + "/* 003 */ }" + ) + } + } checkKeywordsNotExist(sql("EXPLAIN CODEGEN SELECT 1"), "== Physical Plan ==" From 87d128ffd4a56fe3995f041bf4a8c4cba6c20092 Mon Sep 17 00:00:00 2001 From: Sid Murching Date: Thu, 25 Jan 2018 18:15:29 -0600 Subject: [PATCH 187/566] [SPARK-23205][ML] Update ImageSchema.readImages to correctly set alpha values for four-channel images ## What changes were proposed in this pull request? When parsing raw image data in ImageSchema.decode(), we use a [java.awt.Color](https://docs.oracle.com/javase/7/docs/api/java/awt/Color.html#Color(int)) constructor that sets alpha = 255, even for four-channel images (which may have different alpha values). This PR fixes this issue & adds a unit test to verify correctness of reading four-channel images. 
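The underlying `java.awt.Color` constructor behavior can be sketched as follows (illustrative values only; `0x3CF88080` is just a made-up ARGB pixel with alpha = 60, it is not taken from the patch):

```scala
import java.awt.Color

// hypothetical ARGB pixel: alpha = 60 (0x3C), red = 0xF8, green = 0x80, blue = 0x80
val argb = 0x3CF88080
val dropped   = new Color(argb).getAlpha        // 255: the single-int constructor discards the alpha bits (old behavior)
val preserved = new Color(argb, true).getAlpha  // 60: hasAlpha = true keeps them, which is what decode() now does
```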
## How was this patch tested? Updates an existing unit test ("readImages pixel values test" in `ImageSchemaSuite`) to also verify correctness when reading a four-channel image. Author: Sid Murching Closes #20389 from smurching/image-schema-bugfix. (cherry picked from commit 7bd46d9871567597216cc02e1dc72ff5806ecdf8) Signed-off-by: Sean Owen --- data/mllib/images/multi-channel/BGRA_alpha_60.png | Bin 0 -> 747 bytes .../org/apache/spark/ml/image/ImageSchema.scala | 5 ++--- .../apache/spark/ml/image/ImageSchemaSuite.scala | 9 ++++++--- 3 files changed, 8 insertions(+), 6 deletions(-) create mode 100644 data/mllib/images/multi-channel/BGRA_alpha_60.png diff --git a/data/mllib/images/multi-channel/BGRA_alpha_60.png b/data/mllib/images/multi-channel/BGRA_alpha_60.png new file mode 100644 index 0000000000000000000000000000000000000000..913637cd2828ab4e2ff4b2bbd92c4cf362f871c4 GIT binary patch literal 747 zcmV zL3V>M390}vf5ExS_g|iB()io}T zdzc-a9(QkC83JQkyn5Lt1Z5CrRi#xHpD{9oI-@&0jtqe@HUD3>kZs0McGO5TM~1+d z7FE_NXiaZ3z?maMU@%u%R=mvsm?J}AJhD?K_8)UCLtrp7B&y$7129;Iz!*D2yjN8< z0y9X4z_>+*xc3{0=Ex8j*D|Cx*=8j4K{5o!c7|A?%)le4CMiSsY+o_Vo>AV}VFh50 z41v-2iea`H1DYg5U<}czr}rCyox2Qyfqu6aXGTB<$q*RG3>nT0#|)AoFmf{_nrt%+ z=R=0T=#wE~ZFN$PgH=pIpR#r$}~v0vQ6s<%(gm8QC*8vEQiGG6cq@D~4&`dke3xtTJT?jHV1R zn*p1-WHaVkhQK(LAu?mT_I%GyhQKgo$ZgD^p$y@(n<2xRQG=Ep8^{nCn;A09ds8(A zT2-xU83JRGAy_kN4BT(jY8e8f?Uz2S`+1phqfY#&mLV|C{nDp(Kbg^7%Mci;cTmZU z&sv7SNV$V*STh2UAuvMkprV>#Myssnk&_`_<2W4$=?*U$0wXp)$=dj2RgM;~p7O*^V>AfDC~V@@+tF<5118 dqE*&-`~khx#S9AHa*F@}002ovPDHLkV1lO_Oc($F literal 0 HcmV?d00001 diff --git a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala index f7850b238465b..dcc40b6668c7a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala @@ -169,12 +169,11 @@ object ImageSchema { var offset = 0 for (h <- 0 until height) { for (w <- 0 until width) { - val color = new Color(img.getRGB(w, h)) - + val color = new Color(img.getRGB(w, h), hasAlpha) decoded(offset) = color.getBlue.toByte decoded(offset + 1) = color.getGreen.toByte decoded(offset + 2) = color.getRed.toByte - if (nChannels == 4) { + if (hasAlpha) { decoded(offset + 3) = color.getAlpha.toByte } offset += nChannels diff --git a/mllib/src/test/scala/org/apache/spark/ml/image/ImageSchemaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/image/ImageSchemaSuite.scala index dba61cd1eb1cc..a8833c615865d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/image/ImageSchemaSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/image/ImageSchemaSuite.scala @@ -53,11 +53,11 @@ class ImageSchemaSuite extends SparkFunSuite with MLlibTestSparkContext { assert(df.count === 1) df = readImages(imagePath, null, true, -1, false, 1.0, 0) - assert(df.count === 9) + assert(df.count === 10) df = readImages(imagePath, null, true, -1, true, 1.0, 0) val countTotal = df.count - assert(countTotal === 7) + assert(countTotal === 8) df = readImages(imagePath, null, true, -1, true, 0.5, 0) // Random number about half of the size of the original dataset @@ -103,6 +103,9 @@ class ImageSchemaSuite extends SparkFunSuite with MLlibTestSparkContext { -71, -58, -56, -73, -64))), "BGRA.png" -> (("CV_8UC4", Array[Byte](-128, -128, -8, -1, -128, -128, -8, -1, -128, - -128, -8, -1, 127, 127, -9, -1, 127, 127, -9, -1))) + -128, -8, -1, 127, 127, -9, -1, 127, 127, -9, -1))), + "BGRA_alpha_60.png" -> (("CV_8UC4", + Array[Byte](-128, -128, 
-8, 60, -128, -128, -8, 60, -128, + -128, -8, 60, 127, 127, -9, 60, 127, 127, -9, 60))) ) } From fdf140e25998ad45ce0f931e2e1e36bf4b382dca Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 26 Jan 2018 11:58:20 +0800 Subject: [PATCH 188/566] [SPARK-23020][CORE] Fix race in SparkAppHandle cleanup, again. Third time is the charm? There was still a race that was left in previous attempts. If the handle closes the connection, the close() implementation would clean up state that would prevent the thread from waiting on the connection thread to finish. That could cause the race causing the test flakiness reported in the bug. The fix is to move the "wait for connection thread" code to a separate close method that is used by the handle; that also simplifies the code a bit and makes it also easier to follow. I included an unrelated, but correct, change to a YARN test so that it triggers when the PR is built. Tested by inserting a sleep in the connection thread to mimic the race; test failed reliably with the sleep, passes now. (Sleep not included in the patch.) Also ran YARN tests to make sure. Author: Marcelo Vanzin Closes #20388 from vanzin/SPARK-23020. (cherry picked from commit 70a68b328b856c17eb22cc86fee0ebe8d64f8825) Signed-off-by: Wenchen Fan --- .../spark/launcher/AbstractAppHandle.java | 42 ++++++++------ .../spark/launcher/ChildProcAppHandle.java | 11 +--- .../spark/launcher/InProcessAppHandle.java | 9 +-- .../apache/spark/launcher/LauncherServer.java | 55 +++++++++---------- .../spark/deploy/yarn/YarnClusterSuite.scala | 5 +- 5 files changed, 55 insertions(+), 67 deletions(-) diff --git a/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java index daf0972f824dd..84a25a5254151 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.atomic.AtomicReference; import java.util.logging.Level; import java.util.logging.Logger; @@ -29,15 +30,15 @@ abstract class AbstractAppHandle implements SparkAppHandle { private final LauncherServer server; - private LauncherConnection connection; + private LauncherServer.ServerConnection connection; private List listeners; - private State state; + private AtomicReference state; private String appId; private volatile boolean disposed; protected AbstractAppHandle(LauncherServer server) { this.server = server; - this.state = State.UNKNOWN; + this.state = new AtomicReference<>(State.UNKNOWN); } @Override @@ -50,7 +51,7 @@ public synchronized void addListener(Listener l) { @Override public State getState() { - return state; + return state.get(); } @Override @@ -73,7 +74,7 @@ public synchronized void disconnect() { if (!isDisposed()) { if (connection != null) { try { - connection.close(); + connection.closeAndWait(); } catch (IOException ioe) { // no-op. } @@ -82,7 +83,7 @@ public synchronized void disconnect() { } } - void setConnection(LauncherConnection connection) { + void setConnection(LauncherServer.ServerConnection connection) { this.connection = connection; } @@ -99,12 +100,9 @@ boolean isDisposed() { */ synchronized void dispose() { if (!isDisposed()) { - // Unregister first to make sure that the connection with the app has been really - // terminated. 
server.unregister(this); - if (!getState().isFinal()) { - setState(State.LOST); - } + // Set state to LOST if not yet final. + setState(State.LOST, false); this.disposed = true; } } @@ -113,14 +111,24 @@ void setState(State s) { setState(s, false); } - synchronized void setState(State s, boolean force) { - if (force || !state.isFinal()) { - state = s; + void setState(State s, boolean force) { + if (force) { + state.set(s); fireEvent(false); - } else { - LOG.log(Level.WARNING, "Backend requested transition from final state {0} to {1}.", - new Object[] { state, s }); + return; } + + State current = state.get(); + while (!current.isFinal()) { + if (state.compareAndSet(current, s)) { + fireEvent(false); + return; + } + current = state.get(); + } + + LOG.log(Level.WARNING, "Backend requested transition from final state {0} to {1}.", + new Object[] { current, s }); } synchronized void setAppId(String appId) { diff --git a/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java index 2b99461652e1f..5e3c95676ecbe 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java @@ -104,19 +104,12 @@ void monitorChild() { ec = 1; } - State currState = getState(); - State newState = null; if (ec != 0) { + State currState = getState(); // Override state with failure if the current state is not final, or is success. if (!currState.isFinal() || currState == State.FINISHED) { - newState = State.FAILED; + setState(State.FAILED, true); } - } else if (!currState.isFinal()) { - newState = State.LOST; - } - - if (newState != null) { - setState(newState, true); } disconnect(); diff --git a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java index f04263cb74a58..b8030e0063a37 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java @@ -66,14 +66,7 @@ synchronized void start(String appName, Method main, String[] args) { setState(State.FAILED); } - synchronized (InProcessAppHandle.this) { - if (!isDisposed()) { - disconnect(); - if (!getState().isFinal()) { - setState(State.LOST, true); - } - } - } + disconnect(); }); app.setName(String.format(THREAD_NAME_FMT, THREAD_IDS.incrementAndGet(), appName)); diff --git a/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java index 8091885c4f562..f4ecd52fdeab8 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java +++ b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java @@ -218,32 +218,6 @@ void unregister(AbstractAppHandle handle) { } } - // If there is a live connection for this handle, we need to wait for it to finish before - // returning, otherwise there might be a race between the connection thread processing - // buffered data and the handle cleaning up after itself, leading to potentially the wrong - // state being reported for the handle. - ServerConnection conn = null; - synchronized (clients) { - for (ServerConnection c : clients) { - if (c.handle == handle) { - conn = c; - break; - } - } - } - - if (conn != null) { - synchronized (conn) { - if (conn.isOpen()) { - try { - conn.wait(); - } catch (InterruptedException ie) { - // Ignore. 
- } - } - } - } - unref(); } @@ -312,9 +286,10 @@ private String createSecret() { } } - private class ServerConnection extends LauncherConnection { + class ServerConnection extends LauncherConnection { private TimerTask timeout; + private volatile Thread connectionThread; volatile AbstractAppHandle handle; ServerConnection(Socket socket, TimerTask timeout) throws IOException { @@ -322,6 +297,12 @@ private class ServerConnection extends LauncherConnection { this.timeout = timeout; } + @Override + public void run() { + this.connectionThread = Thread.currentThread(); + super.run(); + } + @Override protected void handle(Message msg) throws IOException { try { @@ -376,9 +357,23 @@ public void close() throws IOException { clients.remove(this); } - synchronized (this) { - super.close(); - notifyAll(); + super.close(); + } + + /** + * Close the connection and wait for any buffered data to be processed before returning. + * This ensures any changes reported by the child application take effect. + */ + public void closeAndWait() throws IOException { + close(); + + Thread connThread = this.connectionThread; + if (Thread.currentThread() != connThread) { + try { + connThread.join(); + } catch (InterruptedException ie) { + // Ignore. + } } } diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index e9dcfaf6ba4f0..5003326b440bf 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -45,8 +45,7 @@ import org.apache.spark.util.Utils /** * Integration tests for YARN; these tests use a mini Yarn cluster to run Spark-on-YARN - * applications, and require the Spark assembly to be built before they can be successfully - * run. + * applications. */ @ExtendedYarnTest class YarnClusterSuite extends BaseYarnClusterSuite { @@ -152,7 +151,7 @@ class YarnClusterSuite extends BaseYarnClusterSuite { } test("run Python application in yarn-cluster mode using " + - " spark.yarn.appMasterEnv to override local envvar") { + "spark.yarn.appMasterEnv to override local envvar") { testPySpark( clientMode = false, extraConf = Map( From d6cdc699e350adc6b5ed938192e0e72cff5f52d8 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 26 Jan 2018 12:23:14 +0200 Subject: [PATCH 189/566] [SPARK-22799][ML] Bucketizer should throw exception if single- and multi-column params are both set ## What changes were proposed in this pull request? Currently there is a mixed situation when both single- and multi-column are supported. In some cases exceptions are thrown, in others only a warning log is emitted. In this discussion https://issues.apache.org/jira/browse/SPARK-8418?focusedCommentId=16275049&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-16275049, the decision was to throw an exception. The PR throws an exception in `Bucketizer`, instead of logging a warning. ## How was this patch tested? modified UT Author: Marco Gaido Author: Joseph K. Bradley Closes #19993 from mgaido91/SPARK-22799. 
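As a sketch of the resulting behavior (illustrative column names and splits, not taken from the patch), a `Bucketizer` configured with both the single-column and multi-column params now fails at `transform` time instead of logging a warning:

```scala
import org.apache.spark.ml.feature.Bucketizer

val df = spark.range(3).selectExpr(
  "cast(id as double) as feature1", "cast(id as double) as feature2")

val bucketizer = new Bucketizer()
  .setInputCol("feature1")
  .setOutputCol("result1")
  .setSplits(Array(Double.NegativeInfinity, 0.5, Double.PositiveInfinity))
  .setInputCols(Array("feature1", "feature2"))   // conflicting multi-column param

// Before this change: a warning was logged and inputCols was ignored.
// After this change: transform() throws IllegalArgumentException from transformSchema().
bucketizer.transform(df)
```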
(cherry picked from commit cd3956df0f96dd416b6161bf7ce2962e06d0a62e) Signed-off-by: Nick Pentreath --- .../apache/spark/ml/feature/Bucketizer.scala | 44 +++++------- .../org/apache/spark/ml/param/params.scala | 69 +++++++++++++++++++ .../spark/ml/feature/BucketizerSuite.scala | 41 +++++------ .../apache/spark/ml/param/ParamsSuite.scala | 22 ++++++ 4 files changed, 131 insertions(+), 45 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala index 8299a3e95d822..c13bf47eacb94 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala @@ -32,11 +32,13 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructField, StructType} /** - * `Bucketizer` maps a column of continuous features to a column of feature buckets. Since 2.3.0, + * `Bucketizer` maps a column of continuous features to a column of feature buckets. + * + * Since 2.3.0, * `Bucketizer` can map multiple columns at once by setting the `inputCols` parameter. Note that - * when both the `inputCol` and `inputCols` parameters are set, a log warning will be printed and - * only `inputCol` will take effect, while `inputCols` will be ignored. The `splits` parameter is - * only used for single column usage, and `splitsArray` is for multiple columns. + * when both the `inputCol` and `inputCols` parameters are set, an Exception will be thrown. The + * `splits` parameter is only used for single column usage, and `splitsArray` is for multiple + * columns. */ @Since("1.4.0") final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) @@ -134,28 +136,11 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String @Since("2.3.0") def setOutputCols(value: Array[String]): this.type = set(outputCols, value) - /** - * Determines whether this `Bucketizer` is going to map multiple columns. If and only if - * `inputCols` is set, it will map multiple columns. Otherwise, it just maps a column specified - * by `inputCol`. A warning will be printed if both are set. 
- */ - private[feature] def isBucketizeMultipleColumns(): Boolean = { - if (isSet(inputCols) && isSet(inputCol)) { - logWarning("Both `inputCol` and `inputCols` are set, we ignore `inputCols` and this " + - "`Bucketizer` only map one column specified by `inputCol`") - false - } else if (isSet(inputCols)) { - true - } else { - false - } - } - @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val transformedSchema = transformSchema(dataset.schema) - val (inputColumns, outputColumns) = if (isBucketizeMultipleColumns()) { + val (inputColumns, outputColumns) = if (isSet(inputCols)) { ($(inputCols).toSeq, $(outputCols).toSeq) } else { (Seq($(inputCol)), Seq($(outputCol))) @@ -170,7 +155,7 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String } } - val seqOfSplits = if (isBucketizeMultipleColumns()) { + val seqOfSplits = if (isSet(inputCols)) { $(splitsArray).toSeq } else { Seq($(splits)) @@ -201,9 +186,18 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { - if (isBucketizeMultipleColumns()) { + ParamValidators.checkSingleVsMultiColumnParams(this, Seq(outputCol, splits), + Seq(outputCols, splitsArray)) + + if (isSet(inputCols)) { + require(getInputCols.length == getOutputCols.length && + getInputCols.length == getSplitsArray.length, s"Bucketizer $this has mismatched Params " + + s"for multi-column transform. Params (inputCols, outputCols, splitsArray) should have " + + s"equal lengths, but they have different lengths: " + + s"(${getInputCols.length}, ${getOutputCols.length}, ${getSplitsArray.length}).") + var transformedSchema = schema - $(inputCols).zip($(outputCols)).zipWithIndex.map { case ((inputCol, outputCol), idx) => + $(inputCols).zip($(outputCols)).zipWithIndex.foreach { case ((inputCol, outputCol), idx) => SchemaUtils.checkNumericType(transformedSchema, inputCol) transformedSchema = SchemaUtils.appendColumn(transformedSchema, prepOutputField($(splitsArray)(idx), outputCol)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index 1b4b401ac4aa0..9a83a5882ce29 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -249,6 +249,75 @@ object ParamValidators { def arrayLengthGt[T](lowerBound: Double): Array[T] => Boolean = { (value: Array[T]) => value.length > lowerBound } + + /** + * Utility for Param validity checks for Transformers which have both single- and multi-column + * support. This utility assumes that `inputCol` indicates single-column usage and + * that `inputCols` indicates multi-column usage. + * + * This checks to ensure that exactly one set of Params has been set, and it + * raises an `IllegalArgumentException` if not. + * + * @param singleColumnParams Params which should be set (or have defaults) if `inputCol` has been + * set. This does not need to include `inputCol`. + * @param multiColumnParams Params which should be set (or have defaults) if `inputCols` has been + * set. This does not need to include `inputCols`. 
+ */ + def checkSingleVsMultiColumnParams( + model: Params, + singleColumnParams: Seq[Param[_]], + multiColumnParams: Seq[Param[_]]): Unit = { + val name = s"${model.getClass.getSimpleName} $model" + + def checkExclusiveParams( + isSingleCol: Boolean, + requiredParams: Seq[Param[_]], + excludedParams: Seq[Param[_]]): Unit = { + val badParamsMsgBuilder = new mutable.StringBuilder() + + val mustUnsetParams = excludedParams.filter(p => model.isSet(p)) + .map(_.name).mkString(", ") + if (mustUnsetParams.nonEmpty) { + badParamsMsgBuilder ++= + s"The following Params are not applicable and should not be set: $mustUnsetParams." + } + + val mustSetParams = requiredParams.filter(p => !model.isDefined(p)) + .map(_.name).mkString(", ") + if (mustSetParams.nonEmpty) { + badParamsMsgBuilder ++= + s"The following Params must be defined but are not set: $mustSetParams." + } + + val badParamsMsg = badParamsMsgBuilder.toString() + + if (badParamsMsg.nonEmpty) { + val errPrefix = if (isSingleCol) { + s"$name has the inputCol Param set for single-column transform." + } else { + s"$name has the inputCols Param set for multi-column transform." + } + throw new IllegalArgumentException(s"$errPrefix $badParamsMsg") + } + } + + val inputCol = model.getParam("inputCol") + val inputCols = model.getParam("inputCols") + + if (model.isSet(inputCol)) { + require(!model.isSet(inputCols), s"$name requires " + + s"exactly one of inputCol, inputCols Params to be set, but both are set.") + + checkExclusiveParams(isSingleCol = true, requiredParams = singleColumnParams, + excludedParams = multiColumnParams) + } else if (model.isSet(inputCols)) { + checkExclusiveParams(isSingleCol = false, requiredParams = multiColumnParams, + excludedParams = singleColumnParams) + } else { + throw new IllegalArgumentException(s"$name requires " + + s"exactly one of inputCol, inputCols Params to be set, but neither is set.") + } + } } // specialize primitive-typed params because Java doesn't recognize scala.Double, scala.Int, ... 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala index d9c97ae8067d3..7403680ae3fdc 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala @@ -216,8 +216,6 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa .setOutputCols(Array("result1", "result2")) .setSplitsArray(splits) - assert(bucketizer1.isBucketizeMultipleColumns()) - bucketizer1.transform(dataFrame).select("result1", "expected1", "result2", "expected2") BucketizerSuite.checkBucketResults(bucketizer1.transform(dataFrame), Seq("result1", "result2"), @@ -233,8 +231,6 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa .setOutputCols(Array("result")) .setSplitsArray(Array(splits(0))) - assert(bucketizer2.isBucketizeMultipleColumns()) - withClue("Invalid feature value -0.9 was not caught as an invalid feature!") { intercept[SparkException] { bucketizer2.transform(badDF1).collect() @@ -268,8 +264,6 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa .setOutputCols(Array("result1", "result2")) .setSplitsArray(splits) - assert(bucketizer.isBucketizeMultipleColumns()) - BucketizerSuite.checkBucketResults(bucketizer.transform(dataFrame), Seq("result1", "result2"), Seq("expected1", "expected2")) @@ -295,8 +289,6 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa .setOutputCols(Array("result1", "result2")) .setSplitsArray(splits) - assert(bucketizer.isBucketizeMultipleColumns()) - bucketizer.setHandleInvalid("keep") BucketizerSuite.checkBucketResults(bucketizer.transform(dataFrame), Seq("result1", "result2"), @@ -335,7 +327,6 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa .setInputCols(Array("myInputCol")) .setOutputCols(Array("myOutputCol")) .setSplitsArray(Array(Array(0.1, 0.8, 0.9))) - assert(t.isBucketizeMultipleColumns()) testDefaultReadWrite(t) } @@ -348,8 +339,6 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa .setOutputCols(Array("result1", "result2")) .setSplitsArray(Array(Array(-0.5, 0.0, 0.5), Array(-0.5, 0.0, 0.5))) - assert(bucket.isBucketizeMultipleColumns()) - val pl = new Pipeline() .setStages(Array(bucket)) .fit(df) @@ -401,15 +390,27 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa } } - test("Both inputCol and inputCols are set") { - val bucket = new Bucketizer() - .setInputCol("feature1") - .setOutputCol("result") - .setSplits(Array(-0.5, 0.0, 0.5)) - .setInputCols(Array("feature1", "feature2")) - - // When both are set, we ignore `inputCols` and just map the column specified by `inputCol`. 
- assert(bucket.isBucketizeMultipleColumns() == false) + test("assert exception is thrown if both multi-column and single-column params are set") { + val df = Seq((0.5, 0.3), (0.5, -0.4)).toDF("feature1", "feature2") + ParamsSuite.testExclusiveParams(new Bucketizer, df, ("inputCol", "feature1"), + ("inputCols", Array("feature1", "feature2"))) + ParamsSuite.testExclusiveParams(new Bucketizer, df, ("inputCol", "feature1"), + ("outputCol", "result1"), ("splits", Array(-0.5, 0.0, 0.5)), + ("outputCols", Array("result1", "result2"))) + ParamsSuite.testExclusiveParams(new Bucketizer, df, ("inputCol", "feature1"), + ("outputCol", "result1"), ("splits", Array(-0.5, 0.0, 0.5)), + ("splitsArray", Array(Array(-0.5, 0.0, 0.5), Array(-0.5, 0.0, 0.5)))) + + // this should fail because at least one of inputCol and inputCols must be set + ParamsSuite.testExclusiveParams(new Bucketizer, df, ("outputCol", "feature1"), + ("splits", Array(-0.5, 0.0, 0.5))) + + // the following should fail because not all the params are set + ParamsSuite.testExclusiveParams(new Bucketizer, df, ("inputCol", "feature1"), + ("outputCol", "result1")) + ParamsSuite.testExclusiveParams(new Bucketizer, df, + ("inputCols", Array("feature1", "feature2")), + ("outputCols", Array("result1", "result2"))) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala index 85198ad4c913a..36e06091d24de 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala @@ -20,8 +20,10 @@ package org.apache.spark.ml.param import java.io.{ByteArrayOutputStream, ObjectOutputStream} import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.{Estimator, Transformer} import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.MyParams +import org.apache.spark.sql.Dataset class ParamsSuite extends SparkFunSuite { @@ -430,4 +432,24 @@ object ParamsSuite extends SparkFunSuite { require(copyReturnType === obj.getClass, s"${clazz.getName}.copy should return ${clazz.getName} instead of ${copyReturnType.getName}.") } + + /** + * Checks that the class throws an exception in case multiple exclusive params are set. + * The params to be checked are passed as arguments with their value. + */ + def testExclusiveParams( + model: Params, + dataset: Dataset[_], + paramsAndValues: (String, Any)*): Unit = { + val m = model.copy(ParamMap.empty) + paramsAndValues.foreach { case (paramName, paramValue) => + m.set(m.getParam(paramName), paramValue) + } + intercept[IllegalArgumentException] { + m match { + case t: Transformer => t.transform(dataset) + case e: Estimator[_] => e.fit(dataset) + } + } + } } From ab1b5d921b395cb7df3a3a2c4a7e5778d98e6f01 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 26 Jan 2018 12:28:27 +0200 Subject: [PATCH 190/566] [SPARK-22797][PYSPARK] Bucketizer support multi-column ## What changes were proposed in this pull request? Bucketizer support multi-column in the python side ## How was this patch tested? existing tests and added tests Author: Zheng RuiFeng Closes #19892 from zhengruifeng/20542_py. 
(cherry picked from commit c22eaa94e85aaac649566495dcf763a5de3c8d06) Signed-off-by: Nick Pentreath --- python/pyspark/ml/feature.py | 105 +++++++++++++++++++++------- python/pyspark/ml/param/__init__.py | 10 +++ python/pyspark/ml/tests.py | 9 +++ 3 files changed, 99 insertions(+), 25 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index da85ba761a145..fdc7787140490 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -317,26 +317,33 @@ class BucketedRandomProjectionLSHModel(LSHModel, JavaMLReadable, JavaMLWritable) @inherit_doc -class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasHandleInvalid, - JavaMLReadable, JavaMLWritable): - """ - Maps a column of continuous features to a column of feature buckets. - - >>> values = [(0.1,), (0.4,), (1.2,), (1.5,), (float("nan"),), (float("nan"),)] - >>> df = spark.createDataFrame(values, ["values"]) +class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOutputCols, + HasHandleInvalid, JavaMLReadable, JavaMLWritable): + """ + Maps a column of continuous features to a column of feature buckets. Since 2.3.0, + :py:class:`Bucketizer` can map multiple columns at once by setting the :py:attr:`inputCols` + parameter. Note that when both the :py:attr:`inputCol` and :py:attr:`inputCols` parameters + are set, an Exception will be thrown. The :py:attr:`splits` parameter is only used for single + column usage, and :py:attr:`splitsArray` is for multiple columns. + + >>> values = [(0.1, 0.0), (0.4, 1.0), (1.2, 1.3), (1.5, float("nan")), + ... (float("nan"), 1.0), (float("nan"), 0.0)] + >>> df = spark.createDataFrame(values, ["values1", "values2"]) >>> bucketizer = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")], - ... inputCol="values", outputCol="buckets") - >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df).collect() - >>> len(bucketed) - 6 - >>> bucketed[0].buckets - 0.0 - >>> bucketed[1].buckets - 0.0 - >>> bucketed[2].buckets - 1.0 - >>> bucketed[3].buckets - 2.0 + ... inputCol="values1", outputCol="buckets") + >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df.select("values1")) + >>> bucketed.show(truncate=False) + +-------+-------+ + |values1|buckets| + +-------+-------+ + |0.1 |0.0 | + |0.4 |0.0 | + |1.2 |1.0 | + |1.5 |2.0 | + |NaN |3.0 | + |NaN |3.0 | + +-------+-------+ + ... >>> bucketizer.setParams(outputCol="b").transform(df).head().b 0.0 >>> bucketizerPath = temp_path + "/bucketizer" @@ -347,6 +354,22 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasHandleInvalid, >>> bucketed = bucketizer.setHandleInvalid("skip").transform(df).collect() >>> len(bucketed) 4 + >>> bucketizer2 = Bucketizer(splitsArray= + ... [[-float("inf"), 0.5, 1.4, float("inf")], [-float("inf"), 0.5, float("inf")]], + ... inputCols=["values1", "values2"], outputCols=["buckets1", "buckets2"]) + >>> bucketed2 = bucketizer2.setHandleInvalid("keep").transform(df) + >>> bucketed2.show(truncate=False) + +-------+-------+--------+--------+ + |values1|values2|buckets1|buckets2| + +-------+-------+--------+--------+ + |0.1 |0.0 |0.0 |0.0 | + |0.4 |1.0 |0.0 |1.0 | + |1.2 |1.3 |1.0 |1.0 | + |1.5 |NaN |2.0 |2.0 | + |NaN |1.0 |3.0 |1.0 | + |NaN |0.0 |3.0 |0.0 | + +-------+-------+--------+--------+ + ... .. versionadded:: 1.4.0 """ @@ -363,14 +386,30 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasHandleInvalid, handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. 
" + "Options are 'skip' (filter out rows with invalid values), " + - "'error' (throw an error), or 'keep' (keep invalid values in a special " + - "additional bucket).", + "'error' (throw an error), or 'keep' (keep invalid values in a " + + "special additional bucket). Note that in the multiple column " + + "case, the invalid handling is applied to all columns. That said " + + "for 'error' it will throw an error if any invalids are found in " + + "any column, for 'skip' it will skip rows with any invalids in " + + "any columns, etc.", typeConverter=TypeConverters.toString) + splitsArray = Param(Params._dummy(), "splitsArray", "The array of split points for mapping " + + "continuous features into buckets for multiple columns. For each input " + + "column, with n+1 splits, there are n buckets. A bucket defined by " + + "splits x,y holds values in the range [x,y) except the last bucket, " + + "which also includes y. The splits should be of length >= 3 and " + + "strictly increasing. Values at -inf, inf must be explicitly provided " + + "to cover all Double values; otherwise, values outside the splits " + + "specified will be treated as errors.", + typeConverter=TypeConverters.toListListFloat) + @keyword_only - def __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error"): + def __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error", + splitsArray=None, inputCols=None, outputCols=None): """ - __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error") + __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error", \ + splitsArray=None, inputCols=None, outputCols=None) """ super(Bucketizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid) @@ -380,9 +419,11 @@ def __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="er @keyword_only @since("1.4.0") - def setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error"): + def setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error", + splitsArray=None, inputCols=None, outputCols=None): """ - setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error") + setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error", \ + splitsArray=None, inputCols=None, outputCols=None) Sets params for this Bucketizer. """ kwargs = self._input_kwargs @@ -402,6 +443,20 @@ def getSplits(self): """ return self.getOrDefault(self.splits) + @since("2.3.0") + def setSplitsArray(self, value): + """ + Sets the value of :py:attr:`splitsArray`. + """ + return self._set(splitsArray=value) + + @since("2.3.0") + def getSplitsArray(self): + """ + Gets the array of split points or its default value. + """ + return self.getOrDefault(self.splitsArray) + @inherit_doc class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index 043c25cf9feb4..5b6b70292f099 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -134,6 +134,16 @@ def toListFloat(value): return [float(v) for v in value] raise TypeError("Could not convert %s to list of floats" % value) + @staticmethod + def toListListFloat(value): + """ + Convert a value to list of list of floats, if possible. 
+ """ + if TypeConverters._can_convert_to_list(value): + value = TypeConverters.toList(value) + return [TypeConverters.toListFloat(v) for v in value] + raise TypeError("Could not convert %s to list of list of floats" % value) + @staticmethod def toListInt(value): """ diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 1af2b91da900d..b8bddbd06f165 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -238,6 +238,15 @@ def test_bool(self): self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept=1)) self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept="false")) + def test_list_list_float(self): + b = Bucketizer(splitsArray=[[-0.1, 0.5, 3], [-5, 1.5]]) + self.assertEqual(b.getSplitsArray(), [[-0.1, 0.5, 3.0], [-5.0, 1.5]]) + self.assertTrue(all([type(v) == list for v in b.getSplitsArray()])) + self.assertTrue(all([type(v) == float for v in b.getSplitsArray()[0]])) + self.assertTrue(all([type(v) == float for v in b.getSplitsArray()[1]])) + self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=["a", 1.0])) + self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=[[-5, 1.5], ["a", 1.0]])) + class PipelineTests(PySparkTestCase): From ca3613be20ff4dc546c43322eeabf591ab8ad97f Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 26 Jan 2018 09:17:05 -0800 Subject: [PATCH 191/566] [SPARK-23218][SQL] simplify ColumnVector.getArray ## What changes were proposed in this pull request? `ColumnVector` is very flexible about how to implement array type. As a result `ColumnVector` has 3 abstract methods for array type: `arrayData`, `getArrayOffset`, `getArrayLength`. For example, in `WritableColumnVector` we use the first child vector as the array data vector, and store offsets and lengths in 2 arrays in the parent vector. `ArrowColumnVector` has a different implementation. This PR simplifies `ColumnVector` by using only one abstract method for array type: `getArray`. ## How was this patch tested? existing tests. rerun `ColumnarBatchBenchmark`, there is no performance regression. Author: Wenchen Fan Closes #20395 from cloud-fan/vector. 
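From the caller side the consolidation looks roughly like this (an illustrative sketch, not code from the patch):

```scala
import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarArray}

// Sums the elements of an array<int> column. Before this change a caller had to stitch together
// arrayData(), getArrayOffset() and getArrayLength(); now a single getArray() call suffices.
def sumIntArrays(vector: ColumnVector, numRows: Int): Long = {
  var total = 0L
  var rowId = 0
  while (rowId < numRows) {
    if (!vector.isNullAt(rowId)) {
      val arr: ColumnarArray = vector.getArray(rowId)
      var i = 0
      while (i < arr.numElements()) {
        total += arr.getInt(i)
        i += 1
      }
    }
    rowId += 1
  }
  total
}
```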
(cherry picked from commit dd8e257d1ccf20f4383dd7f30d634010b176f0d3) Signed-off-by: gatorsmile --- .../datasources/orc/OrcColumnVector.java | 13 +-- .../vectorized/WritableColumnVector.java | 13 ++- .../sql/vectorized/ArrowColumnVector.java | 48 ++++------ .../spark/sql/vectorized/ColumnVector.java | 88 ++++++++++--------- .../spark/sql/vectorized/ColumnarArray.java | 2 + .../spark/sql/vectorized/ColumnarBatch.java | 2 + .../spark/sql/vectorized/ColumnarRow.java | 2 + .../vectorized/ColumnarBatchBenchmark.scala | 14 ++- 8 files changed, 87 insertions(+), 95 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java index aaf2a380034a9..5078bc7922ee2 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java @@ -24,6 +24,7 @@ import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.Decimal; import org.apache.spark.sql.types.TimestampType; +import org.apache.spark.sql.vectorized.ColumnarArray; import org.apache.spark.unsafe.types.UTF8String; /** @@ -145,16 +146,6 @@ public double getDouble(int rowId) { return doubleData.vector[getRowIndex(rowId)]; } - @Override - public int getArrayLength(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public int getArrayOffset(int rowId) { - throw new UnsupportedOperationException(); - } - @Override public Decimal getDecimal(int rowId, int precision, int scale) { BigDecimal data = decimalData.vector[getRowIndex(rowId)].getHiveDecimal().bigDecimalValue(); @@ -177,7 +168,7 @@ public byte[] getBinary(int rowId) { } @Override - public org.apache.spark.sql.vectorized.ColumnVector arrayData() { + public ColumnarArray getArray(int rowId) { throw new UnsupportedOperationException(); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index ca4f00985c2a3..a8ec8ef2aadf8 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -24,6 +24,7 @@ import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.types.*; import org.apache.spark.sql.vectorized.ColumnVector; +import org.apache.spark.sql.vectorized.ColumnarArray; import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.types.UTF8String; @@ -602,7 +603,17 @@ public final int appendStruct(boolean isNull) { // `WritableColumnVector` puts the data of array in the first child column vector, and puts the // array offsets and lengths in the current column vector. 
@Override - public WritableColumnVector arrayData() { return childColumns[0]; } + public final ColumnarArray getArray(int rowId) { + return new ColumnarArray(arrayData(), getArrayOffset(rowId), getArrayLength(rowId)); + } + + public WritableColumnVector arrayData() { + return childColumns[0]; + } + + public abstract int getArrayLength(int rowId); + + public abstract int getArrayOffset(int rowId); @Override public WritableColumnVector getChild(int ordinal) { return childColumns[ordinal]; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java index ca7a4751450d4..9803c3dec6de2 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java @@ -17,17 +17,21 @@ package org.apache.spark.sql.vectorized; +import io.netty.buffer.ArrowBuf; import org.apache.arrow.vector.*; import org.apache.arrow.vector.complex.*; import org.apache.arrow.vector.holders.NullableVarCharHolder; +import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.execution.arrow.ArrowUtils; import org.apache.spark.sql.types.*; import org.apache.spark.unsafe.types.UTF8String; /** - * A column vector backed by Apache Arrow. + * A column vector backed by Apache Arrow. Currently time interval type and map type are not + * supported. */ +@InterfaceStability.Evolving public final class ArrowColumnVector extends ColumnVector { private final ArrowVectorAccessor accessor; @@ -90,16 +94,6 @@ public double getDouble(int rowId) { return accessor.getDouble(rowId); } - @Override - public int getArrayLength(int rowId) { - return accessor.getArrayLength(rowId); - } - - @Override - public int getArrayOffset(int rowId) { - return accessor.getArrayOffset(rowId); - } - @Override public Decimal getDecimal(int rowId, int precision, int scale) { return accessor.getDecimal(rowId, precision, scale); @@ -116,7 +110,9 @@ public byte[] getBinary(int rowId) { } @Override - public ArrowColumnVector arrayData() { return childColumns[0]; } + public ColumnarArray getArray(int rowId) { + return accessor.getArray(rowId); + } @Override public ArrowColumnVector getChild(int ordinal) { return childColumns[ordinal]; } @@ -151,9 +147,6 @@ public ArrowColumnVector(ValueVector vector) { } else if (vector instanceof ListVector) { ListVector listVector = (ListVector) vector; accessor = new ArrayAccessor(listVector); - - childColumns = new ArrowColumnVector[1]; - childColumns[0] = new ArrowColumnVector(listVector.getDataVector()); } else if (vector instanceof NullableMapVector) { NullableMapVector mapVector = (NullableMapVector) vector; accessor = new StructAccessor(mapVector); @@ -180,10 +173,6 @@ boolean isNullAt(int rowId) { return vector.isNull(rowId); } - final int getValueCount() { - return vector.getValueCount(); - } - final int getNullCount() { return vector.getNullCount(); } @@ -232,11 +221,7 @@ byte[] getBinary(int rowId) { throw new UnsupportedOperationException(); } - int getArrayLength(int rowId) { - throw new UnsupportedOperationException(); - } - - int getArrayOffset(int rowId) { + ColumnarArray getArray(int rowId) { throw new UnsupportedOperationException(); } } @@ -433,10 +418,12 @@ final long getLong(int rowId) { private static class ArrayAccessor extends ArrowVectorAccessor { private final ListVector accessor; + private final ArrowColumnVector arrayData; ArrayAccessor(ListVector vector) { super(vector); 
this.accessor = vector; + this.arrayData = new ArrowColumnVector(vector.getDataVector()); } @Override @@ -450,13 +437,12 @@ final boolean isNullAt(int rowId) { } @Override - final int getArrayLength(int rowId) { - return accessor.getInnerValueCountAt(rowId); - } - - @Override - final int getArrayOffset(int rowId) { - return accessor.getOffsetBuffer().getInt(rowId * accessor.OFFSET_WIDTH); + final ColumnarArray getArray(int rowId) { + ArrowBuf offsets = accessor.getOffsetBuffer(); + int index = rowId * accessor.OFFSET_WIDTH; + int start = offsets.getInt(index); + int end = offsets.getInt(index + accessor.OFFSET_WIDTH); + return new ColumnarArray(arrayData, start, end - start); } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java index f9936214035b6..4b955ceddd0f2 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java @@ -16,6 +16,7 @@ */ package org.apache.spark.sql.vectorized; +import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.catalyst.util.MapData; import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.Decimal; @@ -29,11 +30,14 @@ * Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values * in this ColumnVector. * + * Spark only calls specific `get` method according to the data type of this {@link ColumnVector}, + * e.g. if it's int type, Spark is guaranteed to only call {@link #getInt(int)} or + * {@link #getInts(int, int)}. + * * ColumnVector supports all the data types including nested types. To handle nested types, - * ColumnVector can have children and is a tree structure. For struct type, it stores the actual - * data of each field in the corresponding child ColumnVector, and only stores null information in - * the parent ColumnVector. For array type, it stores the actual array elements in the child - * ColumnVector, and stores null information, array offsets and lengths in the parent ColumnVector. + * ColumnVector can have children and is a tree structure. Please refer to {@link #getStruct(int)}, + * {@link #getArray(int)} and {@link #getMap(int)} for the details about how to implement nested + * types. * * ColumnVector is expected to be reused during the entire data loading process, to avoid allocating * memory again and again. @@ -43,6 +47,7 @@ * format. Since it is expected to reuse the ColumnVector instance while loading data, the storage * footprint is negligible. */ +@InterfaceStability.Evolving public abstract class ColumnVector implements AutoCloseable { /** @@ -70,12 +75,12 @@ public abstract class ColumnVector implements AutoCloseable { public abstract boolean isNullAt(int rowId); /** - * Returns the value for rowId. + * Returns the boolean type value for rowId. */ public abstract boolean getBoolean(int rowId); /** - * Gets values from [rowId, rowId + count) + * Gets boolean type values from [rowId, rowId + count) */ public boolean[] getBooleans(int rowId, int count) { boolean[] res = new boolean[count]; @@ -86,12 +91,12 @@ public boolean[] getBooleans(int rowId, int count) { } /** - * Returns the value for rowId. + * Returns the byte type value for rowId. 
*/ public abstract byte getByte(int rowId); /** - * Gets values from [rowId, rowId + count) + * Gets byte type values from [rowId, rowId + count) */ public byte[] getBytes(int rowId, int count) { byte[] res = new byte[count]; @@ -102,12 +107,12 @@ public byte[] getBytes(int rowId, int count) { } /** - * Returns the value for rowId. + * Returns the short type value for rowId. */ public abstract short getShort(int rowId); /** - * Gets values from [rowId, rowId + count) + * Gets short type values from [rowId, rowId + count) */ public short[] getShorts(int rowId, int count) { short[] res = new short[count]; @@ -118,12 +123,12 @@ public short[] getShorts(int rowId, int count) { } /** - * Returns the value for rowId. + * Returns the int type value for rowId. */ public abstract int getInt(int rowId); /** - * Gets values from [rowId, rowId + count) + * Gets int type values from [rowId, rowId + count) */ public int[] getInts(int rowId, int count) { int[] res = new int[count]; @@ -134,12 +139,12 @@ public int[] getInts(int rowId, int count) { } /** - * Returns the value for rowId. + * Returns the long type value for rowId. */ public abstract long getLong(int rowId); /** - * Gets values from [rowId, rowId + count) + * Gets long type values from [rowId, rowId + count) */ public long[] getLongs(int rowId, int count) { long[] res = new long[count]; @@ -150,12 +155,12 @@ public long[] getLongs(int rowId, int count) { } /** - * Returns the value for rowId. + * Returns the float type value for rowId. */ public abstract float getFloat(int rowId); /** - * Gets values from [rowId, rowId + count) + * Gets float type values from [rowId, rowId + count) */ public float[] getFloats(int rowId, int count) { float[] res = new float[count]; @@ -166,12 +171,12 @@ public float[] getFloats(int rowId, int count) { } /** - * Returns the value for rowId. + * Returns the double type value for rowId. */ public abstract double getDouble(int rowId); /** - * Gets values from [rowId, rowId + count) + * Gets double type values from [rowId, rowId + count) */ public double[] getDoubles(int rowId, int count) { double[] res = new double[count]; @@ -182,57 +187,54 @@ public double[] getDoubles(int rowId, int count) { } /** - * Returns the length of the array for rowId. - */ - public abstract int getArrayLength(int rowId); - - /** - * Returns the offset of the array for rowId. - */ - public abstract int getArrayOffset(int rowId); - - /** - * Returns the struct for rowId. + * Returns the struct type value for rowId. + * + * To support struct type, implementations must implement {@link #getChild(int)} and make this + * vector a tree structure. The number of child vectors must be same as the number of fields of + * the struct type, and each child vector is responsible to store the data for its corresponding + * struct field. */ public final ColumnarRow getStruct(int rowId) { return new ColumnarRow(this, rowId); } /** - * Returns the array for rowId. + * Returns the array type value for rowId. + * + * To support array type, implementations must construct an {@link ColumnarArray} and return it in + * this method. {@link ColumnarArray} requires a {@link ColumnVector} that stores the data of all + * the elements of all the arrays in this vector, and an offset and length which points to a range + * in that {@link ColumnVector}, and the range represents the array for rowId. Implementations + * are free to decide where to put the data vector and offsets and lengths. 
For example, we can + * use the first child vector as the data vector, and store offsets and lengths in 2 int arrays in + * this vector. */ - public final ColumnarArray getArray(int rowId) { - return new ColumnarArray(arrayData(), getArrayOffset(rowId), getArrayLength(rowId)); - } + public abstract ColumnarArray getArray(int rowId); /** - * Returns the map for rowId. + * Returns the map type value for rowId. */ public MapData getMap(int ordinal) { throw new UnsupportedOperationException(); } /** - * Returns the decimal for rowId. + * Returns the decimal type value for rowId. */ public abstract Decimal getDecimal(int rowId, int precision, int scale); /** - * Returns the UTF8String for rowId. Note that the returned UTF8String may point to the data of - * this column vector, please copy it if you want to keep it after this column vector is freed. + * Returns the string type value for rowId. Note that the returned UTF8String may point to the + * data of this column vector, please copy it if you want to keep it after this column vector is + * freed. */ public abstract UTF8String getUTF8String(int rowId); /** - * Returns the byte array for rowId. + * Returns the binary type value for rowId. */ public abstract byte[] getBinary(int rowId); - /** - * Returns the data for the underlying array. - */ - public abstract ColumnVector arrayData(); - /** * Returns the ordinal's child column vector. */ diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java index 522c39580389f..0d2c3ec8648d3 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java @@ -16,6 +16,7 @@ */ package org.apache.spark.sql.vectorized; +import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.catalyst.util.ArrayData; import org.apache.spark.sql.catalyst.util.MapData; import org.apache.spark.sql.types.*; @@ -25,6 +26,7 @@ /** * Array abstraction in {@link ColumnVector}. */ +@InterfaceStability.Evolving public final class ColumnarArray extends ArrayData { // The data for this array. This array contains elements from // data[offset] to data[offset + length). diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java index 4dc826cf60c15..d206c1df42abb 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java @@ -18,6 +18,7 @@ import java.util.*; +import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.execution.vectorized.MutableColumnarRow; @@ -26,6 +27,7 @@ * batch so that Spark can access the data row by row. Instance of it is meant to be reused during * the entire data loading process. 
*/ +@InterfaceStability.Evolving public final class ColumnarBatch { private int numRows; private final ColumnVector[] columns; diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java index 2e59085a82768..25db7e09d20d0 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java @@ -16,6 +16,7 @@ */ package org.apache.spark.sql.vectorized; +import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; import org.apache.spark.sql.catalyst.util.MapData; @@ -26,6 +27,7 @@ /** * Row abstraction in {@link ColumnVector}. */ +@InterfaceStability.Evolving public final class ColumnarRow extends InternalRow { // The data for this row. // E.g. the value of 3rd int field is `data.getChild(3).getInt(rowId)`. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchBenchmark.scala index ad74fb99b0c73..1f31aa45a1220 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchBenchmark.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.execution.datasources.parquet +package org.apache.spark.sql.execution.vectorized import java.nio.ByteBuffer import java.nio.charset.StandardCharsets @@ -23,8 +23,6 @@ import scala.util.Random import org.apache.spark.memory.MemoryMode import org.apache.spark.sql.catalyst.expressions.UnsafeRow -import org.apache.spark.sql.execution.vectorized.OffHeapColumnVector -import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector import org.apache.spark.sql.types.{ArrayType, BinaryType, IntegerType} import org.apache.spark.unsafe.Platform import org.apache.spark.util.Benchmark @@ -434,7 +432,6 @@ object ColumnarBatchBenchmark { } def readArrays(onHeap: Boolean): Unit = { - System.gc() val vector = if (onHeap) onHeapVector else offHeapVector var sum = 0L @@ -448,7 +445,6 @@ object ColumnarBatchBenchmark { } def readArrayElements(onHeap: Boolean): Unit = { - System.gc() val vector = if (onHeap) onHeapVector else offHeapVector var sum = 0L @@ -479,10 +475,10 @@ object ColumnarBatchBenchmark { Array Vector Read: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - On Heap Read Size Only 416 / 423 393.5 2.5 1.0X - Off Heap Read Size Only 396 / 404 413.6 2.4 1.1X - On Heap Read Elements 2569 / 2590 63.8 15.7 0.2X - Off Heap Read Elements 3302 / 3333 49.6 20.2 0.1X + On Heap Read Size Only 426 / 437 384.9 2.6 1.0X + Off Heap Read Size Only 406 / 421 404.0 2.5 1.0X + On Heap Read Elements 2636 / 2642 62.2 16.1 0.2X + Off Heap Read Elements 3770 / 3774 43.5 23.0 0.1X */ benchmark.run } From f5911d4894700eb48f794133cbd363bf3b7c8c8e Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Fri, 26 Jan 2018 23:17:45 +0200 Subject: [PATCH 192/566] Revert "[SPARK-22797][PYSPARK] Bucketizer support multi-column" This reverts commit ab1b5d921b395cb7df3a3a2c4a7e5778d98e6f01. 
--- python/pyspark/ml/feature.py | 105 +++++++--------------------- python/pyspark/ml/param/__init__.py | 10 --- python/pyspark/ml/tests.py | 9 --- 3 files changed, 25 insertions(+), 99 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index fdc7787140490..da85ba761a145 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -317,33 +317,26 @@ class BucketedRandomProjectionLSHModel(LSHModel, JavaMLReadable, JavaMLWritable) @inherit_doc -class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOutputCols, - HasHandleInvalid, JavaMLReadable, JavaMLWritable): - """ - Maps a column of continuous features to a column of feature buckets. Since 2.3.0, - :py:class:`Bucketizer` can map multiple columns at once by setting the :py:attr:`inputCols` - parameter. Note that when both the :py:attr:`inputCol` and :py:attr:`inputCols` parameters - are set, an Exception will be thrown. The :py:attr:`splits` parameter is only used for single - column usage, and :py:attr:`splitsArray` is for multiple columns. - - >>> values = [(0.1, 0.0), (0.4, 1.0), (1.2, 1.3), (1.5, float("nan")), - ... (float("nan"), 1.0), (float("nan"), 0.0)] - >>> df = spark.createDataFrame(values, ["values1", "values2"]) +class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasHandleInvalid, + JavaMLReadable, JavaMLWritable): + """ + Maps a column of continuous features to a column of feature buckets. + + >>> values = [(0.1,), (0.4,), (1.2,), (1.5,), (float("nan"),), (float("nan"),)] + >>> df = spark.createDataFrame(values, ["values"]) >>> bucketizer = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")], - ... inputCol="values1", outputCol="buckets") - >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df.select("values1")) - >>> bucketed.show(truncate=False) - +-------+-------+ - |values1|buckets| - +-------+-------+ - |0.1 |0.0 | - |0.4 |0.0 | - |1.2 |1.0 | - |1.5 |2.0 | - |NaN |3.0 | - |NaN |3.0 | - +-------+-------+ - ... + ... inputCol="values", outputCol="buckets") + >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df).collect() + >>> len(bucketed) + 6 + >>> bucketed[0].buckets + 0.0 + >>> bucketed[1].buckets + 0.0 + >>> bucketed[2].buckets + 1.0 + >>> bucketed[3].buckets + 2.0 >>> bucketizer.setParams(outputCol="b").transform(df).head().b 0.0 >>> bucketizerPath = temp_path + "/bucketizer" @@ -354,22 +347,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOu >>> bucketed = bucketizer.setHandleInvalid("skip").transform(df).collect() >>> len(bucketed) 4 - >>> bucketizer2 = Bucketizer(splitsArray= - ... [[-float("inf"), 0.5, 1.4, float("inf")], [-float("inf"), 0.5, float("inf")]], - ... inputCols=["values1", "values2"], outputCols=["buckets1", "buckets2"]) - >>> bucketed2 = bucketizer2.setHandleInvalid("keep").transform(df) - >>> bucketed2.show(truncate=False) - +-------+-------+--------+--------+ - |values1|values2|buckets1|buckets2| - +-------+-------+--------+--------+ - |0.1 |0.0 |0.0 |0.0 | - |0.4 |1.0 |0.0 |1.0 | - |1.2 |1.3 |1.0 |1.0 | - |1.5 |NaN |2.0 |2.0 | - |NaN |1.0 |3.0 |1.0 | - |NaN |0.0 |3.0 |0.0 | - +-------+-------+--------+--------+ - ... .. versionadded:: 1.4.0 """ @@ -386,30 +363,14 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOu handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. 
" + "Options are 'skip' (filter out rows with invalid values), " + - "'error' (throw an error), or 'keep' (keep invalid values in a " + - "special additional bucket). Note that in the multiple column " + - "case, the invalid handling is applied to all columns. That said " + - "for 'error' it will throw an error if any invalids are found in " + - "any column, for 'skip' it will skip rows with any invalids in " + - "any columns, etc.", + "'error' (throw an error), or 'keep' (keep invalid values in a special " + + "additional bucket).", typeConverter=TypeConverters.toString) - splitsArray = Param(Params._dummy(), "splitsArray", "The array of split points for mapping " + - "continuous features into buckets for multiple columns. For each input " + - "column, with n+1 splits, there are n buckets. A bucket defined by " + - "splits x,y holds values in the range [x,y) except the last bucket, " + - "which also includes y. The splits should be of length >= 3 and " + - "strictly increasing. Values at -inf, inf must be explicitly provided " + - "to cover all Double values; otherwise, values outside the splits " + - "specified will be treated as errors.", - typeConverter=TypeConverters.toListListFloat) - @keyword_only - def __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error", - splitsArray=None, inputCols=None, outputCols=None): + def __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error"): """ - __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error", \ - splitsArray=None, inputCols=None, outputCols=None) + __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error") """ super(Bucketizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid) @@ -419,11 +380,9 @@ def __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="er @keyword_only @since("1.4.0") - def setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error", - splitsArray=None, inputCols=None, outputCols=None): + def setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error"): """ - setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error", \ - splitsArray=None, inputCols=None, outputCols=None) + setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error") Sets params for this Bucketizer. """ kwargs = self._input_kwargs @@ -443,20 +402,6 @@ def getSplits(self): """ return self.getOrDefault(self.splits) - @since("2.3.0") - def setSplitsArray(self, value): - """ - Sets the value of :py:attr:`splitsArray`. - """ - return self._set(splitsArray=value) - - @since("2.3.0") - def getSplitsArray(self): - """ - Gets the array of split points or its default value. - """ - return self.getOrDefault(self.splitsArray) - @inherit_doc class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index 5b6b70292f099..043c25cf9feb4 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -134,16 +134,6 @@ def toListFloat(value): return [float(v) for v in value] raise TypeError("Could not convert %s to list of floats" % value) - @staticmethod - def toListListFloat(value): - """ - Convert a value to list of list of floats, if possible. 
- """ - if TypeConverters._can_convert_to_list(value): - value = TypeConverters.toList(value) - return [TypeConverters.toListFloat(v) for v in value] - raise TypeError("Could not convert %s to list of list of floats" % value) - @staticmethod def toListInt(value): """ diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index b8bddbd06f165..1af2b91da900d 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -238,15 +238,6 @@ def test_bool(self): self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept=1)) self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept="false")) - def test_list_list_float(self): - b = Bucketizer(splitsArray=[[-0.1, 0.5, 3], [-5, 1.5]]) - self.assertEqual(b.getSplitsArray(), [[-0.1, 0.5, 3.0], [-5.0, 1.5]]) - self.assertTrue(all([type(v) == list for v in b.getSplitsArray()])) - self.assertTrue(all([type(v) == float for v in b.getSplitsArray()[0]])) - self.assertTrue(all([type(v) == float for v in b.getSplitsArray()[1]])) - self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=["a", 1.0])) - self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=[[-5, 1.5], ["a", 1.0]])) - class PipelineTests(PySparkTestCase): From 30d16e116b0ff044ca03974de0f1faf17e497903 Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Fri, 26 Jan 2018 15:01:03 -0800 Subject: [PATCH 193/566] [SPARK-23207][SQL] Shuffle+Repartition on a DataFrame could lead to incorrect answers ## What changes were proposed in this pull request? Currently shuffle repartition uses RoundRobinPartitioning, the generated result is nondeterministic since the sequence of input rows are not determined. The bug can be triggered when there is a repartition call following a shuffle (which would lead to non-deterministic row ordering), as the pattern shows below: upstream stage -> repartition stage -> result stage (-> indicate a shuffle) When one of the executors process goes down, some tasks on the repartition stage will be retried and generate inconsistent ordering, and some tasks of the result stage will be retried generating different data. The following code returns 931532, instead of 1000000: ``` import scala.sys.process._ import org.apache.spark.TaskContext val res = spark.range(0, 1000 * 1000, 1).repartition(200).map { x => x }.repartition(200).map { x => if (TaskContext.get.attemptNumber == 0 && TaskContext.get.partitionId < 2) { throw new Exception("pkill -f java".!!) } x } res.distinct().count() ``` In this PR, we propose a most straight-forward way to fix this problem by performing a local sort before partitioning, after we make the input row ordering deterministic, the function from rows to partitions is fully deterministic too. The downside of the approach is that with extra local sort inserted, the performance of repartition() will go down, so we add a new config named `spark.sql.execution.sortBeforeRepartition` to control whether this patch is applied. The patch is default enabled to be safe-by-default, but user may choose to manually turn it off to avoid performance regression. This patch also changes the output rows ordering of repartition(), that leads to a bunch of test cases failure because they are comparing the results directly. ## How was this patch tested? Add unit test in ExchangeSuite. 
With this patch(and `spark.sql.execution.sortBeforeRepartition` set to true), the following query returns 1000000: ``` import scala.sys.process._ import org.apache.spark.TaskContext spark.conf.set("spark.sql.execution.sortBeforeRepartition", "true") val res = spark.range(0, 1000 * 1000, 1).repartition(200).map { x => x }.repartition(200).map { x => if (TaskContext.get.attemptNumber == 0 && TaskContext.get.partitionId < 2) { throw new Exception("pkill -f java".!!) } x } res.distinct().count() res7: Long = 1000000 ``` Author: Xingbo Jiang Closes #20393 from jiangxb1987/shuffle-repartition. (cherry picked from commit 94c67a76ec1fda908a671a47a2a1fa63b3ab1b06) Signed-off-by: Sameer Agarwal --- .../unsafe/sort/RecordComparator.java | 4 +- .../unsafe/sort/UnsafeInMemorySorter.java | 7 +- .../unsafe/sort/UnsafeSorterSpillMerger.java | 4 +- .../main/scala/org/apache/spark/rdd/RDD.scala | 2 + .../sort/UnsafeExternalSorterSuite.java | 4 +- .../sort/UnsafeInMemorySorterSuite.java | 8 ++- .../spark/ml/feature/Word2VecSuite.scala | 3 +- .../sql/execution/RecordBinaryComparator.java | 70 +++++++++++++++++++ .../execution/UnsafeExternalRowSorter.java | 44 ++++++++++-- .../apache/spark/sql/internal/SQLConf.scala | 14 ++++ .../sql/execution/UnsafeKVExternalSorter.java | 8 ++- .../apache/spark/sql/execution/SortExec.scala | 2 +- .../exchange/ShuffleExchangeExec.scala | 52 +++++++++++++- .../spark/sql/execution/ExchangeSuite.scala | 26 ++++++- .../datasources/parquet/ParquetIOSuite.scala | 6 +- .../datasources/text/WholeTextFileSuite.scala | 2 +- .../streaming/ForeachSinkSuite.scala | 6 +- 17 files changed, 233 insertions(+), 29 deletions(-) create mode 100644 sql/catalyst/src/main/java/org/apache/spark/sql/execution/RecordBinaryComparator.java diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RecordComparator.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RecordComparator.java index 09e4258792204..02b5de8e128c9 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RecordComparator.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RecordComparator.java @@ -32,6 +32,8 @@ public abstract class RecordComparator { public abstract int compare( Object leftBaseObject, long leftBaseOffset, + int leftBaseLength, Object rightBaseObject, - long rightBaseOffset); + long rightBaseOffset, + int rightBaseLength); } diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java index 951d076420ee6..b3c27d83da172 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java @@ -62,12 +62,13 @@ public int compare(RecordPointerAndKeyPrefix r1, RecordPointerAndKeyPrefix r2) { int uaoSize = UnsafeAlignedOffset.getUaoSize(); if (prefixComparisonResult == 0) { final Object baseObject1 = memoryManager.getPage(r1.recordPointer); - // skip length final long baseOffset1 = memoryManager.getOffsetInPage(r1.recordPointer) + uaoSize; + final int baseLength1 = UnsafeAlignedOffset.getSize(baseObject1, baseOffset1 - uaoSize); final Object baseObject2 = memoryManager.getPage(r2.recordPointer); - // skip length final long baseOffset2 = memoryManager.getOffsetInPage(r2.recordPointer) + uaoSize; - return recordComparator.compare(baseObject1, baseOffset1, baseObject2, baseOffset2); + 
final int baseLength2 = UnsafeAlignedOffset.getSize(baseObject2, baseOffset2 - uaoSize); + return recordComparator.compare(baseObject1, baseOffset1, baseLength1, baseObject2, + baseOffset2, baseLength2); } else { return prefixComparisonResult; } diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java index cf4dfde86ca91..ff0dcc259a4ad 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java @@ -35,8 +35,8 @@ final class UnsafeSorterSpillMerger { prefixComparator.compare(left.getKeyPrefix(), right.getKeyPrefix()); if (prefixComparisonResult == 0) { return recordComparator.compare( - left.getBaseObject(), left.getBaseOffset(), - right.getBaseObject(), right.getBaseOffset()); + left.getBaseObject(), left.getBaseOffset(), left.getRecordLength(), + right.getBaseObject(), right.getBaseOffset(), right.getRecordLength()); } else { return prefixComparisonResult; } diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 7859781e98223..0574abdca32ac 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -414,6 +414,8 @@ abstract class RDD[T: ClassTag]( * * If you are decreasing the number of partitions in this RDD, consider using `coalesce`, * which can avoid performing a shuffle. + * + * TODO Fix the Shuffle+Repartition data loss issue described in SPARK-23207. */ def repartition(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T] = withScope { coalesce(numPartitions, shuffle = true) diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java index af4975c888d65..411cd5cb57331 100644 --- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java +++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java @@ -72,8 +72,10 @@ public class UnsafeExternalSorterSuite { public int compare( Object leftBaseObject, long leftBaseOffset, + int leftBaseLength, Object rightBaseObject, - long rightBaseOffset) { + long rightBaseOffset, + int rightBaseLength) { return 0; } }; diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java index 594f07dd780f9..c145532328514 100644 --- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java +++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java @@ -98,8 +98,10 @@ public void testSortingOnlyByIntegerPrefix() throws Exception { public int compare( Object leftBaseObject, long leftBaseOffset, + int leftBaseLength, Object rightBaseObject, - long rightBaseOffset) { + long rightBaseOffset, + int rightBaseLength) { return 0; } }; @@ -164,8 +166,10 @@ public void freeAfterOOM() { public int compare( Object leftBaseObject, long leftBaseOffset, + int leftBaseLength, Object rightBaseObject, - long rightBaseOffset) { + long rightBaseOffset, + int rightBaseLength) { return 0; } }; diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala index 6183606a7b2ac..10682ba176aca 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala @@ -222,7 +222,8 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul val oldModel = new OldWord2VecModel(word2VecMap) val instance = new Word2VecModel("myWord2VecModel", oldModel) val newInstance = testDefaultReadWrite(instance) - assert(newInstance.getVectors.collect() === instance.getVectors.collect()) + assert(newInstance.getVectors.collect().sortBy(_.getString(0)) === + instance.getVectors.collect().sortBy(_.getString(0))) } test("Word2Vec works with input that is non-nullable (NGram)") { diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/RecordBinaryComparator.java b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/RecordBinaryComparator.java new file mode 100644 index 0000000000000..bb77b5bf6de2a --- /dev/null +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/RecordBinaryComparator.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution; + +import org.apache.spark.unsafe.Platform; +import org.apache.spark.util.collection.unsafe.sort.RecordComparator; + +public final class RecordBinaryComparator extends RecordComparator { + + // TODO(jiangxb) Add test suite for this. + @Override + public int compare( + Object leftObj, long leftOff, int leftLen, Object rightObj, long rightOff, int rightLen) { + int i = 0; + int res = 0; + + // If the arrays have different length, the longer one is larger. + if (leftLen != rightLen) { + return leftLen - rightLen; + } + + // The following logic uses `leftLen` as the length for both `leftObj` and `rightObj`, since + // we have guaranteed `leftLen` == `rightLen`. 
+ + // check if stars align and we can get both offsets to be aligned + if ((leftOff % 8) == (rightOff % 8)) { + while ((leftOff + i) % 8 != 0 && i < leftLen) { + res = (Platform.getByte(leftObj, leftOff + i) & 0xff) - + (Platform.getByte(rightObj, rightOff + i) & 0xff); + if (res != 0) return res; + i += 1; + } + } + // for architectures that support unaligned accesses, chew it up 8 bytes at a time + if (Platform.unaligned() || (((leftOff + i) % 8 == 0) && ((rightOff + i) % 8 == 0))) { + while (i <= leftLen - 8) { + res = (int) ((Platform.getLong(leftObj, leftOff + i) - + Platform.getLong(rightObj, rightOff + i)) % Integer.MAX_VALUE); + if (res != 0) return res; + i += 8; + } + } + // this will finish off the unaligned comparisons, or do the entire aligned comparison + // whichever is needed. + while (i < leftLen) { + res = (Platform.getByte(leftObj, leftOff + i) & 0xff) - + (Platform.getByte(rightObj, rightOff + i) & 0xff); + if (res != 0) return res; + i += 1; + } + + // The two arrays are equal. + return 0; + } +} diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java index 6b002f0d3f8e8..78647b56d621f 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java @@ -18,7 +18,9 @@ package org.apache.spark.sql.execution; import java.io.IOException; +import java.util.function.Supplier; +import org.apache.spark.sql.catalyst.util.TypeUtils; import scala.collection.AbstractIterator; import scala.collection.Iterator; import scala.math.Ordering; @@ -56,26 +58,50 @@ public abstract static class PrefixComputer { public static class Prefix { /** Key prefix value, or the null prefix value if isNull = true. **/ - long value; + public long value; /** Whether the key is null. */ - boolean isNull; + public boolean isNull; } /** * Computes prefix for the given row. For efficiency, the returned object may be reused in * further calls to a given PrefixComputer. 
*/ - abstract Prefix computePrefix(InternalRow row); + public abstract Prefix computePrefix(InternalRow row); } - public UnsafeExternalRowSorter( + public static UnsafeExternalRowSorter createWithRecordComparator( + StructType schema, + Supplier recordComparatorSupplier, + PrefixComparator prefixComparator, + PrefixComputer prefixComputer, + long pageSizeBytes, + boolean canUseRadixSort) throws IOException { + return new UnsafeExternalRowSorter(schema, recordComparatorSupplier, prefixComparator, + prefixComputer, pageSizeBytes, canUseRadixSort); + } + + public static UnsafeExternalRowSorter create( StructType schema, Ordering ordering, PrefixComparator prefixComparator, PrefixComputer prefixComputer, long pageSizeBytes, boolean canUseRadixSort) throws IOException { + Supplier recordComparatorSupplier = + () -> new RowComparator(ordering, schema.length()); + return new UnsafeExternalRowSorter(schema, recordComparatorSupplier, prefixComparator, + prefixComputer, pageSizeBytes, canUseRadixSort); + } + + private UnsafeExternalRowSorter( + StructType schema, + Supplier recordComparatorSupplier, + PrefixComparator prefixComparator, + PrefixComputer prefixComputer, + long pageSizeBytes, + boolean canUseRadixSort) throws IOException { this.schema = schema; this.prefixComputer = prefixComputer; final SparkEnv sparkEnv = SparkEnv.get(); @@ -85,7 +111,7 @@ public UnsafeExternalRowSorter( sparkEnv.blockManager(), sparkEnv.serializerManager(), taskContext, - () -> new RowComparator(ordering, schema.length()), + recordComparatorSupplier, prefixComparator, sparkEnv.conf().getInt("spark.shuffle.sort.initialBufferSize", DEFAULT_INITIAL_SORT_BUFFER_SIZE), @@ -206,7 +232,13 @@ private static final class RowComparator extends RecordComparator { } @Override - public int compare(Object baseObj1, long baseOff1, Object baseObj2, long baseOff2) { + public int compare( + Object baseObj1, + long baseOff1, + int baseLen1, + Object baseObj2, + long baseOff2, + int baseLen2) { // Note that since ordering doesn't need the total length of the record, we just pass 0 // into the row. row1.pointTo(baseObj1, baseOff1, 0); diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index b0d18b6dced76..76b9d6f6f33bd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1145,6 +1145,18 @@ object SQLConf { .checkValues(PartitionOverwriteMode.values.map(_.toString)) .createWithDefault(PartitionOverwriteMode.STATIC.toString) + val SORT_BEFORE_REPARTITION = + buildConf("spark.sql.execution.sortBeforeRepartition") + .internal() + .doc("When perform a repartition following a shuffle, the output row ordering would be " + + "nondeterministic. If some downstream stages fail and some tasks of the repartition " + + "stage retry, these tasks may generate different data, and that can lead to correctness " + + "issues. Turn on this config to insert a local sort before actually doing repartition " + + "to generate consistent repartition results. 
The performance of repartition() may go " + + "down since we insert extra local sort before it.") + .booleanConf + .createWithDefault(true) + object Deprecated { val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks" } @@ -1300,6 +1312,8 @@ class SQLConf extends Serializable with Logging { def stringRedationPattern: Option[Regex] = SQL_STRING_REDACTION_PATTERN.readFrom(reader) + def sortBeforeRepartition: Boolean = getConf(SORT_BEFORE_REPARTITION) + /** * Returns the [[Resolver]] for the current configuration, which can be used to determine if two * identifiers are equal. diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java index eb2fe82007af3..b0b5383a081a0 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java @@ -241,7 +241,13 @@ private static final class KVComparator extends RecordComparator { } @Override - public int compare(Object baseObj1, long baseOff1, Object baseObj2, long baseOff2) { + public int compare( + Object baseObj1, + long baseOff1, + int baseLen1, + Object baseObj2, + long baseOff2, + int baseLen2) { // Note that since ordering doesn't need the total length of the record, we just pass 0 // into the row. row1.pointTo(baseObj1, baseOff1 + 4, 0); diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala index ef1bb1c2a4468..ac1c34d41c4f1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala @@ -84,7 +84,7 @@ case class SortExec( } val pageSize = SparkEnv.get.memoryManager.pageSizeBytes - val sorter = new UnsafeExternalRowSorter( + val sorter = UnsafeExternalRowSorter.create( schema, ordering, prefixComparator, prefixComputer, pageSize, canUseRadixSort) if (testSpillFrequency > 0) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala index 5a1e217082bc2..76c1fa65f924b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution.exchange import java.util.Random +import java.util.function.Supplier import org.apache.spark._ import org.apache.spark.rdd.RDD @@ -25,13 +26,15 @@ import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ -import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} +import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType import org.apache.spark.util.MutablePair +import org.apache.spark.util.collection.unsafe.sort.{PrefixComparators, RecordComparator} /** * Performs a 
shuffle that will result in the desired `newPartitioning`. @@ -247,14 +250,57 @@ object ShuffleExchangeExec { case RangePartitioning(_, _) | SinglePartition => identity case _ => sys.error(s"Exchange not implemented for $newPartitioning") } + val rddWithPartitionIds: RDD[Product2[Int, InternalRow]] = { - if (needToCopyObjectsBeforeShuffle(part, serializer)) { + // [SPARK-23207] Have to make sure the generated RoundRobinPartitioning is deterministic, + // otherwise a retry task may output different rows and thus lead to data loss. + // + // Currently we following the most straight-forward way that perform a local sort before + // partitioning. + val newRdd = if (SQLConf.get.sortBeforeRepartition && + newPartitioning.isInstanceOf[RoundRobinPartitioning]) { rdd.mapPartitionsInternal { iter => + val recordComparatorSupplier = new Supplier[RecordComparator] { + override def get: RecordComparator = new RecordBinaryComparator() + } + // The comparator for comparing row hashcode, which should always be Integer. + val prefixComparator = PrefixComparators.LONG + val canUseRadixSort = SparkEnv.get.conf.get(SQLConf.RADIX_SORT_ENABLED) + // The prefix computer generates row hashcode as the prefix, so we may decrease the + // probability that the prefixes are equal when input rows choose column values from a + // limited range. + val prefixComputer = new UnsafeExternalRowSorter.PrefixComputer { + private val result = new UnsafeExternalRowSorter.PrefixComputer.Prefix + override def computePrefix(row: InternalRow): + UnsafeExternalRowSorter.PrefixComputer.Prefix = { + // The hashcode generated from the binary form of a [[UnsafeRow]] should not be null. + result.isNull = false + result.value = row.hashCode() + result + } + } + val pageSize = SparkEnv.get.memoryManager.pageSizeBytes + + val sorter = UnsafeExternalRowSorter.createWithRecordComparator( + StructType.fromAttributes(outputAttributes), + recordComparatorSupplier, + prefixComparator, + prefixComputer, + pageSize, + canUseRadixSort) + sorter.sort(iter.asInstanceOf[Iterator[UnsafeRow]]) + } + } else { + rdd + } + + if (needToCopyObjectsBeforeShuffle(part, serializer)) { + newRdd.mapPartitionsInternal { iter => val getPartitionKey = getPartitionKeyExtractor() iter.map { row => (part.getPartition(getPartitionKey(row)), row.copy()) } } } else { - rdd.mapPartitionsInternal { iter => + newRdd.mapPartitionsInternal { iter => val getPartitionKey = getPartitionKeyExtractor() val mutablePair = new MutablePair[Int, InternalRow]() iter.map { row => mutablePair.update(part.getPartition(getPartitionKey(row)), row) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala index aac8d56ba6201..697d7e6520713 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala @@ -17,11 +17,14 @@ package org.apache.spark.sql.execution -import org.apache.spark.sql.Row +import scala.util.Random + +import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.catalyst.expressions.{Alias, Literal} import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, IdentityBroadcastMode, SinglePartition} import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec, ShuffleExchangeExec} import org.apache.spark.sql.execution.joins.HashedRelationBroadcastMode +import org.apache.spark.sql.internal.SQLConf import 
org.apache.spark.sql.test.SharedSQLContext class ExchangeSuite extends SparkPlanTest with SharedSQLContext { @@ -101,4 +104,25 @@ class ExchangeSuite extends SparkPlanTest with SharedSQLContext { assert(exchange4.sameResult(exchange5)) assert(exchange5 sameResult exchange4) } + + test("SPARK-23207: Make repartition() generate consistent output") { + def assertConsistency(ds: Dataset[java.lang.Long]): Unit = { + ds.persist() + + val exchange = ds.mapPartitions { iter => + Random.shuffle(iter) + }.repartition(111) + val exchange2 = ds.repartition(111) + + assert(exchange.rdd.collectPartitions() === exchange2.rdd.collectPartitions()) + } + + withSQLConf(SQLConf.SORT_BEFORE_REPARTITION.key -> "true") { + // repartition() should generate consistent output. + assertConsistency(spark.range(10000)) + + // case when input contains duplicated rows. + assertConsistency(spark.range(10000).map(i => Random.nextInt(1000).toLong)) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index 44a8b25c61dfb..f3ece5b15e26a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -662,7 +662,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { val v = (row.getInt(0), row.getString(1)) result += v } - assert(data == result) + assert(data.toSet == result.toSet) } finally { reader.close() } @@ -678,7 +678,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { val row = reader.getCurrentValue.asInstanceOf[InternalRow] result += row.getString(0) } - assert(data.map(_._2) == result) + assert(data.map(_._2).toSet == result.toSet) } finally { reader.close() } @@ -695,7 +695,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { val v = (row.getString(0), row.getInt(1)) result += v } - assert(data.map { x => (x._2, x._1) } == result) + assert(data.map { x => (x._2, x._1) }.toSet == result.toSet) } finally { reader.close() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/WholeTextFileSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/WholeTextFileSuite.scala index 8bd736bee69de..fff0f82f9bc2b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/WholeTextFileSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/WholeTextFileSuite.scala @@ -95,7 +95,7 @@ class WholeTextFileSuite extends QueryTest with SharedSQLContext { df1.write.option("compression", "gzip").mode("overwrite").text(path) // On reading through wholetext mode, one file will be read as a single row, i.e. not // delimited by "next line" character. 
- val expected = Row(Range(0, 1000).mkString("", "\n", "\n")) + val expected = Row(df1.collect().map(_.getString(0)).mkString("", "\n", "\n")) Seq(10, 100, 1000).foreach { bytes => withSQLConf(SQLConf.FILES_MAX_PARTITION_BYTES.key -> bytes.toString) { val df2 = spark.read.option("wholetext", "true").format("text").load(path) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/ForeachSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/ForeachSinkSuite.scala index 9137d650e906b..1248c670df45c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/ForeachSinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/ForeachSinkSuite.scala @@ -52,13 +52,13 @@ class ForeachSinkSuite extends StreamTest with SharedSQLContext with BeforeAndAf var expectedEventsForPartition0 = Seq( ForeachSinkSuite.Open(partition = 0, version = 0), - ForeachSinkSuite.Process(value = 1), + ForeachSinkSuite.Process(value = 2), ForeachSinkSuite.Process(value = 3), ForeachSinkSuite.Close(None) ) var expectedEventsForPartition1 = Seq( ForeachSinkSuite.Open(partition = 1, version = 0), - ForeachSinkSuite.Process(value = 2), + ForeachSinkSuite.Process(value = 1), ForeachSinkSuite.Process(value = 4), ForeachSinkSuite.Close(None) ) @@ -162,7 +162,7 @@ class ForeachSinkSuite extends StreamTest with SharedSQLContext with BeforeAndAf val allEvents = ForeachSinkSuite.allEvents() assert(allEvents.size === 1) assert(allEvents(0)(0) === ForeachSinkSuite.Open(partition = 0, version = 0)) - assert(allEvents(0)(1) === ForeachSinkSuite.Process(value = 1)) + assert(allEvents(0)(1) === ForeachSinkSuite.Process(value = 2)) // `close` should be called with the error val errorEvent = allEvents(0)(2).asInstanceOf[ForeachSinkSuite.Close] From 7aaf23cf8ab871a8e8877ec82183656ae5f4be7b Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Fri, 26 Jan 2018 16:09:57 -0800 Subject: [PATCH 194/566] [SPARK-23242][SS][TESTS] Don't run tests in KafkaSourceSuiteBase twice ## What changes were proposed in this pull request? KafkaSourceSuiteBase should be abstract class, otherwise KafkaSourceSuiteBase will also run. ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #20412 from zsxwing/SPARK-23242. (cherry picked from commit 073744985f439ca90afb9bd0bbc1332c53f7b4bb) Signed-off-by: Shixiong Zhu --- .../scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index 27dbb3f7a8f31..c4cb1bc4a2e18 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -546,7 +546,7 @@ class KafkaMicroBatchSourceSuite extends KafkaSourceSuiteBase { } } -class KafkaSourceSuiteBase extends KafkaSourceTest { +abstract class KafkaSourceSuiteBase extends KafkaSourceTest { import testImplicits._ From 20c0efe48df9ce622d97cf3d1274d877c0e3095c Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 26 Jan 2018 16:46:51 -0800 Subject: [PATCH 195/566] [SPARK-23214][SQL] cached data should not carry extra hint info ## What changes were proposed in this pull request? 
This is a regression introduced by https://github.com/apache/spark/pull/19864 When we lookup cache, we should not carry the hint info, as this cache entry might be added by a plan having hint info, while the input plan for this lookup may not have hint info, or have different hint info. ## How was this patch tested? a new test. Author: Wenchen Fan Closes #20394 from cloud-fan/cache. (cherry picked from commit 5b5447c68ac79715e2256e487e1212861cdab1fc) Signed-off-by: gatorsmile --- .../spark/sql/execution/CacheManager.scala | 17 +-- .../execution/columnar/InMemoryRelation.scala | 27 +++-- .../apache/spark/sql/CachedTableSuite.scala | 4 +- .../columnar/InMemoryColumnarQuerySuite.scala | 2 +- .../execution/joins/BroadcastJoinSuite.scala | 103 +++++++++++------- 5 files changed, 94 insertions(+), 59 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala index 432eb59d6fe57..d68aeb275afda 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala @@ -169,14 +169,17 @@ class CacheManager extends Logging { /** Replaces segments of the given logical plan with cached versions where possible. */ def useCachedData(plan: LogicalPlan): LogicalPlan = { val newPlan = plan transformDown { + // Do not lookup the cache by hint node. Hint node is special, we should ignore it when + // canonicalizing plans, so that plans which are same except hint can hit the same cache. + // However, we also want to keep the hint info after cache lookup. Here we skip the hint + // node, so that the returned caching plan won't replace the hint node and drop the hint info + // from the original plan. 
+ case hint: ResolvedHint => hint + case currentFragment => - lookupCachedData(currentFragment).map { cached => - val cachedPlan = cached.cachedRepresentation.withOutput(currentFragment.output) - currentFragment match { - case hint: ResolvedHint => ResolvedHint(cachedPlan, hint.hints) - case _ => cachedPlan - } - }.getOrElse(currentFragment) + lookupCachedData(currentFragment) + .map(_.cachedRepresentation.withOutput(currentFragment.output)) + .getOrElse(currentFragment) } newPlan transformAllExpressions { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala index 51928d914841e..22e16913d4da9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical -import org.apache.spark.sql.catalyst.plans.logical.Statistics +import org.apache.spark.sql.catalyst.plans.logical.{HintInfo, Statistics} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.storage.StorageLevel import org.apache.spark.util.LongAccumulator @@ -62,8 +62,8 @@ case class InMemoryRelation( @transient child: SparkPlan, tableName: Option[String])( @transient var _cachedColumnBuffers: RDD[CachedBatch] = null, - val batchStats: LongAccumulator = child.sqlContext.sparkContext.longAccumulator, - statsOfPlanToCache: Statistics = null) + val sizeInBytesStats: LongAccumulator = child.sqlContext.sparkContext.longAccumulator, + statsOfPlanToCache: Statistics) extends logical.LeafNode with MultiInstanceRelation { override protected def innerChildren: Seq[SparkPlan] = Seq(child) @@ -73,11 +73,16 @@ case class InMemoryRelation( @transient val partitionStatistics = new PartitionStatistics(output) override def computeStats(): Statistics = { - if (batchStats.value == 0L) { - // Underlying columnar RDD hasn't been materialized, use the stats from the plan to cache - statsOfPlanToCache + if (sizeInBytesStats.value == 0L) { + // Underlying columnar RDD hasn't been materialized, use the stats from the plan to cache. + // Note that we should drop the hint info here. We may cache a plan whose root node is a hint + // node. When we lookup the cache with a semantically same plan without hint info, the plan + // returned by cache lookup should not have hint info. If we lookup the cache with a + // semantically same plan with a different hint info, `CacheManager.useCachedData` will take + // care of it and retain the hint info in the lookup input plan. 
+ statsOfPlanToCache.copy(hints = HintInfo()) } else { - Statistics(sizeInBytes = batchStats.value.longValue) + Statistics(sizeInBytes = sizeInBytesStats.value.longValue) } } @@ -122,7 +127,7 @@ case class InMemoryRelation( rowCount += 1 } - batchStats.add(totalSize) + sizeInBytesStats.add(totalSize) val stats = InternalRow.fromSeq( columnBuilders.flatMap(_.columnStats.collectedStatistics)) @@ -144,7 +149,7 @@ case class InMemoryRelation( def withOutput(newOutput: Seq[Attribute]): InMemoryRelation = { InMemoryRelation( newOutput, useCompression, batchSize, storageLevel, child, tableName)( - _cachedColumnBuffers, batchStats, statsOfPlanToCache) + _cachedColumnBuffers, sizeInBytesStats, statsOfPlanToCache) } override def newInstance(): this.type = { @@ -156,12 +161,12 @@ case class InMemoryRelation( child, tableName)( _cachedColumnBuffers, - batchStats, + sizeInBytesStats, statsOfPlanToCache).asInstanceOf[this.type] } def cachedColumnBuffers: RDD[CachedBatch] = _cachedColumnBuffers override protected def otherCopyArgs: Seq[AnyRef] = - Seq(_cachedColumnBuffers, batchStats, statsOfPlanToCache) + Seq(_cachedColumnBuffers, sizeInBytesStats, statsOfPlanToCache) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 1e52445f28fc1..72fe0f42801f1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -368,12 +368,12 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext val toBeCleanedAccIds = new HashSet[Long] val accId1 = spark.table("t1").queryExecution.withCachedData.collect { - case i: InMemoryRelation => i.batchStats.id + case i: InMemoryRelation => i.sizeInBytesStats.id }.head toBeCleanedAccIds += accId1 val accId2 = spark.table("t1").queryExecution.withCachedData.collect { - case i: InMemoryRelation => i.batchStats.id + case i: InMemoryRelation => i.sizeInBytesStats.id }.head toBeCleanedAccIds += accId2 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index 2280da927cf70..dc1766fb9a785 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -336,7 +336,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { checkAnswer(cached, expectedAnswer) // Check that the right size was calculated. 
- assert(cached.batchStats.value === expectedAnswer.size * INT.defaultSize) + assert(cached.sizeInBytesStats.value === expectedAnswer.size * INT.defaultSize) } test("access primitive-type columns in CachedBatch without whole stage codegen") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala index 1704bc8376f0d..bcdee792f4c70 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala @@ -22,7 +22,8 @@ import scala.reflect.ClassTag import org.apache.spark.AccumulatorSuite import org.apache.spark.sql.{Dataset, QueryTest, Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.{BitwiseAnd, BitwiseOr, Cast, Literal, ShiftLeft} -import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan, WholeStageCodegenExec} +import org.apache.spark.sql.execution.{SparkPlan, WholeStageCodegenExec} +import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec import org.apache.spark.sql.execution.exchange.EnsureRequirements import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -70,8 +71,8 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils { private def testBroadcastJoin[T: ClassTag]( joinType: String, forceBroadcast: Boolean = false): SparkPlan = { - val df1 = spark.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value") - val df2 = spark.createDataFrame(Seq((1, "1"), (2, "2"))).toDF("key", "value") + val df1 = Seq((1, "4"), (2, "2")).toDF("key", "value") + val df2 = Seq((1, "1"), (2, "2")).toDF("key", "value") // Comparison at the end is for broadcast left semi join val joinExpression = df1("key") === df2("key") && df1("value") > df2("value") @@ -109,30 +110,58 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils { } } - test("broadcast hint is retained after using the cached data") { + test("SPARK-23192: broadcast hint should be retained after using the cached data") { withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { - val df1 = spark.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value") - val df2 = spark.createDataFrame(Seq((1, "1"), (2, "2"))).toDF("key", "value") - df2.cache() - val df3 = df1.join(broadcast(df2), Seq("key"), "inner") - val numBroadCastHashJoin = df3.queryExecution.executedPlan.collect { - case b: BroadcastHashJoinExec => b - }.size - assert(numBroadCastHashJoin === 1) + try { + val df1 = Seq((1, "4"), (2, "2")).toDF("key", "value") + val df2 = Seq((1, "1"), (2, "2")).toDF("key", "value") + df2.cache() + val df3 = df1.join(broadcast(df2), Seq("key"), "inner") + val numBroadCastHashJoin = df3.queryExecution.executedPlan.collect { + case b: BroadcastHashJoinExec => b + }.size + assert(numBroadCastHashJoin === 1) + } finally { + spark.catalog.clearCache() + } + } + } + + test("SPARK-23214: cached data should not carry extra hint info") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + try { + val df1 = Seq((1, "4"), (2, "2")).toDF("key", "value") + val df2 = Seq((1, "1"), (2, "2")).toDF("key", "value") + broadcast(df2).cache() + + val df3 = df1.join(df2, Seq("key"), "inner") + val numCachedPlan = df3.queryExecution.executedPlan.collect { + case i: InMemoryTableScanExec => i + }.size + // df2 should be cached. 
+ assert(numCachedPlan === 1) + + val numBroadCastHashJoin = df3.queryExecution.executedPlan.collect { + case b: BroadcastHashJoinExec => b + }.size + // df2 should not be broadcasted. + assert(numBroadCastHashJoin === 0) + } finally { + spark.catalog.clearCache() + } } } test("broadcast hint isn't propagated after a join") { withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { - val df1 = spark.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value") - val df2 = spark.createDataFrame(Seq((1, "1"), (2, "2"))).toDF("key", "value") + val df1 = Seq((1, "4"), (2, "2")).toDF("key", "value") + val df2 = Seq((1, "1"), (2, "2")).toDF("key", "value") val df3 = df1.join(broadcast(df2), Seq("key"), "inner").drop(df2("key")) - val df4 = spark.createDataFrame(Seq((1, "5"), (2, "5"))).toDF("key", "value") + val df4 = Seq((1, "5"), (2, "5")).toDF("key", "value") val df5 = df4.join(df3, Seq("key"), "inner") - val plan = - EnsureRequirements(spark.sessionState.conf).apply(df5.queryExecution.sparkPlan) + val plan = EnsureRequirements(spark.sessionState.conf).apply(df5.queryExecution.sparkPlan) assert(plan.collect { case p: BroadcastHashJoinExec => p }.size === 1) assert(plan.collect { case p: SortMergeJoinExec => p }.size === 1) @@ -140,30 +169,30 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils { } private def assertBroadcastJoin(df : Dataset[Row]) : Unit = { - val df1 = spark.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value") + val df1 = Seq((1, "4"), (2, "2")).toDF("key", "value") val joined = df1.join(df, Seq("key"), "inner") - val plan = - EnsureRequirements(spark.sessionState.conf).apply(joined.queryExecution.sparkPlan) + val plan = EnsureRequirements(spark.sessionState.conf).apply(joined.queryExecution.sparkPlan) assert(plan.collect { case p: BroadcastHashJoinExec => p }.size === 1) } test("broadcast hint programming API") { withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { - val df2 = spark.createDataFrame(Seq((1, "1"), (2, "2"), (3, "2"))).toDF("key", "value") + val df2 = Seq((1, "1"), (2, "2"), (3, "2")).toDF("key", "value") val broadcasted = broadcast(df2) - val df3 = spark.createDataFrame(Seq((2, "2"), (3, "3"))).toDF("key", "value") - - val cases = Seq(broadcasted.limit(2), - broadcasted.filter("value < 10"), - broadcasted.sample(true, 0.5), - broadcasted.distinct(), - broadcasted.groupBy("value").agg(min($"key").as("key")), - // except and intersect are semi/anti-joins which won't return more data then - // their left argument, so the broadcast hint should be propagated here - broadcasted.except(df3), - broadcasted.intersect(df3)) + val df3 = Seq((2, "2"), (3, "3")).toDF("key", "value") + + val cases = Seq( + broadcasted.limit(2), + broadcasted.filter("value < 10"), + broadcasted.sample(true, 0.5), + broadcasted.distinct(), + broadcasted.groupBy("value").agg(min($"key").as("key")), + // except and intersect are semi/anti-joins which won't return more data then + // their left argument, so the broadcast hint should be propagated here + broadcasted.except(df3), + broadcasted.intersect(df3)) cases.foreach(assertBroadcastJoin) } @@ -240,9 +269,8 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils { test("Shouldn't change broadcast join buildSide if user clearly specified") { withTempView("t1", "t2") { - spark.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value").createTempView("t1") - spark.createDataFrame(Seq((1, "1"), (2, "12.3"), (2, "123"))).toDF("key", "value") - .createTempView("t2") + Seq((1, "4"), (2, 
"2")).toDF("key", "value").createTempView("t1") + Seq((1, "1"), (2, "12.3"), (2, "123")).toDF("key", "value").createTempView("t2") val t1Size = spark.table("t1").queryExecution.analyzed.children.head.stats.sizeInBytes val t2Size = spark.table("t2").queryExecution.analyzed.children.head.stats.sizeInBytes @@ -292,9 +320,8 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils { test("Shouldn't bias towards build right if user didn't specify") { withTempView("t1", "t2") { - spark.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value").createTempView("t1") - spark.createDataFrame(Seq((1, "1"), (2, "12.3"), (2, "123"))).toDF("key", "value") - .createTempView("t2") + Seq((1, "4"), (2, "2")).toDF("key", "value").createTempView("t1") + Seq((1, "1"), (2, "12.3"), (2, "123")).toDF("key", "value").createTempView("t2") val t1Size = spark.table("t1").queryExecution.analyzed.children.head.stats.sizeInBytes val t2Size = spark.table("t2").queryExecution.analyzed.children.head.stats.sizeInBytes From 234c854bd203e9ba32be50b1f33cc118d0dbd9e8 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 26 Jan 2018 18:57:32 -0600 Subject: [PATCH 196/566] [MINOR][SS][DOC] Fix `Trigger` Scala/Java doc examples ## What changes were proposed in this pull request? This PR fixes Scala/Java doc examples in `Trigger.java`. ## How was this patch tested? N/A. Author: Dongjoon Hyun Closes #20401 from dongjoon-hyun/SPARK-TRIGGER. (cherry picked from commit e7bc9f0524822a08d857c3a5ba57119644ceae85) Signed-off-by: Sean Owen --- .../src/main/java/org/apache/spark/sql/streaming/Trigger.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/streaming/Trigger.java b/sql/core/src/main/java/org/apache/spark/sql/streaming/Trigger.java index 33ae9a9e87668..5371a23230c98 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/streaming/Trigger.java +++ b/sql/core/src/main/java/org/apache/spark/sql/streaming/Trigger.java @@ -50,7 +50,7 @@ public static Trigger ProcessingTime(long intervalMs) { * * {{{ * import java.util.concurrent.TimeUnit - * df.writeStream.trigger(ProcessingTime.create(10, TimeUnit.SECONDS)) + * df.writeStream().trigger(Trigger.ProcessingTime(10, TimeUnit.SECONDS)) * }}} * * @since 2.2.0 @@ -66,7 +66,7 @@ public static Trigger ProcessingTime(long interval, TimeUnit timeUnit) { * * {{{ * import scala.concurrent.duration._ - * df.writeStream.trigger(ProcessingTime(10.seconds)) + * df.writeStream.trigger(Trigger.ProcessingTime(10.seconds)) * }}} * @since 2.2.0 */ From 65600bfdb9417e5f2bd2e40312e139f592f238e9 Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Fri, 26 Jan 2018 23:06:03 -0800 Subject: [PATCH 197/566] [SPARK-23245][SS][TESTS] Don't access `lastExecution.executedPlan` in StreamTest ## What changes were proposed in this pull request? `lastExecution.executedPlan` is lazy val so accessing it in StreamTest may need to acquire the lock of `lastExecution`. It may be waiting forever when the streaming thread is holding it and running a continuous Spark job. This PR changes to check if `s.lastExecution` is null to avoid accessing `lastExecution.executedPlan`. ## How was this patch tested? Jenkins Author: Jose Torres Closes #20413 from zsxwing/SPARK-23245. 
(cherry picked from commit 6328868e524121bd00595959d6d059f74e038a6b) Signed-off-by: Shixiong Zhu --- .../test/scala/org/apache/spark/sql/streaming/StreamTest.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index efdb0e0e7cf1c..d6433562fb29b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -472,7 +472,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be currentStream.awaitInitialization(streamingTimeout.toMillis) currentStream match { case s: ContinuousExecution => eventually("IncrementalExecution was not created") { - s.lastExecution.executedPlan // will fail if lastExecution is null + assert(s.lastExecution != null) } case _ => } From 3b6fc286d105ae7de737c46e50cf941e6831ab98 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 27 Jan 2018 11:26:09 -0800 Subject: [PATCH 198/566] [SPARK-23233][PYTHON] Reset the cache in asNondeterministic to set deterministic properly ## What changes were proposed in this pull request? Reproducer: ```python from pyspark.sql.functions import udf f = udf(lambda x: x) spark.range(1).select(f("id")) # cache JVM UDF instance. f = f.asNondeterministic() spark.range(1).select(f("id"))._jdf.logicalPlan().projectList().head().deterministic() ``` It should return `False` but the current master returns `True`. Seems it's because we cache the JVM UDF instance and then we reuse it even after setting `deterministic` disabled once it's called. ## How was this patch tested? Manually tested. I am not sure if I should add the test with a lot of JVM accesses with the intetnal stuff .. Let me know if anyone feels so. I will add. Author: hyukjinkwon Closes #20409 from HyukjinKwon/SPARK-23233. (cherry picked from commit 3227d14feb1a65e95a2bf326cff6ac95615cc5ac) Signed-off-by: gatorsmile --- python/pyspark/sql/tests.py | 13 +++++++++++++ python/pyspark/sql/udf.py | 3 +++ 2 files changed, 16 insertions(+) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index fbb18c494f899..a8e337d8cd69f 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -435,6 +435,19 @@ def test_nondeterministic_udf2(self): pydoc.render_doc(random_udf1) pydoc.render_doc(udf(lambda x: x).asNondeterministic) + def test_nondeterministic_udf3(self): + # regression test for SPARK-23233 + from pyspark.sql.functions import udf + f = udf(lambda x: x) + # Here we cache the JVM UDF instance. + self.spark.range(1).select(f("id")) + # This should reset the cache to set the deterministic status correctly. + f = f.asNondeterministic() + # Check the deterministic status of udf. + df = self.spark.range(1).select(f("id")) + deterministic = df._jdf.logicalPlan().projectList().head().deterministic() + self.assertFalse(deterministic) + def test_nondeterministic_udf_in_aggregate(self): from pyspark.sql.functions import udf, sum import random diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 134badb8485f5..07952558a036e 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -181,6 +181,9 @@ def asNondeterministic(self): .. versionadded:: 2.3 """ + # Here, we explicitly clean the cache to create a JVM UDF instance + # with 'deterministic' updated. See SPARK-23233. 
+ self._judf_placeholder = None self.deterministic = False return self From 8ff0cc48b1b45ed41914822ffaaf8de8dff87b72 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sun, 28 Jan 2018 10:33:06 +0900 Subject: [PATCH 199/566] [SPARK-23248][PYTHON][EXAMPLES] Relocate module docstrings to the top in PySpark examples ## What changes were proposed in this pull request? This PR proposes to relocate the docstrings in modules of examples to the top. Seems these are mistakes. So, for example, the below codes ```python >>> help(aft_survival_regression) ``` shows the module docstrings for examples as below: **Before** ``` Help on module aft_survival_regression: NAME aft_survival_regression ... DESCRIPTION # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ... (END) ``` **After** ``` Help on module aft_survival_regression: NAME aft_survival_regression ... DESCRIPTION An example demonstrating aft survival regression. Run with: bin/spark-submit examples/src/main/python/ml/aft_survival_regression.py (END) ``` ## How was this patch tested? Manually checked. Author: hyukjinkwon Closes #20416 from HyukjinKwon/module-docstring-example. 
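
For readers unfamiliar with why the position matters: CPython only records a string literal as the module docstring when it is the very first statement in the file. A minimal sketch (the file and names are invented for illustration, not part of this patch):

```python
# docstring_demo.py -- hypothetical module, not one of the touched examples
"""This string becomes __doc__ because it is the first statement."""
import sys

# If `import sys` were moved above the string literal, the literal would be an
# ordinary no-op expression, __doc__ would be None, and pydoc/help() would fall
# back to the leading comment block -- which in these examples is the Apache
# license header, exactly as shown in the "Before" output above.
print(__doc__ is not None)  # True
```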
(cherry picked from commit b8c32dc57368e49baaacf660b7e8836eedab2df7) Signed-off-by: hyukjinkwon --- examples/src/main/python/avro_inputformat.py | 14 +++++++------- .../src/main/python/ml/aft_survival_regression.py | 11 +++++------ .../main/python/ml/bisecting_k_means_example.py | 11 +++++------ .../ml/bucketed_random_projection_lsh_example.py | 12 +++++------- .../src/main/python/ml/chi_square_test_example.py | 10 +++++----- .../src/main/python/ml/correlation_example.py | 10 +++++----- examples/src/main/python/ml/cross_validator.py | 15 +++++++-------- examples/src/main/python/ml/fpgrowth_example.py | 9 ++++----- .../main/python/ml/gaussian_mixture_example.py | 11 +++++------ .../ml/generalized_linear_regression_example.py | 11 +++++------ examples/src/main/python/ml/imputer_example.py | 9 ++++----- .../main/python/ml/isotonic_regression_example.py | 9 +++------ examples/src/main/python/ml/kmeans_example.py | 15 +++++++-------- examples/src/main/python/ml/lda_example.py | 12 +++++------- .../ml/logistic_regression_summary_example.py | 11 +++++------ .../src/main/python/ml/min_hash_lsh_example.py | 12 +++++------- .../src/main/python/ml/one_vs_rest_example.py | 13 ++++++------- .../src/main/python/ml/train_validation_split.py | 13 ++++++------- examples/src/main/python/parquet_inputformat.py | 12 ++++++------ examples/src/main/python/sql/basic.py | 11 +++++------ examples/src/main/python/sql/datasource.py | 11 +++++------ examples/src/main/python/sql/hive.py | 11 +++++------ 22 files changed, 115 insertions(+), 138 deletions(-) diff --git a/examples/src/main/python/avro_inputformat.py b/examples/src/main/python/avro_inputformat.py index 4422f9e7a9589..6286ba6541fbd 100644 --- a/examples/src/main/python/avro_inputformat.py +++ b/examples/src/main/python/avro_inputformat.py @@ -15,13 +15,6 @@ # limitations under the License. # -from __future__ import print_function - -import sys - -from functools import reduce -from pyspark.sql import SparkSession - """ Read data file users.avro in local Spark distro: @@ -50,6 +43,13 @@ {u'favorite_color': None, u'name': u'Alyssa'} {u'favorite_color': u'red', u'name': u'Ben'} """ +from __future__ import print_function + +import sys + +from functools import reduce +from pyspark.sql import SparkSession + if __name__ == "__main__": if len(sys.argv) != 2 and len(sys.argv) != 3: print(""" diff --git a/examples/src/main/python/ml/aft_survival_regression.py b/examples/src/main/python/ml/aft_survival_regression.py index 2f0ca995e55c7..0a71f76418ea6 100644 --- a/examples/src/main/python/ml/aft_survival_regression.py +++ b/examples/src/main/python/ml/aft_survival_regression.py @@ -15,6 +15,11 @@ # limitations under the License. # +""" +An example demonstrating aft survival regression. +Run with: + bin/spark-submit examples/src/main/python/ml/aft_survival_regression.py +""" from __future__ import print_function # $example on$ @@ -23,12 +28,6 @@ # $example off$ from pyspark.sql import SparkSession -""" -An example demonstrating aft survival regression. -Run with: - bin/spark-submit examples/src/main/python/ml/aft_survival_regression.py -""" - if __name__ == "__main__": spark = SparkSession \ .builder \ diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index 1263cb5d177a8..7842d2009e238 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -15,6 +15,11 @@ # limitations under the License. 
# +""" +An example demonstrating bisecting k-means clustering. +Run with: + bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py +""" from __future__ import print_function # $example on$ @@ -22,12 +27,6 @@ # $example off$ from pyspark.sql import SparkSession -""" -An example demonstrating bisecting k-means clustering. -Run with: - bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py -""" - if __name__ == "__main__": spark = SparkSession\ .builder\ diff --git a/examples/src/main/python/ml/bucketed_random_projection_lsh_example.py b/examples/src/main/python/ml/bucketed_random_projection_lsh_example.py index 1b7a458125cef..610176ea596ca 100644 --- a/examples/src/main/python/ml/bucketed_random_projection_lsh_example.py +++ b/examples/src/main/python/ml/bucketed_random_projection_lsh_example.py @@ -15,7 +15,11 @@ # limitations under the License. # - +""" +An example demonstrating BucketedRandomProjectionLSH. +Run with: + bin/spark-submit examples/src/main/python/ml/bucketed_random_projection_lsh_example.py +""" from __future__ import print_function # $example on$ @@ -25,12 +29,6 @@ # $example off$ from pyspark.sql import SparkSession -""" -An example demonstrating BucketedRandomProjectionLSH. -Run with: - bin/spark-submit examples/src/main/python/ml/bucketed_random_projection_lsh_example.py -""" - if __name__ == "__main__": spark = SparkSession \ .builder \ diff --git a/examples/src/main/python/ml/chi_square_test_example.py b/examples/src/main/python/ml/chi_square_test_example.py index 8f25318ded00a..2af7e683cdb72 100644 --- a/examples/src/main/python/ml/chi_square_test_example.py +++ b/examples/src/main/python/ml/chi_square_test_example.py @@ -15,6 +15,11 @@ # limitations under the License. # +""" +An example for Chi-square hypothesis testing. +Run with: + bin/spark-submit examples/src/main/python/ml/chi_square_test_example.py +""" from __future__ import print_function from pyspark.sql import SparkSession @@ -23,11 +28,6 @@ from pyspark.ml.stat import ChiSquareTest # $example off$ -""" -An example for Chi-square hypothesis testing. -Run with: - bin/spark-submit examples/src/main/python/ml/chi_square_test_example.py -""" if __name__ == "__main__": spark = SparkSession \ .builder \ diff --git a/examples/src/main/python/ml/correlation_example.py b/examples/src/main/python/ml/correlation_example.py index 0a9d30da5a42e..1f4e402ac1a51 100644 --- a/examples/src/main/python/ml/correlation_example.py +++ b/examples/src/main/python/ml/correlation_example.py @@ -15,6 +15,11 @@ # limitations under the License. # +""" +An example for computing correlation matrix. +Run with: + bin/spark-submit examples/src/main/python/ml/correlation_example.py +""" from __future__ import print_function # $example on$ @@ -23,11 +28,6 @@ # $example off$ from pyspark.sql import SparkSession -""" -An example for computing correlation matrix. -Run with: - bin/spark-submit examples/src/main/python/ml/correlation_example.py -""" if __name__ == "__main__": spark = SparkSession \ .builder \ diff --git a/examples/src/main/python/ml/cross_validator.py b/examples/src/main/python/ml/cross_validator.py index db7054307c2e3..6256d11504afb 100644 --- a/examples/src/main/python/ml/cross_validator.py +++ b/examples/src/main/python/ml/cross_validator.py @@ -15,6 +15,13 @@ # limitations under the License. # +""" +A simple example demonstrating model selection using CrossValidator. +This example also demonstrates how Pipelines are Estimators. 
+Run with: + + bin/spark-submit examples/src/main/python/ml/cross_validator.py +""" from __future__ import print_function # $example on$ @@ -26,14 +33,6 @@ # $example off$ from pyspark.sql import SparkSession -""" -A simple example demonstrating model selection using CrossValidator. -This example also demonstrates how Pipelines are Estimators. -Run with: - - bin/spark-submit examples/src/main/python/ml/cross_validator.py -""" - if __name__ == "__main__": spark = SparkSession\ .builder\ diff --git a/examples/src/main/python/ml/fpgrowth_example.py b/examples/src/main/python/ml/fpgrowth_example.py index c92c3c27abb21..39092e616d429 100644 --- a/examples/src/main/python/ml/fpgrowth_example.py +++ b/examples/src/main/python/ml/fpgrowth_example.py @@ -15,16 +15,15 @@ # limitations under the License. # -# $example on$ -from pyspark.ml.fpm import FPGrowth -# $example off$ -from pyspark.sql import SparkSession - """ An example demonstrating FPGrowth. Run with: bin/spark-submit examples/src/main/python/ml/fpgrowth_example.py """ +# $example on$ +from pyspark.ml.fpm import FPGrowth +# $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ diff --git a/examples/src/main/python/ml/gaussian_mixture_example.py b/examples/src/main/python/ml/gaussian_mixture_example.py index e4a0d314e9d91..4938a904189f9 100644 --- a/examples/src/main/python/ml/gaussian_mixture_example.py +++ b/examples/src/main/python/ml/gaussian_mixture_example.py @@ -15,6 +15,11 @@ # limitations under the License. # +""" +A simple example demonstrating Gaussian Mixture Model (GMM). +Run with: + bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py +""" from __future__ import print_function # $example on$ @@ -22,12 +27,6 @@ # $example off$ from pyspark.sql import SparkSession -""" -A simple example demonstrating Gaussian Mixture Model (GMM). -Run with: - bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py -""" - if __name__ == "__main__": spark = SparkSession\ .builder\ diff --git a/examples/src/main/python/ml/generalized_linear_regression_example.py b/examples/src/main/python/ml/generalized_linear_regression_example.py index 796752a60f3ab..a52f4650c1c6f 100644 --- a/examples/src/main/python/ml/generalized_linear_regression_example.py +++ b/examples/src/main/python/ml/generalized_linear_regression_example.py @@ -15,6 +15,11 @@ # limitations under the License. # +""" +An example demonstrating generalized linear regression. +Run with: + bin/spark-submit examples/src/main/python/ml/generalized_linear_regression_example.py +""" from __future__ import print_function from pyspark.sql import SparkSession @@ -22,12 +27,6 @@ from pyspark.ml.regression import GeneralizedLinearRegression # $example off$ -""" -An example demonstrating generalized linear regression. -Run with: - bin/spark-submit examples/src/main/python/ml/generalized_linear_regression_example.py -""" - if __name__ == "__main__": spark = SparkSession\ .builder\ diff --git a/examples/src/main/python/ml/imputer_example.py b/examples/src/main/python/ml/imputer_example.py index b8437f827e56d..9ba0147763618 100644 --- a/examples/src/main/python/ml/imputer_example.py +++ b/examples/src/main/python/ml/imputer_example.py @@ -15,16 +15,15 @@ # limitations under the License. # -# $example on$ -from pyspark.ml.feature import Imputer -# $example off$ -from pyspark.sql import SparkSession - """ An example demonstrating Imputer. 
Run with: bin/spark-submit examples/src/main/python/ml/imputer_example.py """ +# $example on$ +from pyspark.ml.feature import Imputer +# $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ diff --git a/examples/src/main/python/ml/isotonic_regression_example.py b/examples/src/main/python/ml/isotonic_regression_example.py index 6ae15f1b4b0dd..89cba9dfc7e8f 100644 --- a/examples/src/main/python/ml/isotonic_regression_example.py +++ b/examples/src/main/python/ml/isotonic_regression_example.py @@ -17,6 +17,9 @@ """ Isotonic Regression Example. + +Run with: + bin/spark-submit examples/src/main/python/ml/isotonic_regression_example.py """ from __future__ import print_function @@ -25,12 +28,6 @@ # $example off$ from pyspark.sql import SparkSession -""" -An example demonstrating isotonic regression. -Run with: - bin/spark-submit examples/src/main/python/ml/isotonic_regression_example.py -""" - if __name__ == "__main__": spark = SparkSession\ .builder\ diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py index 5f77843e3743a..80a878af679f4 100644 --- a/examples/src/main/python/ml/kmeans_example.py +++ b/examples/src/main/python/ml/kmeans_example.py @@ -15,6 +15,13 @@ # limitations under the License. # +""" +An example demonstrating k-means clustering. +Run with: + bin/spark-submit examples/src/main/python/ml/kmeans_example.py + +This example requires NumPy (http://www.numpy.org/). +""" from __future__ import print_function # $example on$ @@ -24,14 +31,6 @@ from pyspark.sql import SparkSession -""" -An example demonstrating k-means clustering. -Run with: - bin/spark-submit examples/src/main/python/ml/kmeans_example.py - -This example requires NumPy (http://www.numpy.org/). -""" - if __name__ == "__main__": spark = SparkSession\ .builder\ diff --git a/examples/src/main/python/ml/lda_example.py b/examples/src/main/python/ml/lda_example.py index a8b346f72cd6f..97d1a042d1479 100644 --- a/examples/src/main/python/ml/lda_example.py +++ b/examples/src/main/python/ml/lda_example.py @@ -15,7 +15,11 @@ # limitations under the License. # - +""" +An example demonstrating LDA. +Run with: + bin/spark-submit examples/src/main/python/ml/lda_example.py +""" from __future__ import print_function # $example on$ @@ -23,12 +27,6 @@ # $example off$ from pyspark.sql import SparkSession -""" -An example demonstrating LDA. -Run with: - bin/spark-submit examples/src/main/python/ml/lda_example.py -""" - if __name__ == "__main__": spark = SparkSession \ .builder \ diff --git a/examples/src/main/python/ml/logistic_regression_summary_example.py b/examples/src/main/python/ml/logistic_regression_summary_example.py index bd440a1fbe8df..2274ff707b2a3 100644 --- a/examples/src/main/python/ml/logistic_regression_summary_example.py +++ b/examples/src/main/python/ml/logistic_regression_summary_example.py @@ -15,6 +15,11 @@ # limitations under the License. # +""" +An example demonstrating Logistic Regression Summary. +Run with: + bin/spark-submit examples/src/main/python/ml/logistic_regression_summary_example.py +""" from __future__ import print_function # $example on$ @@ -22,12 +27,6 @@ # $example off$ from pyspark.sql import SparkSession -""" -An example demonstrating Logistic Regression Summary. 
-Run with: - bin/spark-submit examples/src/main/python/ml/logistic_regression_summary_example.py -""" - if __name__ == "__main__": spark = SparkSession \ .builder \ diff --git a/examples/src/main/python/ml/min_hash_lsh_example.py b/examples/src/main/python/ml/min_hash_lsh_example.py index 7b1dd611a865b..93136e6ae3cae 100644 --- a/examples/src/main/python/ml/min_hash_lsh_example.py +++ b/examples/src/main/python/ml/min_hash_lsh_example.py @@ -15,7 +15,11 @@ # limitations under the License. # - +""" +An example demonstrating MinHashLSH. +Run with: + bin/spark-submit examples/src/main/python/ml/min_hash_lsh_example.py +""" from __future__ import print_function # $example on$ @@ -25,12 +29,6 @@ # $example off$ from pyspark.sql import SparkSession -""" -An example demonstrating MinHashLSH. -Run with: - bin/spark-submit examples/src/main/python/ml/min_hash_lsh_example.py -""" - if __name__ == "__main__": spark = SparkSession \ .builder \ diff --git a/examples/src/main/python/ml/one_vs_rest_example.py b/examples/src/main/python/ml/one_vs_rest_example.py index 8e00c25d9342e..956e94ae4ab62 100644 --- a/examples/src/main/python/ml/one_vs_rest_example.py +++ b/examples/src/main/python/ml/one_vs_rest_example.py @@ -15,6 +15,12 @@ # limitations under the License. # +""" +An example of Multiclass to Binary Reduction with One Vs Rest, +using Logistic Regression as the base classifier. +Run with: + bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py +""" from __future__ import print_function # $example on$ @@ -23,13 +29,6 @@ # $example off$ from pyspark.sql import SparkSession -""" -An example of Multiclass to Binary Reduction with One Vs Rest, -using Logistic Regression as the base classifier. -Run with: - bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py -""" - if __name__ == "__main__": spark = SparkSession \ .builder \ diff --git a/examples/src/main/python/ml/train_validation_split.py b/examples/src/main/python/ml/train_validation_split.py index d104f7d30a1bf..d4f9184bf576e 100644 --- a/examples/src/main/python/ml/train_validation_split.py +++ b/examples/src/main/python/ml/train_validation_split.py @@ -15,13 +15,6 @@ # limitations under the License. # -# $example on$ -from pyspark.ml.evaluation import RegressionEvaluator -from pyspark.ml.regression import LinearRegression -from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit -# $example off$ -from pyspark.sql import SparkSession - """ This example demonstrates applying TrainValidationSplit to split data and preform model selection. @@ -29,6 +22,12 @@ bin/spark-submit examples/src/main/python/ml/train_validation_split.py """ +# $example on$ +from pyspark.ml.evaluation import RegressionEvaluator +from pyspark.ml.regression import LinearRegression +from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit +# $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ diff --git a/examples/src/main/python/parquet_inputformat.py b/examples/src/main/python/parquet_inputformat.py index 52e9662d528d8..a3f86cf8999cf 100644 --- a/examples/src/main/python/parquet_inputformat.py +++ b/examples/src/main/python/parquet_inputformat.py @@ -15,12 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - -import sys - -from pyspark.sql import SparkSession - """ Read data file users.parquet in local Spark distro: @@ -35,6 +29,12 @@ {u'favorite_color': u'red', u'name': u'Ben', u'favorite_numbers': []} <...more log output...> """ +from __future__ import print_function + +import sys + +from pyspark.sql import SparkSession + if __name__ == "__main__": if len(sys.argv) != 2: print(""" diff --git a/examples/src/main/python/sql/basic.py b/examples/src/main/python/sql/basic.py index c07fa8f2752b3..c8fb25d0533b5 100644 --- a/examples/src/main/python/sql/basic.py +++ b/examples/src/main/python/sql/basic.py @@ -15,6 +15,11 @@ # limitations under the License. # +""" +A simple example demonstrating basic Spark SQL features. +Run with: + ./bin/spark-submit examples/src/main/python/sql/basic.py +""" from __future__ import print_function # $example on:init_session$ @@ -30,12 +35,6 @@ from pyspark.sql.types import * # $example off:programmatic_schema$ -""" -A simple example demonstrating basic Spark SQL features. -Run with: - ./bin/spark-submit examples/src/main/python/sql/basic.py -""" - def basic_df_example(spark): # $example on:create_df$ diff --git a/examples/src/main/python/sql/datasource.py b/examples/src/main/python/sql/datasource.py index b375fa775de39..d8c879dfe02ed 100644 --- a/examples/src/main/python/sql/datasource.py +++ b/examples/src/main/python/sql/datasource.py @@ -15,6 +15,11 @@ # limitations under the License. # +""" +A simple example demonstrating Spark SQL data sources. +Run with: + ./bin/spark-submit examples/src/main/python/sql/datasource.py +""" from __future__ import print_function from pyspark.sql import SparkSession @@ -22,12 +27,6 @@ from pyspark.sql import Row # $example off:schema_merging$ -""" -A simple example demonstrating Spark SQL data sources. -Run with: - ./bin/spark-submit examples/src/main/python/sql/datasource.py -""" - def basic_datasource_example(spark): # $example on:generic_load_save_functions$ diff --git a/examples/src/main/python/sql/hive.py b/examples/src/main/python/sql/hive.py index 1f83a6fb48b97..33fc2dfbeefa2 100644 --- a/examples/src/main/python/sql/hive.py +++ b/examples/src/main/python/sql/hive.py @@ -15,6 +15,11 @@ # limitations under the License. # +""" +A simple example demonstrating Spark SQL Hive integration. +Run with: + ./bin/spark-submit examples/src/main/python/sql/hive.py +""" from __future__ import print_function # $example on:spark_hive$ @@ -24,12 +29,6 @@ from pyspark.sql import Row # $example off:spark_hive$ -""" -A simple example demonstrating Spark SQL Hive integration. -Run with: - ./bin/spark-submit examples/src/main/python/sql/hive.py -""" - if __name__ == "__main__": # $example on:spark_hive$ From 7ca2cd463db90fc166a10de1ebe58ccc795fbbe9 Mon Sep 17 00:00:00 2001 From: CCInCharge Date: Sun, 28 Jan 2018 14:55:43 -0600 Subject: [PATCH 200/566] [SPARK-23250][DOCS] Typo in JavaDoc/ScalaDoc for DataFrameWriter ## What changes were proposed in this pull request? Fix typo in ScalaDoc for DataFrameWriter - originally stated "This is applicable for all file-based data sources (e.g. Parquet, JSON) staring Spark 2.1.0", should be "starting with Spark 2.1.0". ## How was this patch tested? Check of correct spelling in ScalaDoc Please review http://spark.apache.org/contributing.html before opening a pull request. Author: CCInCharge Closes #20417 from CCInCharge/master. 
(cherry picked from commit 686a622c93207564635569f054e1e6c921624e96) Signed-off-by: Sean Owen --- .../scala/org/apache/spark/sql/DataFrameWriter.scala | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 5f3d4448e4e54..5c02eae05304b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -174,7 +174,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { * predicates on the partitioned columns. In order for partitioning to work well, the number * of distinct values in each column should typically be less than tens of thousands. * - * This is applicable for all file-based data sources (e.g. Parquet, JSON) staring Spark 2.1.0. + * This is applicable for all file-based data sources (e.g. Parquet, JSON) starting with Spark + * 2.1.0. * * @since 1.4.0 */ @@ -188,7 +189,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { * Buckets the output by the given columns. If specified, the output is laid out on the file * system similar to Hive's bucketing scheme. * - * This is applicable for all file-based data sources (e.g. Parquet, JSON) staring Spark 2.1.0. + * This is applicable for all file-based data sources (e.g. Parquet, JSON) starting with Spark + * 2.1.0. * * @since 2.0 */ @@ -202,7 +204,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { /** * Sorts the output in each bucket by the given columns. * - * This is applicable for all file-based data sources (e.g. Parquet, JSON) staring Spark 2.1.0. + * This is applicable for all file-based data sources (e.g. Parquet, JSON) starting with Spark + * 2.1.0. * * @since 2.0 */ From 588b9694c1967ff45774431441e84081ee6eb515 Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Mon, 29 Jan 2018 13:10:38 +0800 Subject: [PATCH 201/566] [SPARK-23196] Unify continuous and microbatch V2 sinks ## What changes were proposed in this pull request? Replace streaming V2 sinks with a unified StreamWriteSupport interface, with a shim to use it with microbatch execution. Add a new SQL config to use for disabling V2 sinks, falling back to the V1 sink implementation. ## How was this patch tested? Existing tests, which in the case of Kafka (the only existing continuous V2 sink) now use V2 for microbatch. Author: Jose Torres Closes #20369 from jose-torres/streaming-sink. 
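
To make the fallback concrete, a rough sketch of how the new conf added in `SQLConf` below could be exercised (the session setup and the provider class name are illustrative placeholders, not taken from this patch):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# spark.sql.streaming.disabledV2Writers is the internal conf introduced here:
# a comma-separated list of fully qualified data source register class names
# whose StreamWriteSupport is disabled, so writes to them fall back to the V1 Sink.
spark.conf.set(
    "spark.sql.streaming.disabledV2Writers",
    "org.example.MyStreamWriteProvider")  # placeholder register class name
```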
(cherry picked from commit 49b0207dc9327989c72700b4d04d2a714c92e159) Signed-off-by: Wenchen Fan --- .../sql/kafka010/KafkaSourceProvider.scala | 16 +-- ...usWriter.scala => KafkaStreamWriter.scala} | 30 ++--- .../kafka010/KafkaContinuousSinkSuite.scala | 8 +- .../spark/sql/kafka010/KafkaSinkSuite.scala | 14 ++- .../spark/sql/kafka010/KafkaSourceSuite.scala | 8 +- .../apache/spark/sql/internal/SQLConf.scala | 9 ++ .../v2/streaming/MicroBatchWriteSupport.java | 60 ---------- ...teSupport.java => StreamWriteSupport.java} | 12 +- ...ontinuousWriter.java => StreamWriter.java} | 34 +++++- .../sources/v2/writer/DataSourceV2Writer.java | 4 +- .../datasources/v2/WriteToDataSourceV2.scala | 11 +- .../streaming/MicroBatchExecution.scala | 19 +-- .../sql/execution/streaming/console.scala | 27 ++--- .../continuous/ContinuousExecution.scala | 19 ++- .../continuous/EpochCoordinator.scala | 9 +- .../streaming/sources/ConsoleWriter.scala | 59 ++------- .../streaming/sources/MicroBatchWriter.scala | 54 +++++++++ .../streaming/sources/memoryV2.scala | 29 ++--- .../sql/streaming/DataStreamWriter.scala | 10 +- .../sql/streaming/StreamingQueryManager.scala | 9 +- ...pache.spark.sql.sources.DataSourceRegister | 7 +- .../streaming/MemorySinkV2Suite.scala | 2 +- .../sources/StreamingDataSourceV2Suite.scala | 112 +++++++++--------- 23 files changed, 265 insertions(+), 297 deletions(-) rename external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/{KafkaContinuousWriter.scala => KafkaStreamWriter.scala} (78%) delete mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchWriteSupport.java rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/{ContinuousWriteSupport.java => StreamWriteSupport.java} (85%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/writer/{ContinuousWriter.java => StreamWriter.java} (50%) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/MicroBatchWriter.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala index 62a998fbfb30b..2deb7fa2cdf1e 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala @@ -28,11 +28,11 @@ import org.apache.kafka.common.serialization.{ByteArrayDeserializer, ByteArraySe import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SparkSession, SQLContext} -import org.apache.spark.sql.execution.streaming.{Offset, Sink, Source} +import org.apache.spark.sql.execution.streaming.{Sink, Source} import org.apache.spark.sql.sources._ -import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} -import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, ContinuousWriteSupport} -import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter +import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, StreamWriteSupport} +import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType @@ -46,7 +46,7 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister with StreamSinkProvider 
with RelationProvider with CreatableRelationProvider - with ContinuousWriteSupport + with StreamWriteSupport with ContinuousReadSupport with Logging { import KafkaSourceProvider._ @@ -223,11 +223,11 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister } } - override def createContinuousWriter( + override def createStreamWriter( queryId: String, schema: StructType, mode: OutputMode, - options: DataSourceV2Options): Optional[ContinuousWriter] = { + options: DataSourceV2Options): StreamWriter = { import scala.collection.JavaConverters._ val spark = SparkSession.getActiveSession.get @@ -238,7 +238,7 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister KafkaWriter.validateQuery( schema.toAttributes, new java.util.HashMap[String, Object](producerParams.asJava), topic) - Optional.of(new KafkaContinuousWriter(topic, producerParams, schema)) + new KafkaStreamWriter(topic, producerParams, schema) } private def strategy(caseInsensitiveParams: Map[String, String]) = diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamWriter.scala similarity index 78% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala rename to external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamWriter.scala index 9843f469c5b25..a24efdefa4464 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamWriter.scala @@ -17,19 +17,14 @@ package org.apache.spark.sql.kafka010 -import org.apache.kafka.clients.producer.{Callback, ProducerRecord, RecordMetadata} import scala.collection.JavaConverters._ -import org.apache.spark.internal.Logging -import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal, UnsafeProjection} -import org.apache.spark.sql.kafka010.KafkaSourceProvider.{kafkaParamsForProducer, TOPIC_OPTION_KEY} +import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.kafka010.KafkaWriter.validateQuery -import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter +import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter import org.apache.spark.sql.sources.v2.writer._ -import org.apache.spark.sql.streaming.OutputMode -import org.apache.spark.sql.types.{BinaryType, StringType, StructType} +import org.apache.spark.sql.types.StructType /** * Dummy commit message. The DataSourceV2 framework requires a commit message implementation but we @@ -38,23 +33,24 @@ import org.apache.spark.sql.types.{BinaryType, StringType, StructType} case object KafkaWriterCommitMessage extends WriterCommitMessage /** - * A [[ContinuousWriter]] for Kafka writing. Responsible for generating the writer factory. + * A [[StreamWriter]] for Kafka writing. Responsible for generating the writer factory. + * * @param topic The topic this writer is responsible for. If None, topic will be inferred from * a `topic` field in the incoming data. * @param producerParams Parameters for Kafka producers in each task. * @param schema The schema of the input data. 
*/ -class KafkaContinuousWriter( +class KafkaStreamWriter( topic: Option[String], producerParams: Map[String, String], schema: StructType) - extends ContinuousWriter with SupportsWriteInternalRow { + extends StreamWriter with SupportsWriteInternalRow { validateQuery(schema.toAttributes, producerParams.toMap[String, Object].asJava, topic) - override def createInternalRowWriterFactory(): KafkaContinuousWriterFactory = - KafkaContinuousWriterFactory(topic, producerParams, schema) + override def createInternalRowWriterFactory(): KafkaStreamWriterFactory = + KafkaStreamWriterFactory(topic, producerParams, schema) override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} - override def abort(messages: Array[WriterCommitMessage]): Unit = {} + override def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} } /** @@ -65,12 +61,12 @@ class KafkaContinuousWriter( * @param producerParams Parameters for Kafka producers in each task. * @param schema The schema of the input data. */ -case class KafkaContinuousWriterFactory( +case class KafkaStreamWriterFactory( topic: Option[String], producerParams: Map[String, String], schema: StructType) extends DataWriterFactory[InternalRow] { override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[InternalRow] = { - new KafkaContinuousDataWriter(topic, producerParams, schema.toAttributes) + new KafkaStreamDataWriter(topic, producerParams, schema.toAttributes) } } @@ -83,7 +79,7 @@ case class KafkaContinuousWriterFactory( * @param producerParams Parameters to use for the Kafka producer. * @param inputSchema The attributes in the input data. */ -class KafkaContinuousDataWriter( +class KafkaStreamDataWriter( targetTopic: Option[String], producerParams: Map[String, String], inputSchema: Seq[Attribute]) extends KafkaRowWriter(inputSchema, targetTopic) with DataWriter[InternalRow] { import scala.collection.JavaConverters._ diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala index 8487a69851237..fc890a0cfdac3 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala @@ -18,16 +18,14 @@ package org.apache.spark.sql.kafka010 import java.util.Locale -import java.util.concurrent.atomic.AtomicInteger import org.apache.kafka.clients.producer.ProducerConfig import org.apache.kafka.common.serialization.ByteArraySerializer import org.scalatest.time.SpanSugar._ import scala.collection.JavaConverters._ -import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SaveMode} +import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.catalyst.expressions.{AttributeReference, SpecificInternalRow, UnsafeProjection} -import org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.streaming._ import org.apache.spark.sql.types.{BinaryType, DataType} import org.apache.spark.util.Utils @@ -362,7 +360,7 @@ class KafkaContinuousSinkSuite extends KafkaContinuousTest { } finally { writer.stop() } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains("job aborted")) + assert(ex.getCause.getCause.getMessage.toLowerCase(Locale.ROOT).contains("job aborted")) } test("streaming - exception on config serializer") { @@ -424,7 +422,7 @@ class KafkaContinuousSinkSuite extends 
KafkaContinuousTest { options.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[ByteArraySerializer].getName) val inputSchema = Seq(AttributeReference("value", BinaryType)()) val data = new Array[Byte](15000) // large value - val writeTask = new KafkaContinuousDataWriter(Some(topic), options.asScala.toMap, inputSchema) + val writeTask = new KafkaStreamDataWriter(Some(topic), options.asScala.toMap, inputSchema) try { val fieldTypes: Array[DataType] = Array(BinaryType) val converter = UnsafeProjection.create(fieldTypes) diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala index 2ab336c7ac476..42f8b4c7657e2 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala @@ -336,27 +336,31 @@ class KafkaSinkSuite extends StreamTest with SharedSQLContext { } finally { writer.stop() } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains("job aborted")) + assert(ex.getCause.getCause.getMessage.toLowerCase(Locale.ROOT).contains("job aborted")) } test("streaming - exception on config serializer") { val input = MemoryStream[String] var writer: StreamingQuery = null var ex: Exception = null - ex = intercept[IllegalArgumentException] { + ex = intercept[StreamingQueryException] { writer = createKafkaWriter( input.toDF(), withOptions = Map("kafka.key.serializer" -> "foo"))() + input.addData("1") + writer.processAllAvailable() } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + assert(ex.getCause.getMessage.toLowerCase(Locale.ROOT).contains( "kafka option 'key.serializer' is not supported")) - ex = intercept[IllegalArgumentException] { + ex = intercept[StreamingQueryException] { writer = createKafkaWriter( input.toDF(), withOptions = Map("kafka.value.serializer" -> "foo"))() + input.addData("1") + writer.processAllAvailable() } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + assert(ex.getCause.getMessage.toLowerCase(Locale.ROOT).contains( "kafka option 'value.serializer' is not supported")) } diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index c4cb1bc4a2e18..02c87643568bd 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -29,19 +29,17 @@ import scala.util.Random import org.apache.kafka.clients.producer.RecordMetadata import org.apache.kafka.common.TopicPartition -import org.scalatest.concurrent.Eventually._ import org.scalatest.concurrent.PatienceConfiguration.Timeout import org.scalatest.time.SpanSugar._ import org.apache.spark.SparkContext -import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row} -import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, WriteToDataSourceV2Exec} +import org.apache.spark.sql.{Dataset, ForeachWriter} +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution -import org.apache.spark.sql.execution.streaming.sources.ContinuousMemoryWriter import org.apache.spark.sql.functions.{count, window} 
import org.apache.spark.sql.kafka010.KafkaSourceProvider._ -import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest, Trigger} +import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest} import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession} import org.apache.spark.util.Utils diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 76b9d6f6f33bd..2c70b004bcff9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1127,6 +1127,13 @@ object SQLConf { .timeConf(TimeUnit.MILLISECONDS) .createWithDefault(100) + val DISABLED_V2_STREAMING_WRITERS = buildConf("spark.sql.streaming.disabledV2Writers") + .internal() + .doc("A comma-separated list of fully qualified data source register class names for which" + + " StreamWriteSupport is disabled. Writes to these sources will fail back to the V1 Sink.") + .stringConf + .createWithDefault("") + object PartitionOverwriteMode extends Enumeration { val STATIC, DYNAMIC = Value } @@ -1494,6 +1501,8 @@ class SQLConf extends Serializable with Logging { def continuousStreamingExecutorPollIntervalMs: Long = getConf(CONTINUOUS_STREAMING_EXECUTOR_POLL_INTERVAL_MS) + def disabledV2StreamingWriters: String = getConf(DISABLED_V2_STREAMING_WRITERS) + def concatBinaryAsString: Boolean = getConf(CONCAT_BINARY_AS_STRING) def eltOutputAsString: Boolean = getConf(ELT_OUTPUT_AS_STRING) diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchWriteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchWriteSupport.java deleted file mode 100644 index 53ffa95ae0f4c..0000000000000 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchWriteSupport.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.sources.v2.streaming; - -import java.util.Optional; - -import org.apache.spark.annotation.InterfaceStability; -import org.apache.spark.sql.execution.streaming.BaseStreamingSink; -import org.apache.spark.sql.sources.v2.DataSourceV2; -import org.apache.spark.sql.sources.v2.DataSourceV2Options; -import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer; -import org.apache.spark.sql.streaming.OutputMode; -import org.apache.spark.sql.types.StructType; - -/** - * A mix-in interface for {@link DataSourceV2}. Data sources can implement this interface to - * provide data writing ability and save the data from a microbatch to the data source. 
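The spark.sql.streaming.disabledV2Writers option added above is an ordinary SQL conf. A sketch of how a deployment could opt specific sources out of the V2 write path; the class names here are hypothetical, and real values are the fully qualified names of the sources' register classes.

    // Comma-separated list of register class names whose StreamWriteSupport is ignored,
    // forcing those sinks back onto the V1 Sink code path.
    spark.conf.set(
      "spark.sql.streaming.disabledV2Writers",
      "com.example.sources.MyV2Source,com.example.sources.OtherV2Source")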
- */ -@InterfaceStability.Evolving -public interface MicroBatchWriteSupport extends BaseStreamingSink { - - /** - * Creates an optional {@link DataSourceV2Writer} to save the data to this data source. Data - * sources can return None if there is no writing needed to be done. - * - * @param queryId A unique string for the writing query. It's possible that there are many writing - * queries running at the same time, and the returned {@link DataSourceV2Writer} - * can use this id to distinguish itself from others. - * @param epochId The unique numeric ID of the batch within this writing query. This is an - * incrementing counter representing a consistent set of data; the same batch may - * be started multiple times in failure recovery scenarios, but it will always - * contain the same records. - * @param schema the schema of the data to be written. - * @param mode the output mode which determines what successive batch output means to this - * sink, please refer to {@link OutputMode} for more details. - * @param options the options for the returned data source writer, which is an immutable - * case-insensitive string-to-string map. - */ - Optional createMicroBatchWriter( - String queryId, - long epochId, - StructType schema, - OutputMode mode, - DataSourceV2Options options); -} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousWriteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/StreamWriteSupport.java similarity index 85% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousWriteSupport.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/StreamWriteSupport.java index dee493cadb71e..6cd219c67109a 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousWriteSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/StreamWriteSupport.java @@ -17,26 +17,24 @@ package org.apache.spark.sql.sources.v2.streaming; -import java.util.Optional; - import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.execution.streaming.BaseStreamingSink; import org.apache.spark.sql.sources.v2.DataSourceV2; import org.apache.spark.sql.sources.v2.DataSourceV2Options; -import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter; +import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter; import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer; import org.apache.spark.sql.streaming.OutputMode; import org.apache.spark.sql.types.StructType; /** * A mix-in interface for {@link DataSourceV2}. Data sources can implement this interface to - * provide data writing ability for continuous stream processing. + * provide data writing ability for structured streaming. */ @InterfaceStability.Evolving -public interface ContinuousWriteSupport extends BaseStreamingSink { +public interface StreamWriteSupport extends BaseStreamingSink { /** - * Creates an optional {@link ContinuousWriter} to save the data to this data source. Data + * Creates an optional {@link StreamWriter} to save the data to this data source. Data * sources can return None if there is no writing needed to be done. * * @param queryId A unique string for the writing query. It's possible that there are many @@ -48,7 +46,7 @@ public interface ContinuousWriteSupport extends BaseStreamingSink { * @param options the options for the returned data source writer, which is an immutable * case-insensitive string-to-string map. 
*/ - Optional createContinuousWriter( + StreamWriter createStreamWriter( String queryId, StructType schema, OutputMode mode, diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/writer/ContinuousWriter.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/writer/StreamWriter.java similarity index 50% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/writer/ContinuousWriter.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/writer/StreamWriter.java index 723395bd1e963..3156c88933e5e 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/writer/ContinuousWriter.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/writer/StreamWriter.java @@ -23,10 +23,14 @@ import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage; /** - * A {@link DataSourceV2Writer} for use with continuous stream processing. + * A {@link DataSourceV2Writer} for use with structured streaming. This writer handles commits and + * aborts relative to an epoch ID determined by the execution engine. + * + * {@link DataWriter} implementations generated by a StreamWriter may be reused for multiple epochs, + * and so must reset any internal state after a successful commit. */ @InterfaceStability.Evolving -public interface ContinuousWriter extends DataSourceV2Writer { +public interface StreamWriter extends DataSourceV2Writer { /** * Commits this writing job for the specified epoch with a list of commit messages. The commit * messages are collected from successful data writers and are produced by @@ -34,11 +38,35 @@ public interface ContinuousWriter extends DataSourceV2Writer { * * If this method fails (by throwing an exception), this writing job is considered to have been * failed, and the execution engine will attempt to call {@link #abort(WriterCommitMessage[])}. + * + * To support exactly-once processing, writer implementations should ensure that this method is + * idempotent. The execution engine may call commit() multiple times for the same epoch + * in some circumstances. */ void commit(long epochId, WriterCommitMessage[] messages); + /** + * Aborts this writing job because some data writers are failed and keep failing when retry, or + * the Spark job fails with some unknown reasons, or {@link #commit(WriterCommitMessage[])} fails. + * + * If this method fails (by throwing an exception), the underlying data source may require manual + * cleanup. + * + * Unless the abort is triggered by the failure of commit, the given messages should have some + * null slots as there maybe only a few data writers that are committed before the abort + * happens, or some data writers were committed but their commit messages haven't reached the + * driver when the abort is triggered. So this is just a "best effort" for data sources to + * clean up the data left by data writers. 
+ */ + void abort(long epochId, WriterCommitMessage[] messages); + default void commit(WriterCommitMessage[] messages) { throw new UnsupportedOperationException( - "Commit without epoch should not be called with ContinuousWriter"); + "Commit without epoch should not be called with StreamWriter"); + } + + default void abort(WriterCommitMessage[] messages) { + throw new UnsupportedOperationException( + "Abort without epoch should not be called with StreamWriter"); } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java index f1ef411423162..8048f507a1dca 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java @@ -28,9 +28,7 @@ /** * A data source writer that is returned by * {@link WriteSupport#createWriter(String, StructType, SaveMode, DataSourceV2Options)}/ - * {@link org.apache.spark.sql.sources.v2.streaming.MicroBatchWriteSupport#createMicroBatchWriter( - * String, long, StructType, OutputMode, DataSourceV2Options)}/ - * {@link org.apache.spark.sql.sources.v2.streaming.ContinuousWriteSupport#createContinuousWriter( + * {@link org.apache.spark.sql.sources.v2.streaming.StreamWriteSupport#createStreamWriter( * String, StructType, OutputMode, DataSourceV2Options)}. * It can mix in various writing optimization interfaces to speed up the data saving. The actual * writing logic is delegated to {@link DataWriter}. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala index 3dbdae7b4df9f..cd6b3e99b6bcb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala @@ -26,9 +26,8 @@ import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.streaming.StreamExecution import org.apache.spark.sql.execution.streaming.continuous.{CommitPartitionEpoch, ContinuousExecution, EpochCoordinatorRef, SetWriterPartitions} -import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter +import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter import org.apache.spark.sql.sources.v2.writer._ import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -62,7 +61,9 @@ case class WriteToDataSourceV2Exec(writer: DataSourceV2Writer, query: SparkPlan) try { val runTask = writer match { - case w: ContinuousWriter => + // This case means that we're doing continuous processing. In microbatch streaming, the + // StreamWriter is wrapped in a MicroBatchWriter, which is executed as a normal batch. 
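To make the renamed contract concrete, here is a minimal sketch built against the interfaces above; the names are illustrative and not part of this patch. The StreamWriteSupport provider hands back a StreamWriter whose commit is keyed by epoch and written to be idempotent, as the javadoc above requires.

    import org.apache.spark.sql.Row
    import org.apache.spark.sql.sources.DataSourceRegister
    import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options}
    import org.apache.spark.sql.sources.v2.streaming.StreamWriteSupport
    import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter
    import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory, WriterCommitMessage}
    import org.apache.spark.sql.streaming.OutputMode
    import org.apache.spark.sql.types.StructType

    case object NoopCommitMessage extends WriterCommitMessage

    case class NoopWriterFactory() extends DataWriterFactory[Row] {
      override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[Row] =
        new DataWriter[Row] {
          override def write(record: Row): Unit = ()                    // discard the data
          override def commit(): WriterCommitMessage = NoopCommitMessage
          override def abort(): Unit = ()
        }
    }

    class NoopStreamWriter extends StreamWriter {
      // Remember the last applied epoch so a re-delivered commit for the same epoch is a no-op.
      @volatile private var lastCommittedEpoch = Long.MinValue

      override def createWriterFactory(): DataWriterFactory[Row] = NoopWriterFactory()

      override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {
        if (epochId > lastCommittedEpoch) lastCommittedEpoch = epochId
      }

      override def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = ()
    }

    class NoopSinkProvider extends DataSourceV2 with StreamWriteSupport with DataSourceRegister {
      override def shortName(): String = "noop-example"

      override def createStreamWriter(
          queryId: String,
          schema: StructType,
          mode: OutputMode,
          options: DataSourceV2Options): StreamWriter = new NoopStreamWriter
    }

Because commit and abort are epoch-aware, the same writer class serves both continuous queries, where the epoch is the continuous processing epoch, and microbatch queries, where the epoch is the batch ID pinned on by the wrapper shown further below.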
+ case w: StreamWriter => EpochCoordinatorRef.get( sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), sparkContext.env) @@ -82,13 +83,13 @@ case class WriteToDataSourceV2Exec(writer: DataSourceV2Writer, query: SparkPlan) (index, message: WriterCommitMessage) => messages(index) = message ) - if (!writer.isInstanceOf[ContinuousWriter]) { + if (!writer.isInstanceOf[StreamWriter]) { logInfo(s"Data source writer $writer is committing.") writer.commit(messages) logInfo(s"Data source writer $writer committed.") } } catch { - case _: InterruptedException if writer.isInstanceOf[ContinuousWriter] => + case _: InterruptedException if writer.isInstanceOf[StreamWriter] => // Interruption is how continuous queries are ended, so accept and ignore the exception. case cause: Throwable => logError(s"Data source writer $writer is aborting.") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index 7c3804547b736..975975243a3d1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -28,9 +28,11 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Curre import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.datasources.v2.{StreamingDataSourceV2Relation, WriteToDataSourceV2} +import org.apache.spark.sql.execution.streaming.sources.{InternalRowMicroBatchWriter, MicroBatchWriter} import org.apache.spark.sql.sources.v2.DataSourceV2Options -import org.apache.spark.sql.sources.v2.streaming.{MicroBatchReadSupport, MicroBatchWriteSupport} +import org.apache.spark.sql.sources.v2.streaming.{MicroBatchReadSupport, StreamWriteSupport} import org.apache.spark.sql.sources.v2.streaming.reader.{MicroBatchReader, Offset => OffsetV2} +import org.apache.spark.sql.sources.v2.writer.SupportsWriteInternalRow import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime, Trigger} import org.apache.spark.util.{Clock, Utils} @@ -440,15 +442,18 @@ class MicroBatchExecution( val triggerLogicalPlan = sink match { case _: Sink => newAttributePlan - case s: MicroBatchWriteSupport => - val writer = s.createMicroBatchWriter( + case s: StreamWriteSupport => + val writer = s.createStreamWriter( s"$runId", - currentBatchId, newAttributePlan.schema, outputMode, new DataSourceV2Options(extraOptions.asJava)) - assert(writer.isPresent, "microbatch writer must always be present") - WriteToDataSourceV2(writer.get, newAttributePlan) + if (writer.isInstanceOf[SupportsWriteInternalRow]) { + WriteToDataSourceV2( + new InternalRowMicroBatchWriter(currentBatchId, writer), newAttributePlan) + } else { + WriteToDataSourceV2(new MicroBatchWriter(currentBatchId, writer), newAttributePlan) + } case _ => throw new IllegalArgumentException(s"unknown sink type for $sink") } @@ -471,7 +476,7 @@ class MicroBatchExecution( SQLExecution.withNewExecutionId(sparkSessionToRunBatch, lastExecution) { sink match { case s: Sink => s.addBatch(currentBatchId, nextBatch) - case s: MicroBatchWriteSupport => + case _: StreamWriteSupport => // This doesn't accumulate any data - it just forces execution of the microbatch writer. 
nextBatch.collect() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala index f2aa3259731d1..d5ac0bd1df52b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala @@ -17,15 +17,12 @@ package org.apache.spark.sql.execution.streaming -import java.util.Optional - import org.apache.spark.sql._ -import org.apache.spark.sql.execution.streaming.sources.{ConsoleContinuousWriter, ConsoleMicroBatchWriter, ConsoleWriter} +import org.apache.spark.sql.execution.streaming.sources.ConsoleWriter import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister} import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} -import org.apache.spark.sql.sources.v2.streaming.{ContinuousWriteSupport, MicroBatchWriteSupport} -import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter -import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer +import org.apache.spark.sql.sources.v2.streaming.StreamWriteSupport +import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType @@ -35,26 +32,16 @@ case class ConsoleRelation(override val sqlContext: SQLContext, data: DataFrame) } class ConsoleSinkProvider extends DataSourceV2 - with MicroBatchWriteSupport - with ContinuousWriteSupport + with StreamWriteSupport with DataSourceRegister with CreatableRelationProvider { - override def createMicroBatchWriter( - queryId: String, - batchId: Long, - schema: StructType, - mode: OutputMode, - options: DataSourceV2Options): Optional[DataSourceV2Writer] = { - Optional.of(new ConsoleMicroBatchWriter(batchId, schema, options)) - } - - override def createContinuousWriter( + override def createStreamWriter( queryId: String, schema: StructType, mode: OutputMode, - options: DataSourceV2Options): Optional[ContinuousWriter] = { - Optional.of(new ConsoleContinuousWriter(schema, options)) + options: DataSourceV2Options): StreamWriter = { + new ConsoleWriter(schema, options) } def createRelation( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index 462e7d9721d28..60f880f9c73b8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -24,17 +24,16 @@ import java.util.function.UnaryOperator import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, Map => MutableMap} -import org.apache.spark.{SparkEnv, SparkException} -import org.apache.spark.sql.{AnalysisException, SparkSession} -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp} +import org.apache.spark.SparkEnv +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, CurrentDate, CurrentTimestamp} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, 
StreamingDataSourceV2Relation, WriteToDataSourceV2} import org.apache.spark.sql.execution.streaming.{ContinuousExecutionRelation, StreamingRelationV2, _} import org.apache.spark.sql.sources.v2.DataSourceV2Options -import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, ContinuousWriteSupport} -import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, Offset, PartitionOffset} -import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, StreamWriteSupport} +import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, PartitionOffset} import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime, Trigger} import org.apache.spark.sql.types.StructType import org.apache.spark.util.{Clock, Utils} @@ -44,7 +43,7 @@ class ContinuousExecution( name: String, checkpointRoot: String, analyzedPlan: LogicalPlan, - sink: ContinuousWriteSupport, + sink: StreamWriteSupport, trigger: Trigger, triggerClock: Clock, outputMode: OutputMode, @@ -195,12 +194,12 @@ class ContinuousExecution( "CurrentTimestamp and CurrentDate not yet supported for continuous processing") } - val writer = sink.createContinuousWriter( + val writer = sink.createStreamWriter( s"$runId", triggerLogicalPlan.schema, outputMode, new DataSourceV2Options(extraOptions.asJava)) - val withSink = WriteToDataSourceV2(writer.get(), triggerLogicalPlan) + val withSink = WriteToDataSourceV2(writer, triggerLogicalPlan) val reader = withSink.collect { case DataSourceV2Relation(_, r: ContinuousReader) => r @@ -230,7 +229,7 @@ class ContinuousExecution( // Use the parent Spark session for the endpoint since it's where this query ID is registered. val epochEndpoint = EpochCoordinatorRef.create( - writer.get(), reader, this, epochCoordinatorId, currentBatchId, sparkSession, SparkEnv.get) + writer, reader, this, epochCoordinatorId, currentBatchId, sparkSession, SparkEnv.get) val epochUpdateThread = new Thread(new Runnable { override def run: Unit = { try { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala index 90b3584aa0436..84d262116cb46 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala @@ -17,17 +17,14 @@ package org.apache.spark.sql.execution.streaming.continuous -import java.util.concurrent.atomic.AtomicLong - import scala.collection.mutable import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.execution.streaming.StreamingQueryWrapper import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, PartitionOffset} -import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter +import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage import org.apache.spark.util.RpcUtils @@ -85,7 +82,7 @@ private[sql] object EpochCoordinatorRef extends Logging { * Create a reference to a new [[EpochCoordinator]]. 
*/ def create( - writer: ContinuousWriter, + writer: StreamWriter, reader: ContinuousReader, query: ContinuousExecution, epochCoordinatorId: String, @@ -118,7 +115,7 @@ private[sql] object EpochCoordinatorRef extends Logging { * have both committed and reported an end offset for a given epoch. */ private[continuous] class EpochCoordinator( - writer: ContinuousWriter, + writer: StreamWriter, reader: ContinuousReader, query: ContinuousExecution, startEpoch: Long, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala index 6fb61dff60045..7c1700f1de48c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala @@ -20,14 +20,13 @@ package org.apache.spark.sql.execution.streaming.sources import org.apache.spark.internal.Logging import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.sources.v2.DataSourceV2Options -import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter -import org.apache.spark.sql.sources.v2.writer.{DataSourceV2Writer, DataWriterFactory, WriterCommitMessage} +import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter +import org.apache.spark.sql.sources.v2.writer.{DataWriterFactory, WriterCommitMessage} import org.apache.spark.sql.types.StructType /** Common methods used to create writes for the the console sink */ -trait ConsoleWriter extends Logging { - - def options: DataSourceV2Options +class ConsoleWriter(schema: StructType, options: DataSourceV2Options) + extends StreamWriter with Logging { // Number of rows to display, by default 20 rows protected val numRowsToShow = options.getInt("numRows", 20) @@ -40,14 +39,20 @@ trait ConsoleWriter extends Logging { def createWriterFactory(): DataWriterFactory[Row] = PackedRowWriterFactory - def abort(messages: Array[WriterCommitMessage]): Unit = {} + override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = { + // We have to print a "Batch" label for the epoch for compatibility with the pre-data source V2 + // behavior. + printRows(messages, schema, s"Batch: $epochId") + } + + def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} protected def printRows( commitMessages: Array[WriterCommitMessage], schema: StructType, printMessage: String): Unit = { val rows = commitMessages.collect { - case PackedRowCommitMessage(rows) => rows + case PackedRowCommitMessage(rs) => rs }.flatten // scalastyle:off println @@ -59,46 +64,8 @@ trait ConsoleWriter extends Logging { .createDataFrame(spark.sparkContext.parallelize(rows), schema) .show(numRowsToShow, isTruncated) } -} - - -/** - * A [[DataSourceV2Writer]] that collects results from a micro-batch query to the driver and - * prints them in the console. Created by - * [[org.apache.spark.sql.execution.streaming.ConsoleSinkProvider]]. - * - * This sink should not be used for production, as it requires sending all rows to the driver - * and does not support recovery. 
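The consolidated ConsoleWriter keeps the sink's existing options, so console output is requested the same way in both execution modes. A usage sketch: the streaming DataFrame eventsDf is a placeholder, and "truncate" (not shown in this hunk) is assumed to remain the option backing isTruncated.

    val query = eventsDf.writeStream
      .format("console")
      .option("numRows", "5")        // read by ConsoleWriter as numRowsToShow
      .option("truncate", "false")
      .outputMode("append")
      .start()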
- */ -class ConsoleMicroBatchWriter(batchId: Long, schema: StructType, val options: DataSourceV2Options) - extends DataSourceV2Writer with ConsoleWriter { - - override def commit(messages: Array[WriterCommitMessage]): Unit = { - printRows(messages, schema, s"Batch: $batchId") - } - - override def toString(): String = { - s"ConsoleMicroBatchWriter[numRows=$numRowsToShow, truncate=$isTruncated]" - } -} - - -/** - * A [[DataSourceV2Writer]] that collects results from a continuous query to the driver and - * prints them in the console. Created by - * [[org.apache.spark.sql.execution.streaming.ConsoleSinkProvider]]. - * - * This sink should not be used for production, as it requires sending all rows to the driver - * and does not support recovery. - */ -class ConsoleContinuousWriter(schema: StructType, val options: DataSourceV2Options) - extends ContinuousWriter with ConsoleWriter { - - override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = { - printRows(messages, schema, s"Continuous processing epoch $epochId") - } override def toString(): String = { - s"ConsoleContinuousWriter[numRows=$numRowsToShow, truncate=$isTruncated]" + s"ConsoleWriter[numRows=$numRowsToShow, truncate=$isTruncated]" } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/MicroBatchWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/MicroBatchWriter.scala new file mode 100644 index 0000000000000..d7f3ba8856982 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/MicroBatchWriter.scala @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming.sources + +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter +import org.apache.spark.sql.sources.v2.writer.{DataSourceV2Writer, DataWriterFactory, SupportsWriteInternalRow, WriterCommitMessage} + +/** + * A [[DataSourceV2Writer]] used to hook V2 stream writers into a microbatch plan. It implements + * the non-streaming interface, forwarding the batch ID determined at construction to a wrapped + * streaming writer. 
+ */ +class MicroBatchWriter(batchId: Long, writer: StreamWriter) extends DataSourceV2Writer { + override def commit(messages: Array[WriterCommitMessage]): Unit = { + writer.commit(batchId, messages) + } + + override def abort(messages: Array[WriterCommitMessage]): Unit = writer.abort(batchId, messages) + + override def createWriterFactory(): DataWriterFactory[Row] = writer.createWriterFactory() +} + +class InternalRowMicroBatchWriter(batchId: Long, writer: StreamWriter) + extends DataSourceV2Writer with SupportsWriteInternalRow { + override def commit(messages: Array[WriterCommitMessage]): Unit = { + writer.commit(batchId, messages) + } + + override def abort(messages: Array[WriterCommitMessage]): Unit = writer.abort(batchId, messages) + + override def createInternalRowWriterFactory(): DataWriterFactory[InternalRow] = + writer match { + case w: SupportsWriteInternalRow => w.createInternalRowWriterFactory() + case _ => throw new IllegalStateException( + "InternalRowMicroBatchWriter should only be created with base writer support") + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala index da7c31cf62428..ce55e44d932bd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala @@ -30,8 +30,8 @@ import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes.{Append, Complete, Update} import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} -import org.apache.spark.sql.sources.v2.streaming.{ContinuousWriteSupport, MicroBatchWriteSupport} -import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter +import org.apache.spark.sql.sources.v2.streaming.StreamWriteSupport +import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter import org.apache.spark.sql.sources.v2.writer._ import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType @@ -40,24 +40,13 @@ import org.apache.spark.sql.types.StructType * A sink that stores the results in memory. This [[Sink]] is primarily intended for use in unit * tests and does not provide durability. 
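A short sketch of what the new wrappers do, reusing the hypothetical NoopStreamWriter from the earlier sketch: MicroBatchWriter simply pins one batch ID onto the epoch-agnostic DataSourceV2Writer interface so the existing batch write path can drive a StreamWriter.

    val streamWriter = new NoopStreamWriter
    val batchWriter = new MicroBatchWriter(5L, streamWriter)
    // The epoch-less commit below forwards to streamWriter.commit(5L, messages).
    batchWriter.commit(Array.empty[WriterCommitMessage])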
*/ -class MemorySinkV2 extends DataSourceV2 - with MicroBatchWriteSupport with ContinuousWriteSupport with Logging { - - override def createMicroBatchWriter( +class MemorySinkV2 extends DataSourceV2 with StreamWriteSupport with Logging { + override def createStreamWriter( queryId: String, - batchId: Long, schema: StructType, mode: OutputMode, - options: DataSourceV2Options): java.util.Optional[DataSourceV2Writer] = { - java.util.Optional.of(new MemoryWriter(this, batchId, mode)) - } - - override def createContinuousWriter( - queryId: String, - schema: StructType, - mode: OutputMode, - options: DataSourceV2Options): java.util.Optional[ContinuousWriter] = { - java.util.Optional.of(new ContinuousMemoryWriter(this, mode)) + options: DataSourceV2Options): StreamWriter = { + new MemoryStreamWriter(this, mode) } private case class AddedData(batchId: Long, data: Array[Row]) @@ -141,8 +130,8 @@ class MemoryWriter(sink: MemorySinkV2, batchId: Long, outputMode: OutputMode) } } -class ContinuousMemoryWriter(val sink: MemorySinkV2, outputMode: OutputMode) - extends ContinuousWriter { +class MemoryStreamWriter(val sink: MemorySinkV2, outputMode: OutputMode) + extends StreamWriter { override def createWriterFactory: MemoryWriterFactory = MemoryWriterFactory(outputMode) @@ -153,7 +142,7 @@ class ContinuousMemoryWriter(val sink: MemorySinkV2, outputMode: OutputMode) sink.write(epochId, outputMode, newRows) } - override def abort(messages: Array[WriterCommitMessage]): Unit = { + override def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = { // Don't accept any of the new input. } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index d24f0ddeab4de..3b5b30d77945c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger import org.apache.spark.sql.execution.streaming.sources.{MemoryPlanV2, MemorySinkV2} -import org.apache.spark.sql.sources.v2.streaming.{ContinuousWriteSupport, MicroBatchWriteSupport} +import org.apache.spark.sql.sources.v2.streaming.StreamWriteSupport /** * Interface used to write a streaming `Dataset` to external storage systems (e.g. 
file systems, @@ -281,11 +281,9 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { trigger = trigger) } else { val ds = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf) - val sink = (ds.newInstance(), trigger) match { - case (w: ContinuousWriteSupport, _: ContinuousTrigger) => w - case (_, _: ContinuousTrigger) => throw new UnsupportedOperationException( - s"Data source $source does not support continuous writing") - case (w: MicroBatchWriteSupport, _) => w + val disabledSources = df.sparkSession.sqlContext.conf.disabledV2StreamingWriters.split(",") + val sink = ds.newInstance() match { + case w: StreamWriteSupport if !disabledSources.contains(w.getClass.getCanonicalName) => w case _ => val ds = DataSource( df.sparkSession, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index 4b27e0d4ef47b..fdd709cdb1f38 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, ContinuousTrigger} import org.apache.spark.sql.execution.streaming.state.StateStoreCoordinatorRef import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.streaming.{ContinuousWriteSupport, MicroBatchWriteSupport} +import org.apache.spark.sql.sources.v2.streaming.StreamWriteSupport import org.apache.spark.util.{Clock, SystemClock, Utils} /** @@ -241,7 +241,7 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo } (sink, trigger) match { - case (v2Sink: ContinuousWriteSupport, trigger: ContinuousTrigger) => + case (v2Sink: StreamWriteSupport, trigger: ContinuousTrigger) => UnsupportedOperationChecker.checkForContinuous(analyzedPlan, outputMode) new StreamingQueryWrapper(new ContinuousExecution( sparkSession, @@ -254,7 +254,7 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo outputMode, extraOptions, deleteCheckpointOnStop)) - case (_: MicroBatchWriteSupport, _) | (_: Sink, _) => + case _ => new StreamingQueryWrapper(new MicroBatchExecution( sparkSession, userSpecifiedName.orNull, @@ -266,9 +266,6 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo outputMode, extraOptions, deleteCheckpointOnStop)) - case (_: ContinuousWriteSupport, t) if !t.isInstanceOf[ContinuousTrigger] => - throw new AnalysisException( - "Sink only supports continuous writes, but a continuous trigger was not specified.") } } diff --git a/sql/core/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/sql/core/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister index a0b25b4e82364..46b38bed1c0fb 100644 --- a/sql/core/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister +++ b/sql/core/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ -9,7 +9,6 @@ org.apache.spark.sql.streaming.sources.FakeReadMicroBatchOnly org.apache.spark.sql.streaming.sources.FakeReadContinuousOnly org.apache.spark.sql.streaming.sources.FakeReadBothModes org.apache.spark.sql.streaming.sources.FakeReadNeitherMode -org.apache.spark.sql.streaming.sources.FakeWriteMicroBatchOnly 
-org.apache.spark.sql.streaming.sources.FakeWriteContinuousOnly -org.apache.spark.sql.streaming.sources.FakeWriteBothModes -org.apache.spark.sql.streaming.sources.FakeWriteNeitherMode +org.apache.spark.sql.streaming.sources.FakeWrite +org.apache.spark.sql.streaming.sources.FakeNoWrite +org.apache.spark.sql.streaming.sources.FakeWriteV1Fallback diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkV2Suite.scala index 00d4f0b8503d8..9be22d94b5654 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkV2Suite.scala @@ -40,7 +40,7 @@ class MemorySinkV2Suite extends StreamTest with BeforeAndAfter { test("continuous writer") { val sink = new MemorySinkV2 - val writer = new ContinuousMemoryWriter(sink, OutputMode.Append()) + val writer = new MemoryStreamWriter(sink, OutputMode.Append()) writer.commit(0, Array( MemoryWriterCommitMessage(0, Seq(Row(1), Row(2))), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala index f152174b0a7f0..d4f8bae96695d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala @@ -19,18 +19,18 @@ package org.apache.spark.sql.streaming.sources import java.util.Optional -import org.apache.spark.sql.{AnalysisException, Row} +import org.apache.spark.sql.{DataFrame, Row, SQLContext} import org.apache.spark.sql.execution.datasources.DataSource -import org.apache.spark.sql.execution.streaming.{LongOffset, RateStreamOffset} +import org.apache.spark.sql.execution.streaming.{RateStreamOffset, Sink, StreamingQueryWrapper} import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger -import org.apache.spark.sql.sources.DataSourceRegister -import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider} +import org.apache.spark.sql.sources.v2.DataSourceV2Options import org.apache.spark.sql.sources.v2.reader.ReadTask -import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, ContinuousWriteSupport, MicroBatchReadSupport, MicroBatchWriteSupport} +import org.apache.spark.sql.sources.v2.streaming._ import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, MicroBatchReader, Offset, PartitionOffset} -import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter -import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer -import org.apache.spark.sql.streaming.{OutputMode, StreamingQueryException, StreamTest, Trigger} +import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter +import org.apache.spark.sql.streaming.{OutputMode, StreamTest, Trigger} import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -64,23 +64,12 @@ trait FakeContinuousReadSupport extends ContinuousReadSupport { options: DataSourceV2Options): ContinuousReader = FakeReader() } -trait FakeMicroBatchWriteSupport extends MicroBatchWriteSupport { - def createMicroBatchWriter( +trait FakeStreamWriteSupport extends StreamWriteSupport { + override 
def createStreamWriter( queryId: String, - epochId: Long, schema: StructType, mode: OutputMode, - options: DataSourceV2Options): Optional[DataSourceV2Writer] = { - throw new IllegalStateException("fake sink - cannot actually write") - } -} - -trait FakeContinuousWriteSupport extends ContinuousWriteSupport { - def createContinuousWriter( - queryId: String, - schema: StructType, - mode: OutputMode, - options: DataSourceV2Options): Optional[ContinuousWriter] = { + options: DataSourceV2Options): StreamWriter = { throw new IllegalStateException("fake sink - cannot actually write") } } @@ -102,23 +91,36 @@ class FakeReadNeitherMode extends DataSourceRegister { override def shortName(): String = "fake-read-neither-mode" } -class FakeWriteMicroBatchOnly extends DataSourceRegister with FakeMicroBatchWriteSupport { - override def shortName(): String = "fake-write-microbatch-only" +class FakeWrite extends DataSourceRegister with FakeStreamWriteSupport { + override def shortName(): String = "fake-write-microbatch-continuous" } -class FakeWriteContinuousOnly extends DataSourceRegister with FakeContinuousWriteSupport { - override def shortName(): String = "fake-write-continuous-only" +class FakeNoWrite extends DataSourceRegister { + override def shortName(): String = "fake-write-neither-mode" } -class FakeWriteBothModes extends DataSourceRegister - with FakeMicroBatchWriteSupport with FakeContinuousWriteSupport { - override def shortName(): String = "fake-write-microbatch-continuous" + +case class FakeWriteV1FallbackException() extends Exception + +class FakeSink extends Sink { + override def addBatch(batchId: Long, data: DataFrame): Unit = {} } -class FakeWriteNeitherMode extends DataSourceRegister { - override def shortName(): String = "fake-write-neither-mode" +class FakeWriteV1Fallback extends DataSourceRegister + with FakeStreamWriteSupport with StreamSinkProvider { + + override def createSink( + sqlContext: SQLContext, + parameters: Map[String, String], + partitionColumns: Seq[String], + outputMode: OutputMode): Sink = { + new FakeSink() + } + + override def shortName(): String = "fake-write-v1-fallback" } + class StreamingDataSourceV2Suite extends StreamTest { override def beforeAll(): Unit = { @@ -133,8 +135,6 @@ class StreamingDataSourceV2Suite extends StreamTest { "fake-read-microbatch-continuous", "fake-read-neither-mode") val writeFormats = Seq( - "fake-write-microbatch-only", - "fake-write-continuous-only", "fake-write-microbatch-continuous", "fake-write-neither-mode") val triggers = Seq( @@ -151,6 +151,7 @@ class StreamingDataSourceV2Suite extends StreamTest { .trigger(trigger) .start() query.stop() + query } private def testNegativeCase( @@ -184,6 +185,24 @@ class StreamingDataSourceV2Suite extends StreamTest { } } + test("disabled v2 write") { + // Ensure the V2 path works normally and generates a V2 sink.. + val v2Query = testPositiveCase( + "fake-read-microbatch-continuous", "fake-write-v1-fallback", Trigger.Once()) + assert(v2Query.asInstanceOf[StreamingQueryWrapper].streamingQuery.sink + .isInstanceOf[FakeWriteV1Fallback]) + + // Ensure we create a V1 sink with the config. Note the config is a comma separated + // list, including other fake entries. 
+ val fullSinkName = "org.apache.spark.sql.streaming.sources.FakeWriteV1Fallback" + withSQLConf(SQLConf.DISABLED_V2_STREAMING_WRITERS.key -> s"a,b,c,test,$fullSinkName,d,e") { + val v1Query = testPositiveCase( + "fake-read-microbatch-continuous", "fake-write-v1-fallback", Trigger.Once()) + assert(v1Query.asInstanceOf[StreamingQueryWrapper].streamingQuery.sink + .isInstanceOf[FakeSink]) + } + } + // Get a list of (read, write, trigger) tuples for test cases. val cases = readFormats.flatMap { read => writeFormats.flatMap { write => @@ -199,12 +218,12 @@ class StreamingDataSourceV2Suite extends StreamTest { val writeSource = DataSource.lookupDataSource(write, spark.sqlContext.conf).newInstance() (readSource, writeSource, trigger) match { // Valid microbatch queries. - case (_: MicroBatchReadSupport, _: MicroBatchWriteSupport, t) + case (_: MicroBatchReadSupport, _: StreamWriteSupport, t) if !t.isInstanceOf[ContinuousTrigger] => testPositiveCase(read, write, trigger) // Valid continuous queries. - case (_: ContinuousReadSupport, _: ContinuousWriteSupport, _: ContinuousTrigger) => + case (_: ContinuousReadSupport, _: StreamWriteSupport, _: ContinuousTrigger) => testPositiveCase(read, write, trigger) // Invalid - can't read at all @@ -214,31 +233,18 @@ class StreamingDataSourceV2Suite extends StreamTest { testNegativeCase(read, write, trigger, s"Data source $read does not support streamed reading") - // Invalid - trigger is continuous but writer is not - case (_, w, _: ContinuousTrigger) if !w.isInstanceOf[ContinuousWriteSupport] => - testNegativeCase(read, write, trigger, - s"Data source $write does not support continuous writing") - - // Invalid - can't write at all - case (_, w, _) - if !w.isInstanceOf[MicroBatchWriteSupport] - && !w.isInstanceOf[ContinuousWriteSupport] => + // Invalid - can't write + case (_, w, _) if !w.isInstanceOf[StreamWriteSupport] => testNegativeCase(read, write, trigger, s"Data source $write does not support streamed writing") - // Invalid - trigger and writer are continuous but reader is not - case (r, _: ContinuousWriteSupport, _: ContinuousTrigger) + // Invalid - trigger is continuous but reader is not + case (r, _: StreamWriteSupport, _: ContinuousTrigger) if !r.isInstanceOf[ContinuousReadSupport] => testNegativeCase(read, write, trigger, s"Data source $read does not support continuous processing") - // Invalid - trigger is microbatch but writer is not - case (_, w, t) - if !w.isInstanceOf[MicroBatchWriteSupport] && !t.isInstanceOf[ContinuousTrigger] => - testNegativeCase(read, write, trigger, - s"Data source $write does not support streamed writing") - - // Invalid - trigger and writer are microbatch but reader is not + // Invalid - trigger is microbatch but reader is not case (r, _, t) if !r.isInstanceOf[MicroBatchReadSupport] && !t.isInstanceOf[ContinuousTrigger] => testPostCreationNegativeCase(read, write, trigger, From 5dda5db1229a20d7e3b0caab144af16da0787d56 Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Sun, 28 Jan 2018 23:13:30 -0800 Subject: [PATCH 202/566] [SPARK-23020] Ignore Flaky Test: SparkLauncherSuite.testInProcessLauncher in Spark 2.3 --- .../java/org/apache/spark/launcher/SparkLauncherSuite.java | 4 +++- .../apache/spark/sql/execution/UnsafeExternalRowSorter.java | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java index 1543f4fdb0162..4c85a8b56141a 100644 --- 
a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java +++ b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java @@ -25,6 +25,7 @@ import java.util.Map; import java.util.Properties; +import org.junit.Ignore; import org.junit.Test; import static org.junit.Assert.*; import static org.junit.Assume.*; @@ -121,7 +122,8 @@ public void testChildProcLauncher() throws Exception { assertEquals(0, app.waitFor()); } - @Test + // TODO: [SPARK-23020] Re-enable this + @Ignore public void testInProcessLauncher() throws Exception { // Because this test runs SparkLauncher in process and in client mode, it pollutes the system // properties, and that can cause test failures down the test pipeline. So restore the original diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java index 78647b56d621f..1b2f5eee5ccdd 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java @@ -20,7 +20,6 @@ import java.io.IOException; import java.util.function.Supplier; -import org.apache.spark.sql.catalyst.util.TypeUtils; import scala.collection.AbstractIterator; import scala.collection.Iterator; import scala.math.Ordering; From 8229e155d84cf02479c5dd0df6d577aff5075c00 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 29 Jan 2018 21:09:05 +0900 Subject: [PATCH 203/566] [SPARK-23238][SQL] Externalize SQLConf configurations exposed in documentation ## What changes were proposed in this pull request? This PR proposes to expose few internal configurations found in the documentation. Also it fixes the description for `spark.sql.execution.arrow.enabled`. It's quite self-explanatory. ## How was this patch tested? N/A Author: hyukjinkwon Closes #20403 from HyukjinKwon/minor-doc-arrow. (cherry picked from commit 39d2c6b03488895a0acb1dd3c46329db00fdd357) Signed-off-by: hyukjinkwon --- .../org/apache/spark/sql/internal/SQLConf.scala | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 2c70b004bcff9..61ea03d395afc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -123,14 +123,12 @@ object SQLConf { .createWithDefault(10) val COMPRESS_CACHED = buildConf("spark.sql.inMemoryColumnarStorage.compressed") - .internal() .doc("When set to true Spark SQL will automatically select a compression codec for each " + "column based on statistics of the data.") .booleanConf .createWithDefault(true) val COLUMN_BATCH_SIZE = buildConf("spark.sql.inMemoryColumnarStorage.batchSize") - .internal() .doc("Controls the size of batches for columnar caching. Larger batch sizes can improve " + "memory utilization and compression, but risk OOMs when caching data.") .intConf @@ -1043,11 +1041,11 @@ object SQLConf { val ARROW_EXECUTION_ENABLE = buildConf("spark.sql.execution.arrow.enabled") - .internal() - .doc("Make use of Apache Arrow for columnar data transfers. 
Currently available " + - "for use with pyspark.sql.DataFrame.toPandas with the following data types: " + - "StringType, BinaryType, BooleanType, DoubleType, FloatType, ByteType, IntegerType, " + - "LongType, ShortType") + .doc("When true, make use of Apache Arrow for columnar data transfers. Currently available " + + "for use with pyspark.sql.DataFrame.toPandas, and " + + "pyspark.sql.SparkSession.createDataFrame when its input is a Pandas DataFrame. " + + "The following data types are unsupported: " + + "MapType, ArrayType of TimestampType, and nested StructType.") .booleanConf .createWithDefault(false) From de66abafcf081c5cc4d1556d21c8ec21e1fefdf5 Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Tue, 30 Jan 2018 00:50:49 +0800 Subject: [PATCH 204/566] [SPARK-23219][SQL] Rename ReadTask to DataReaderFactory in data source v2 ## What changes were proposed in this pull request? Currently we have `ReadTask` in data source v2 reader, while in writer we have `DataWriterFactory`. To make the naming consistent and better, renaming `ReadTask` to `DataReaderFactory`. ## How was this patch tested? Unit test Author: Wang Gengliang Closes #20397 from gengliangwang/rename. (cherry picked from commit badf0d0e0d1d9aa169ed655176ce9ae684d3905d) Signed-off-by: Wenchen Fan --- .../sql/kafka010/KafkaContinuousReader.scala | 16 ++--- .../v2/reader/ClusteredDistribution.java | 2 +- .../sql/sources/v2/reader/DataReader.java | 2 +- .../{ReadTask.java => DataReaderFactory.java} | 22 +++---- .../sources/v2/reader/DataSourceV2Reader.java | 11 ++-- .../sql/sources/v2/reader/Distribution.java | 6 +- .../sql/sources/v2/reader/Partitioning.java | 2 +- .../v2/reader/SupportsScanColumnarBatch.java | 11 ++-- .../v2/reader/SupportsScanUnsafeRow.java | 9 +-- .../v2/streaming/MicroBatchReadSupport.java | 4 +- .../v2/streaming/reader/ContinuousReader.java | 14 ++--- .../v2/streaming/reader/MicroBatchReader.java | 6 +- .../datasources/v2/DataSourceRDD.scala | 14 ++--- .../datasources/v2/DataSourceV2ScanExec.scala | 25 ++++---- .../ContinuousDataSourceRDDIter.scala | 11 ++-- .../ContinuousRateStreamSource.scala | 10 ++-- .../sources/RateStreamSourceV2.scala | 6 +- .../sources/v2/JavaAdvancedDataSourceV2.java | 20 +++---- .../sql/sources/v2/JavaBatchDataSourceV2.java | 10 ++-- .../v2/JavaPartitionAwareDataSource.java | 10 ++-- .../v2/JavaSchemaRequiredDataSource.java | 4 +- .../sources/v2/JavaSimpleDataSourceV2.java | 14 ++--- .../sources/v2/JavaUnsafeRowDataSourceV2.java | 13 ++-- .../streaming/RateSourceV2Suite.scala | 10 ++-- .../sql/sources/v2/DataSourceV2Suite.scala | 59 ++++++++++--------- .../sources/v2/SimpleWritableDataSource.scala | 12 ++-- .../sources/StreamingDataSourceV2Suite.scala | 4 +- 27 files changed, 172 insertions(+), 155 deletions(-) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/{ReadTask.java => DataReaderFactory.java} (65%) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala index fc977977504f7..9125cf5799d74 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala @@ -63,7 +63,7 @@ class KafkaContinuousReader( private val pollTimeoutMs = sourceOptions.getOrElse("kafkaConsumer.pollTimeoutMs", "512").toLong - // Initialized when creating read tasks. 
If this diverges from the partitions at the latest + // Initialized when creating reader factories. If this diverges from the partitions at the latest // offsets, we need to reconfigure. // Exposed outside this object only for unit tests. private[sql] var knownPartitions: Set[TopicPartition] = _ @@ -89,7 +89,7 @@ class KafkaContinuousReader( KafkaSourceOffset(JsonUtils.partitionOffsets(json)) } - override def createUnsafeRowReadTasks(): ju.List[ReadTask[UnsafeRow]] = { + override def createUnsafeRowReaderFactories(): ju.List[DataReaderFactory[UnsafeRow]] = { import scala.collection.JavaConverters._ val oldStartPartitionOffsets = KafkaSourceOffset.getPartitionOffsets(offset) @@ -109,9 +109,9 @@ class KafkaContinuousReader( startOffsets.toSeq.map { case (topicPartition, start) => - KafkaContinuousReadTask( + KafkaContinuousDataReaderFactory( topicPartition, start, kafkaParams, pollTimeoutMs, failOnDataLoss) - .asInstanceOf[ReadTask[UnsafeRow]] + .asInstanceOf[DataReaderFactory[UnsafeRow]] }.asJava } @@ -149,8 +149,8 @@ class KafkaContinuousReader( } /** - * A read task for continuous Kafka processing. This will be serialized and transformed into a - * full reader on executors. + * A data reader factory for continuous Kafka processing. This will be serialized and transformed + * into a full reader on executors. * * @param topicPartition The (topic, partition) pair this task is responsible for. * @param startOffset The offset to start reading from within the partition. @@ -159,12 +159,12 @@ class KafkaContinuousReader( * @param failOnDataLoss Flag indicating whether data reader should fail if some offsets * are skipped. */ -case class KafkaContinuousReadTask( +case class KafkaContinuousDataReaderFactory( topicPartition: TopicPartition, startOffset: Long, kafkaParams: ju.Map[String, Object], pollTimeoutMs: Long, - failOnDataLoss: Boolean) extends ReadTask[UnsafeRow] { + failOnDataLoss: Boolean) extends DataReaderFactory[UnsafeRow] { override def createDataReader(): KafkaContinuousDataReader = { new KafkaContinuousDataReader( topicPartition, startOffset, kafkaParams, pollTimeoutMs, failOnDataLoss) diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ClusteredDistribution.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ClusteredDistribution.java index 7346500de45b6..27905e325df87 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ClusteredDistribution.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ClusteredDistribution.java @@ -22,7 +22,7 @@ /** * A concrete implementation of {@link Distribution}. Represents a distribution where records that * share the same values for the {@link #clusteredColumns} will be produced by the same - * {@link ReadTask}. + * {@link DataReader}. 
*/ @InterfaceStability.Evolving public class ClusteredDistribution implements Distribution { diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReader.java index 8f58c865b6201..bb9790a1c819e 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReader.java @@ -23,7 +23,7 @@ import org.apache.spark.annotation.InterfaceStability; /** - * A data reader returned by {@link ReadTask#createDataReader()} and is responsible for + * A data reader returned by {@link DataReaderFactory#createDataReader()} and is responsible for * outputting data for a RDD partition. * * Note that, Currently the type `T` can only be {@link org.apache.spark.sql.Row} for normal data diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ReadTask.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReaderFactory.java similarity index 65% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ReadTask.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReaderFactory.java index fa161cdb8b347..077b95b837964 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ReadTask.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReaderFactory.java @@ -22,21 +22,23 @@ import org.apache.spark.annotation.InterfaceStability; /** - * A read task returned by {@link DataSourceV2Reader#createReadTasks()} and is responsible for - * creating the actual data reader. The relationship between {@link ReadTask} and {@link DataReader} + * A reader factory returned by {@link DataSourceV2Reader#createDataReaderFactories()} and is + * responsible for creating the actual data reader. The relationship between + * {@link DataReaderFactory} and {@link DataReader} * is similar to the relationship between {@link Iterable} and {@link java.util.Iterator}. * - * Note that, the read task will be serialized and sent to executors, then the data reader will be - * created on executors and do the actual reading. So {@link ReadTask} must be serializable and - * {@link DataReader} doesn't need to be. + * Note that, the reader factory will be serialized and sent to executors, then the data reader + * will be created on executors and do the actual reading. So {@link DataReaderFactory} must be + * serializable and {@link DataReader} doesn't need to be. */ @InterfaceStability.Evolving -public interface ReadTask extends Serializable { +public interface DataReaderFactory extends Serializable { /** - * The preferred locations where this read task can run faster, but Spark does not guarantee that - * this task will always run on these locations. The implementations should make sure that it can - * be run on any location. The location is a string representing the host name. + * The preferred locations where the data reader returned by this reader factory can run faster, + * but Spark does not guarantee to run the data reader on these locations. + * The implementations should make sure that it can be run on any location. + * The location is a string representing the host name. * * Note that if a host name cannot be recognized by Spark, it will be ignored as it was not in * the returned locations. 
By default this method returns empty string array, which means this @@ -50,7 +52,7 @@ default String[] preferredLocations() { } /** - * Returns a data reader to do the actual reading work for this read task. + * Returns a data reader to do the actual reading work. * * If this method fails (by throwing an exception), the corresponding Spark task would fail and * get retried until hitting the maximum retry times. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java index f23c3842bf1b1..0180cd9ea47f8 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java @@ -30,7 +30,8 @@ * {@link org.apache.spark.sql.sources.v2.ReadSupportWithSchema#createReader( * StructType, org.apache.spark.sql.sources.v2.DataSourceV2Options)}. * It can mix in various query optimization interfaces to speed up the data scan. The actual scan - * logic is delegated to {@link ReadTask}s that are returned by {@link #createReadTasks()}. + * logic is delegated to {@link DataReaderFactory}s that are returned by + * {@link #createDataReaderFactories()}. * * There are mainly 3 kinds of query optimizations: * 1. Operators push-down. E.g., filter push-down, required columns push-down(aka column @@ -63,9 +64,9 @@ public interface DataSourceV2Reader { StructType readSchema(); /** - * Returns a list of read tasks. Each task is responsible for outputting data for one RDD - * partition. That means the number of tasks returned here is same as the number of RDD - * partitions this scan outputs. + * Returns a list of reader factories. Each factory is responsible for creating a data reader to + * output data for one RDD partition. That means the number of factories returned here is same as + * the number of RDD partitions this scan outputs. * * Note that, this may not be a full scan if the data source reader mixes in other optimization * interfaces like column pruning, filter push-down, etc. These optimizations are applied before @@ -74,5 +75,5 @@ public interface DataSourceV2Reader { * If this method fails (by throwing an exception), the action would fail and no Spark job was * submitted. */ - List> createReadTasks(); + List> createDataReaderFactories(); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Distribution.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Distribution.java index a6201a222f541..b37562167d9ef 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Distribution.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Distribution.java @@ -21,9 +21,9 @@ /** * An interface to represent data distribution requirement, which specifies how the records should - * be distributed among the {@link ReadTask}s that are returned by - * {@link DataSourceV2Reader#createReadTasks()}. Note that this interface has nothing to do with - * the data ordering inside one partition(the output records of a single {@link ReadTask}). + * be distributed among the data partitions(one {@link DataReader} outputs data for one partition). + * Note that this interface has nothing to do with the data ordering inside one + * partition(the output records of a single {@link DataReader}). 
* * The instance of this interface is created and provided by Spark, then consumed by * {@link Partitioning#satisfy(Distribution)}. This means data source developers don't need to diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Partitioning.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Partitioning.java index 199e45d4a02ab..5e334d13a1215 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Partitioning.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Partitioning.java @@ -29,7 +29,7 @@ public interface Partitioning { /** - * Returns the number of partitions(i.e., {@link ReadTask}s) the data source outputs. + * Returns the number of partitions(i.e., {@link DataReaderFactory}s) the data source outputs. */ int numPartitions(); diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanColumnarBatch.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanColumnarBatch.java index 27cf3a77724f0..67da55554bbf3 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanColumnarBatch.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanColumnarBatch.java @@ -30,21 +30,22 @@ @InterfaceStability.Evolving public interface SupportsScanColumnarBatch extends DataSourceV2Reader { @Override - default List> createReadTasks() { + default List> createDataReaderFactories() { throw new IllegalStateException( - "createReadTasks not supported by default within SupportsScanColumnarBatch."); + "createDataReaderFactories not supported by default within SupportsScanColumnarBatch."); } /** - * Similar to {@link DataSourceV2Reader#createReadTasks()}, but returns columnar data in batches. + * Similar to {@link DataSourceV2Reader#createDataReaderFactories()}, but returns columnar data + * in batches. */ - List> createBatchReadTasks(); + List> createBatchDataReaderFactories(); /** * Returns true if the concrete data source reader can read data in batch according to the scan * properties like required columns, pushes filters, etc. It's possible that the implementation * can only support some certain columns with certain types. Users can overwrite this method and - * {@link #createReadTasks()} to fallback to normal read path under some conditions. + * {@link #createDataReaderFactories()} to fallback to normal read path under some conditions. */ default boolean enableBatchRead() { return true; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java index 2d3ad0eee65ff..156af69520f77 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java @@ -33,13 +33,14 @@ public interface SupportsScanUnsafeRow extends DataSourceV2Reader { @Override - default List> createReadTasks() { + default List> createDataReaderFactories() { throw new IllegalStateException( - "createReadTasks not supported by default within SupportsScanUnsafeRow"); + "createDataReaderFactories not supported by default within SupportsScanUnsafeRow"); } /** - * Similar to {@link DataSourceV2Reader#createReadTasks()}, but returns data in unsafe row format. + * Similar to {@link DataSourceV2Reader#createDataReaderFactories()}, + * but returns data in unsafe row format. 
*/ - List> createUnsafeRowReadTasks(); + List> createUnsafeRowReaderFactories(); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchReadSupport.java index 3c87a3db68243..3b357c01a29fe 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchReadSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchReadSupport.java @@ -36,8 +36,8 @@ public interface MicroBatchReadSupport extends DataSourceV2 { * streaming query. * * The execution engine will create a micro-batch reader at the start of a streaming query, - * alternate calls to setOffsetRange and createReadTasks for each batch to process, and then - * call stop() when the execution is complete. Note that a single query may have multiple + * alternate calls to setOffsetRange and createDataReaderFactories for each batch to process, and + * then call stop() when the execution is complete. Note that a single query may have multiple * executions due to restart or failure recovery. * * @param schema the user provided schema, or empty() if none was provided diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java index 745f1ce502443..3ac979cb0b7b4 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java @@ -27,7 +27,7 @@ * A mix-in interface for {@link DataSourceV2Reader}. Data source readers can implement this * interface to allow reading in a continuous processing mode stream. * - * Implementations must ensure each read task output is a {@link ContinuousDataReader}. + * Implementations must ensure each reader factory output is a {@link ContinuousDataReader}. * * Note: This class currently extends {@link BaseStreamingSource} to maintain compatibility with * DataSource V1 APIs. This extension will be removed once we get rid of V1 completely. @@ -47,9 +47,9 @@ public interface ContinuousReader extends BaseStreamingSource, DataSourceV2Reade Offset deserializeOffset(String json); /** - * Set the desired start offset for read tasks created from this reader. The scan will start - * from the first record after the provided offset, or from an implementation-defined inferred - * starting point if no offset is provided. + * Set the desired start offset for reader factories created from this reader. The scan will + * start from the first record after the provided offset, or from an implementation-defined + * inferred starting point if no offset is provided. */ void setOffset(Optional start); @@ -61,9 +61,9 @@ public interface ContinuousReader extends BaseStreamingSource, DataSourceV2Reade Offset getStartOffset(); /** - * The execution engine will call this method in every epoch to determine if new read tasks need - * to be generated, which may be required if for example the underlying source system has had - * partitions added or removed. + * The execution engine will call this method in every epoch to determine if new reader + * factories need to be generated, which may be required if for example the underlying + * source system has had partitions added or removed. * * If true, the query will be shut down and restarted with a new reader. 
*/ diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java index 02f37cebc7484..68887e569fc1d 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java @@ -33,9 +33,9 @@ @InterfaceStability.Evolving public interface MicroBatchReader extends DataSourceV2Reader, BaseStreamingSource { /** - * Set the desired offset range for read tasks created from this reader. Read tasks will - * generate only data within (`start`, `end`]; that is, from the first record after `start` to - * the record with offset `end`. + * Set the desired offset range for reader factories created from this reader. Reader factories + * will generate only data within (`start`, `end`]; that is, from the first record after `start` + * to the record with offset `end`. * * @param start The initial offset to scan from. If not specified, scan from an * implementation-specified start point, such as the earliest available record. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala index ac104d7cd0cb3..5ed0ba71e94c7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala @@ -22,24 +22,24 @@ import scala.reflect.ClassTag import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.sources.v2.reader.ReadTask +import org.apache.spark.sql.sources.v2.reader.DataReaderFactory -class DataSourceRDDPartition[T : ClassTag](val index: Int, val readTask: ReadTask[T]) +class DataSourceRDDPartition[T : ClassTag](val index: Int, val readerFactory: DataReaderFactory[T]) extends Partition with Serializable class DataSourceRDD[T: ClassTag]( sc: SparkContext, - @transient private val readTasks: java.util.List[ReadTask[T]]) + @transient private val readerFactories: java.util.List[DataReaderFactory[T]]) extends RDD[T](sc, Nil) { override protected def getPartitions: Array[Partition] = { - readTasks.asScala.zipWithIndex.map { - case (readTask, index) => new DataSourceRDDPartition(index, readTask) + readerFactories.asScala.zipWithIndex.map { + case (readerFactory, index) => new DataSourceRDDPartition(index, readerFactory) }.toArray } override def compute(split: Partition, context: TaskContext): Iterator[T] = { - val reader = split.asInstanceOf[DataSourceRDDPartition[T]].readTask.createDataReader() + val reader = split.asInstanceOf[DataSourceRDDPartition[T]].readerFactory.createDataReader() context.addTaskCompletionListener(_ => reader.close()) val iter = new Iterator[T] { private[this] var valuePrepared = false @@ -63,6 +63,6 @@ class DataSourceRDD[T: ClassTag]( } override def getPreferredLocations(split: Partition): Seq[String] = { - split.asInstanceOf[DataSourceRDDPartition[T]].readTask.preferredLocations() + split.asInstanceOf[DataSourceRDDPartition[T]].readerFactory.preferredLocations() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala 
index 2c22239e81869..3f808fbb40932 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala @@ -51,11 +51,11 @@ case class DataSourceV2ScanExec( case _ => super.outputPartitioning } - private lazy val readTasks: java.util.List[ReadTask[UnsafeRow]] = reader match { - case r: SupportsScanUnsafeRow => r.createUnsafeRowReadTasks() + private lazy val readerFactories: java.util.List[DataReaderFactory[UnsafeRow]] = reader match { + case r: SupportsScanUnsafeRow => r.createUnsafeRowReaderFactories() case _ => - reader.createReadTasks().asScala.map { - new RowToUnsafeRowReadTask(_, reader.readSchema()): ReadTask[UnsafeRow] + reader.createDataReaderFactories().asScala.map { + new RowToUnsafeRowDataReaderFactory(_, reader.readSchema()): DataReaderFactory[UnsafeRow] }.asJava } @@ -63,18 +63,19 @@ case class DataSourceV2ScanExec( case r: SupportsScanColumnarBatch if r.enableBatchRead() => assert(!reader.isInstanceOf[ContinuousReader], "continuous stream reader does not support columnar read yet.") - new DataSourceRDD(sparkContext, r.createBatchReadTasks()).asInstanceOf[RDD[InternalRow]] + new DataSourceRDD(sparkContext, r.createBatchDataReaderFactories()) + .asInstanceOf[RDD[InternalRow]] case _: ContinuousReader => EpochCoordinatorRef.get( sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), sparkContext.env) - .askSync[Unit](SetReaderPartitions(readTasks.size())) - new ContinuousDataSourceRDD(sparkContext, sqlContext, readTasks) + .askSync[Unit](SetReaderPartitions(readerFactories.size())) + new ContinuousDataSourceRDD(sparkContext, sqlContext, readerFactories) .asInstanceOf[RDD[InternalRow]] case _ => - new DataSourceRDD(sparkContext, readTasks).asInstanceOf[RDD[InternalRow]] + new DataSourceRDD(sparkContext, readerFactories).asInstanceOf[RDD[InternalRow]] } override def inputRDDs(): Seq[RDD[InternalRow]] = Seq(inputRDD) @@ -99,14 +100,14 @@ case class DataSourceV2ScanExec( } } -class RowToUnsafeRowReadTask(rowReadTask: ReadTask[Row], schema: StructType) - extends ReadTask[UnsafeRow] { +class RowToUnsafeRowDataReaderFactory(rowReaderFactory: DataReaderFactory[Row], schema: StructType) + extends DataReaderFactory[UnsafeRow] { - override def preferredLocations: Array[String] = rowReadTask.preferredLocations + override def preferredLocations: Array[String] = rowReaderFactory.preferredLocations override def createDataReader: DataReader[UnsafeRow] = { new RowToUnsafeDataReader( - rowReadTask.createDataReader, RowEncoder.apply(schema).resolveAndBind()) + rowReaderFactory.createDataReader, RowEncoder.apply(schema).resolveAndBind()) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala index cd7065f5e6601..8a7a38b22caca 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala @@ -39,15 +39,15 @@ import org.apache.spark.util.{SystemClock, ThreadUtils} class ContinuousDataSourceRDD( sc: SparkContext, sqlContext: SQLContext, - @transient private val readTasks: java.util.List[ReadTask[UnsafeRow]]) + @transient private val readerFactories: 
java.util.List[DataReaderFactory[UnsafeRow]]) extends RDD[UnsafeRow](sc, Nil) { private val dataQueueSize = sqlContext.conf.continuousStreamingExecutorQueueSize private val epochPollIntervalMs = sqlContext.conf.continuousStreamingExecutorPollIntervalMs override protected def getPartitions: Array[Partition] = { - readTasks.asScala.zipWithIndex.map { - case (readTask, index) => new DataSourceRDDPartition(index, readTask) + readerFactories.asScala.zipWithIndex.map { + case (readerFactory, index) => new DataSourceRDDPartition(index, readerFactory) }.toArray } @@ -57,7 +57,8 @@ class ContinuousDataSourceRDD( throw new ContinuousTaskRetryException() } - val reader = split.asInstanceOf[DataSourceRDDPartition[UnsafeRow]].readTask.createDataReader() + val reader = split.asInstanceOf[DataSourceRDDPartition[UnsafeRow]] + .readerFactory.createDataReader() val coordinatorId = context.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY) @@ -136,7 +137,7 @@ class ContinuousDataSourceRDD( } override def getPreferredLocations(split: Partition): Seq[String] = { - split.asInstanceOf[DataSourceRDDPartition[UnsafeRow]].readTask.preferredLocations() + split.asInstanceOf[DataSourceRDDPartition[UnsafeRow]].readerFactory.preferredLocations() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala index b4b21e7d2052f..61304480f4721 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala @@ -68,7 +68,7 @@ class RateStreamContinuousReader(options: DataSourceV2Options) override def getStartOffset(): Offset = offset - override def createReadTasks(): java.util.List[ReadTask[Row]] = { + override def createDataReaderFactories(): java.util.List[DataReaderFactory[Row]] = { val partitionStartMap = offset match { case off: RateStreamOffset => off.partitionToValueAndRunTimeMs case off => @@ -86,13 +86,13 @@ class RateStreamContinuousReader(options: DataSourceV2Options) val start = partitionStartMap(i) // Have each partition advance by numPartitions each row, with starting points staggered // by their partition index. 
- RateStreamContinuousReadTask( + RateStreamContinuousDataReaderFactory( start.value, start.runTimeMs, i, numPartitions, perPartitionRate) - .asInstanceOf[ReadTask[Row]] + .asInstanceOf[DataReaderFactory[Row]] }.asJava } @@ -101,13 +101,13 @@ class RateStreamContinuousReader(options: DataSourceV2Options) } -case class RateStreamContinuousReadTask( +case class RateStreamContinuousDataReaderFactory( startValue: Long, startTimeMs: Long, partitionIndex: Int, increment: Long, rowsPerSecond: Double) - extends ReadTask[Row] { + extends DataReaderFactory[Row] { override def createDataReader(): DataReader[Row] = new RateStreamContinuousDataReader( startValue, startTimeMs, partitionIndex, increment, rowsPerSecond) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala index c0ed12cec25ef..a25cc4f3b06f8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala @@ -123,7 +123,7 @@ class RateStreamMicroBatchReader(options: DataSourceV2Options) RateStreamOffset(Serialization.read[Map[Int, ValueRunTimeMsPair]](json)) } - override def createReadTasks(): java.util.List[ReadTask[Row]] = { + override def createDataReaderFactories(): java.util.List[DataReaderFactory[Row]] = { val startMap = start.partitionToValueAndRunTimeMs val endMap = end.partitionToValueAndRunTimeMs endMap.keys.toSeq.map { part => @@ -139,7 +139,7 @@ class RateStreamMicroBatchReader(options: DataSourceV2Options) outTimeMs += msPerPartitionBetweenRows } - RateStreamBatchTask(packedRows).asInstanceOf[ReadTask[Row]] + RateStreamBatchTask(packedRows).asInstanceOf[DataReaderFactory[Row]] }.toList.asJava } @@ -147,7 +147,7 @@ class RateStreamMicroBatchReader(options: DataSourceV2Options) override def stop(): Unit = {} } -case class RateStreamBatchTask(vals: Seq[(Long, Long)]) extends ReadTask[Row] { +case class RateStreamBatchTask(vals: Seq[(Long, Long)]) extends DataReaderFactory[Row] { override def createDataReader(): DataReader[Row] = new RateStreamBatchReader(vals) } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java index 1cfdc08217e6e..4026ee44bfdb7 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java @@ -60,8 +60,8 @@ public Filter[] pushedFilters() { } @Override - public List> createReadTasks() { - List> res = new ArrayList<>(); + public List> createDataReaderFactories() { + List> res = new ArrayList<>(); Integer lowerBound = null; for (Filter filter : filters) { @@ -75,25 +75,25 @@ public List> createReadTasks() { } if (lowerBound == null) { - res.add(new JavaAdvancedReadTask(0, 5, requiredSchema)); - res.add(new JavaAdvancedReadTask(5, 10, requiredSchema)); + res.add(new JavaAdvancedDataReaderFactory(0, 5, requiredSchema)); + res.add(new JavaAdvancedDataReaderFactory(5, 10, requiredSchema)); } else if (lowerBound < 4) { - res.add(new JavaAdvancedReadTask(lowerBound + 1, 5, requiredSchema)); - res.add(new JavaAdvancedReadTask(5, 10, requiredSchema)); + res.add(new JavaAdvancedDataReaderFactory(lowerBound + 1, 5, requiredSchema)); + res.add(new 
JavaAdvancedDataReaderFactory(5, 10, requiredSchema)); } else if (lowerBound < 9) { - res.add(new JavaAdvancedReadTask(lowerBound + 1, 10, requiredSchema)); + res.add(new JavaAdvancedDataReaderFactory(lowerBound + 1, 10, requiredSchema)); } return res; } } - static class JavaAdvancedReadTask implements ReadTask, DataReader { + static class JavaAdvancedDataReaderFactory implements DataReaderFactory, DataReader { private int start; private int end; private StructType requiredSchema; - JavaAdvancedReadTask(int start, int end, StructType requiredSchema) { + JavaAdvancedDataReaderFactory(int start, int end, StructType requiredSchema) { this.start = start; this.end = end; this.requiredSchema = requiredSchema; @@ -101,7 +101,7 @@ static class JavaAdvancedReadTask implements ReadTask, DataReader { @Override public DataReader createDataReader() { - return new JavaAdvancedReadTask(start - 1, end, requiredSchema); + return new JavaAdvancedDataReaderFactory(start - 1, end, requiredSchema); } @Override diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java index a5d77a90ece42..34e6c63801064 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java @@ -42,12 +42,14 @@ public StructType readSchema() { } @Override - public List> createBatchReadTasks() { - return java.util.Arrays.asList(new JavaBatchReadTask(0, 50), new JavaBatchReadTask(50, 90)); + public List> createBatchDataReaderFactories() { + return java.util.Arrays.asList( + new JavaBatchDataReaderFactory(0, 50), new JavaBatchDataReaderFactory(50, 90)); } } - static class JavaBatchReadTask implements ReadTask, DataReader { + static class JavaBatchDataReaderFactory + implements DataReaderFactory, DataReader { private int start; private int end; @@ -57,7 +59,7 @@ static class JavaBatchReadTask implements ReadTask, DataReader> createReadTasks() { + public List> createDataReaderFactories() { return java.util.Arrays.asList( - new SpecificReadTask(new int[]{1, 1, 3}, new int[]{4, 4, 6}), - new SpecificReadTask(new int[]{2, 4, 4}, new int[]{6, 2, 2})); + new SpecificDataReaderFactory(new int[]{1, 1, 3}, new int[]{4, 4, 6}), + new SpecificDataReaderFactory(new int[]{2, 4, 4}, new int[]{6, 2, 2})); } @Override @@ -70,12 +70,12 @@ public boolean satisfy(Distribution distribution) { } } - static class SpecificReadTask implements ReadTask, DataReader { + static class SpecificDataReaderFactory implements DataReaderFactory, DataReader { private int[] i; private int[] j; private int current = -1; - SpecificReadTask(int[] i, int[] j) { + SpecificDataReaderFactory(int[] i, int[] j) { assert i.length == j.length; this.i = i; this.j = j; diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSchemaRequiredDataSource.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSchemaRequiredDataSource.java index a174bd8092cbd..f997366af1a64 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSchemaRequiredDataSource.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSchemaRequiredDataSource.java @@ -24,7 +24,7 @@ import org.apache.spark.sql.sources.v2.DataSourceV2Options; import org.apache.spark.sql.sources.v2.ReadSupportWithSchema; import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader; -import 
org.apache.spark.sql.sources.v2.reader.ReadTask; +import org.apache.spark.sql.sources.v2.reader.DataReaderFactory; import org.apache.spark.sql.types.StructType; public class JavaSchemaRequiredDataSource implements DataSourceV2, ReadSupportWithSchema { @@ -42,7 +42,7 @@ public StructType readSchema() { } @Override - public List> createReadTasks() { + public List> createDataReaderFactories() { return java.util.Collections.emptyList(); } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleDataSourceV2.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleDataSourceV2.java index 2d458b7f7e906..2beed431d301f 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleDataSourceV2.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleDataSourceV2.java @@ -26,7 +26,7 @@ import org.apache.spark.sql.sources.v2.DataSourceV2Options; import org.apache.spark.sql.sources.v2.ReadSupport; import org.apache.spark.sql.sources.v2.reader.DataReader; -import org.apache.spark.sql.sources.v2.reader.ReadTask; +import org.apache.spark.sql.sources.v2.reader.DataReaderFactory; import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader; import org.apache.spark.sql.types.StructType; @@ -41,25 +41,25 @@ public StructType readSchema() { } @Override - public List> createReadTasks() { + public List> createDataReaderFactories() { return java.util.Arrays.asList( - new JavaSimpleReadTask(0, 5), - new JavaSimpleReadTask(5, 10)); + new JavaSimpleDataReaderFactory(0, 5), + new JavaSimpleDataReaderFactory(5, 10)); } } - static class JavaSimpleReadTask implements ReadTask, DataReader { + static class JavaSimpleDataReaderFactory implements DataReaderFactory, DataReader { private int start; private int end; - JavaSimpleReadTask(int start, int end) { + JavaSimpleDataReaderFactory(int start, int end) { this.start = start; this.end = end; } @Override public DataReader createDataReader() { - return new JavaSimpleReadTask(start - 1, end); + return new JavaSimpleDataReaderFactory(start - 1, end); } @Override diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaUnsafeRowDataSourceV2.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaUnsafeRowDataSourceV2.java index f6aa00869a681..e8187524ea871 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaUnsafeRowDataSourceV2.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaUnsafeRowDataSourceV2.java @@ -38,19 +38,20 @@ public StructType readSchema() { } @Override - public List> createUnsafeRowReadTasks() { + public List> createUnsafeRowReaderFactories() { return java.util.Arrays.asList( - new JavaUnsafeRowReadTask(0, 5), - new JavaUnsafeRowReadTask(5, 10)); + new JavaUnsafeRowDataReaderFactory(0, 5), + new JavaUnsafeRowDataReaderFactory(5, 10)); } } - static class JavaUnsafeRowReadTask implements ReadTask, DataReader { + static class JavaUnsafeRowDataReaderFactory + implements DataReaderFactory, DataReader { private int start; private int end; private UnsafeRow row; - JavaUnsafeRowReadTask(int start, int end) { + JavaUnsafeRowDataReaderFactory(int start, int end) { this.start = start; this.end = end; this.row = new UnsafeRow(2); @@ -59,7 +60,7 @@ static class JavaUnsafeRowReadTask implements ReadTask, DataReader createDataReader() { - return new JavaUnsafeRowReadTask(start - 1, end); + return new JavaUnsafeRowDataReaderFactory(start - 1, end); } @Override diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala index 85085d43061bd..d2cfe7905f6fa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala @@ -78,7 +78,7 @@ class RateSourceV2Suite extends StreamTest { val reader = new RateStreamMicroBatchReader( new DataSourceV2Options(Map("numPartitions" -> "11", "rowsPerSecond" -> "33").asJava)) reader.setOffsetRange(Optional.empty(), Optional.empty()) - val tasks = reader.createReadTasks() + val tasks = reader.createDataReaderFactories() assert(tasks.size == 11) } @@ -118,7 +118,7 @@ class RateSourceV2Suite extends StreamTest { val startOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(0, 1000)))) val endOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(20, 2000)))) reader.setOffsetRange(Optional.of(startOffset), Optional.of(endOffset)) - val tasks = reader.createReadTasks() + val tasks = reader.createDataReaderFactories() assert(tasks.size == 1) assert(tasks.get(0).asInstanceOf[RateStreamBatchTask].vals.size == 20) } @@ -133,7 +133,7 @@ class RateSourceV2Suite extends StreamTest { }.toMap) reader.setOffsetRange(Optional.of(startOffset), Optional.of(endOffset)) - val tasks = reader.createReadTasks() + val tasks = reader.createDataReaderFactories() assert(tasks.size == 11) val readData = tasks.asScala @@ -161,12 +161,12 @@ class RateSourceV2Suite extends StreamTest { val reader = new RateStreamContinuousReader( new DataSourceV2Options(Map("numPartitions" -> "2", "rowsPerSecond" -> "20").asJava)) reader.setOffset(Optional.empty()) - val tasks = reader.createReadTasks() + val tasks = reader.createDataReaderFactories() assert(tasks.size == 2) val data = scala.collection.mutable.ListBuffer[Row]() tasks.asScala.foreach { - case t: RateStreamContinuousReadTask => + case t: RateStreamContinuousDataReaderFactory => val startTimeMs = reader.getStartOffset() .asInstanceOf[RateStreamOffset] .partitionToValueAndRunTimeMs(t.partitionIndex) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index 0620693b35d16..42c5d3bcea44b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala @@ -204,18 +204,20 @@ class SimpleDataSourceV2 extends DataSourceV2 with ReadSupport { class Reader extends DataSourceV2Reader { override def readSchema(): StructType = new StructType().add("i", "int").add("j", "int") - override def createReadTasks(): JList[ReadTask[Row]] = { - java.util.Arrays.asList(new SimpleReadTask(0, 5), new SimpleReadTask(5, 10)) + override def createDataReaderFactories(): JList[DataReaderFactory[Row]] = { + java.util.Arrays.asList(new SimpleDataReaderFactory(0, 5), new SimpleDataReaderFactory(5, 10)) } } override def createReader(options: DataSourceV2Options): DataSourceV2Reader = new Reader } -class SimpleReadTask(start: Int, end: Int) extends ReadTask[Row] with DataReader[Row] { +class SimpleDataReaderFactory(start: Int, end: Int) + extends DataReaderFactory[Row] + with DataReader[Row] { private var current = start - 1 - override def createDataReader(): DataReader[Row] = new SimpleReadTask(start, end) + override def createDataReader(): 
DataReader[Row] = new SimpleDataReaderFactory(start, end) override def next(): Boolean = { current += 1 @@ -252,21 +254,21 @@ class AdvancedDataSourceV2 extends DataSourceV2 with ReadSupport { requiredSchema } - override def createReadTasks(): JList[ReadTask[Row]] = { + override def createDataReaderFactories(): JList[DataReaderFactory[Row]] = { val lowerBound = filters.collect { case GreaterThan("i", v: Int) => v }.headOption - val res = new ArrayList[ReadTask[Row]] + val res = new ArrayList[DataReaderFactory[Row]] if (lowerBound.isEmpty) { - res.add(new AdvancedReadTask(0, 5, requiredSchema)) - res.add(new AdvancedReadTask(5, 10, requiredSchema)) + res.add(new AdvancedDataReaderFactory(0, 5, requiredSchema)) + res.add(new AdvancedDataReaderFactory(5, 10, requiredSchema)) } else if (lowerBound.get < 4) { - res.add(new AdvancedReadTask(lowerBound.get + 1, 5, requiredSchema)) - res.add(new AdvancedReadTask(5, 10, requiredSchema)) + res.add(new AdvancedDataReaderFactory(lowerBound.get + 1, 5, requiredSchema)) + res.add(new AdvancedDataReaderFactory(5, 10, requiredSchema)) } else if (lowerBound.get < 9) { - res.add(new AdvancedReadTask(lowerBound.get + 1, 10, requiredSchema)) + res.add(new AdvancedDataReaderFactory(lowerBound.get + 1, 10, requiredSchema)) } res @@ -276,13 +278,13 @@ class AdvancedDataSourceV2 extends DataSourceV2 with ReadSupport { override def createReader(options: DataSourceV2Options): DataSourceV2Reader = new Reader } -class AdvancedReadTask(start: Int, end: Int, requiredSchema: StructType) - extends ReadTask[Row] with DataReader[Row] { +class AdvancedDataReaderFactory(start: Int, end: Int, requiredSchema: StructType) + extends DataReaderFactory[Row] with DataReader[Row] { private var current = start - 1 override def createDataReader(): DataReader[Row] = { - new AdvancedReadTask(start, end, requiredSchema) + new AdvancedDataReaderFactory(start, end, requiredSchema) } override def close(): Unit = {} @@ -307,16 +309,17 @@ class UnsafeRowDataSourceV2 extends DataSourceV2 with ReadSupport { class Reader extends DataSourceV2Reader with SupportsScanUnsafeRow { override def readSchema(): StructType = new StructType().add("i", "int").add("j", "int") - override def createUnsafeRowReadTasks(): JList[ReadTask[UnsafeRow]] = { - java.util.Arrays.asList(new UnsafeRowReadTask(0, 5), new UnsafeRowReadTask(5, 10)) + override def createUnsafeRowReaderFactories(): JList[DataReaderFactory[UnsafeRow]] = { + java.util.Arrays.asList(new UnsafeRowDataReaderFactory(0, 5), + new UnsafeRowDataReaderFactory(5, 10)) } } override def createReader(options: DataSourceV2Options): DataSourceV2Reader = new Reader } -class UnsafeRowReadTask(start: Int, end: Int) - extends ReadTask[UnsafeRow] with DataReader[UnsafeRow] { +class UnsafeRowDataReaderFactory(start: Int, end: Int) + extends DataReaderFactory[UnsafeRow] with DataReader[UnsafeRow] { private val row = new UnsafeRow(2) row.pointTo(new Array[Byte](8 * 3), 8 * 3) @@ -341,7 +344,7 @@ class UnsafeRowReadTask(start: Int, end: Int) class SchemaRequiredDataSource extends DataSourceV2 with ReadSupportWithSchema { class Reader(val readSchema: StructType) extends DataSourceV2Reader { - override def createReadTasks(): JList[ReadTask[Row]] = + override def createDataReaderFactories(): JList[DataReaderFactory[Row]] = java.util.Collections.emptyList() } @@ -354,16 +357,16 @@ class BatchDataSourceV2 extends DataSourceV2 with ReadSupport { class Reader extends DataSourceV2Reader with SupportsScanColumnarBatch { override def readSchema(): StructType = new 
StructType().add("i", "int").add("j", "int") - override def createBatchReadTasks(): JList[ReadTask[ColumnarBatch]] = { - java.util.Arrays.asList(new BatchReadTask(0, 50), new BatchReadTask(50, 90)) + override def createBatchDataReaderFactories(): JList[DataReaderFactory[ColumnarBatch]] = { + java.util.Arrays.asList(new BatchDataReaderFactory(0, 50), new BatchDataReaderFactory(50, 90)) } } override def createReader(options: DataSourceV2Options): DataSourceV2Reader = new Reader } -class BatchReadTask(start: Int, end: Int) - extends ReadTask[ColumnarBatch] with DataReader[ColumnarBatch] { +class BatchDataReaderFactory(start: Int, end: Int) + extends DataReaderFactory[ColumnarBatch] with DataReader[ColumnarBatch] { private final val BATCH_SIZE = 20 private lazy val i = new OnHeapColumnVector(BATCH_SIZE, IntegerType) @@ -406,11 +409,11 @@ class PartitionAwareDataSource extends DataSourceV2 with ReadSupport { class Reader extends DataSourceV2Reader with SupportsReportPartitioning { override def readSchema(): StructType = new StructType().add("a", "int").add("b", "int") - override def createReadTasks(): JList[ReadTask[Row]] = { + override def createDataReaderFactories(): JList[DataReaderFactory[Row]] = { // Note that we don't have same value of column `a` across partitions. java.util.Arrays.asList( - new SpecificReadTask(Array(1, 1, 3), Array(4, 4, 6)), - new SpecificReadTask(Array(2, 4, 4), Array(6, 2, 2))) + new SpecificDataReaderFactory(Array(1, 1, 3), Array(4, 4, 6)), + new SpecificDataReaderFactory(Array(2, 4, 4), Array(6, 2, 2))) } override def outputPartitioning(): Partitioning = new MyPartitioning @@ -428,7 +431,9 @@ class PartitionAwareDataSource extends DataSourceV2 with ReadSupport { override def createReader(options: DataSourceV2Options): DataSourceV2Reader = new Reader } -class SpecificReadTask(i: Array[Int], j: Array[Int]) extends ReadTask[Row] with DataReader[Row] { +class SpecificDataReaderFactory(i: Array[Int], j: Array[Int]) + extends DataReaderFactory[Row] + with DataReader[Row] { assert(i.length == j.length) private var current = -1 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala index cd7252eb2e3d6..3310d6dd199d6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala @@ -28,7 +28,7 @@ import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, Path} import org.apache.spark.SparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.sources.v2.reader.{DataReader, DataSourceV2Reader, ReadTask} +import org.apache.spark.sql.sources.v2.reader.{DataReader, DataReaderFactory, DataSourceV2Reader} import org.apache.spark.sql.sources.v2.writer._ import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.util.SerializableConfiguration @@ -45,7 +45,7 @@ class SimpleWritableDataSource extends DataSourceV2 with ReadSupport with WriteS class Reader(path: String, conf: Configuration) extends DataSourceV2Reader { override def readSchema(): StructType = schema - override def createReadTasks(): JList[ReadTask[Row]] = { + override def createDataReaderFactories(): JList[DataReaderFactory[Row]] = { val dataPath = new Path(path) val fs = dataPath.getFileSystem(conf) if (fs.exists(dataPath)) { @@ -54,7 +54,9 @@ class 
SimpleWritableDataSource extends DataSourceV2 with ReadSupport with WriteS
name.startsWith("_") || name.startsWith(".")
}.map { f =>
val serializableConf = new SerializableConfiguration(conf)
- new SimpleCSVReadTask(f.getPath.toUri.toString, serializableConf): ReadTask[Row]
+ new SimpleCSVDataReaderFactory(
+ f.getPath.toUri.toString,
+ serializableConf): DataReaderFactory[Row]
}.toList.asJava
} else {
Collections.emptyList()
@@ -149,8 +151,8 @@ class SimpleWritableDataSource extends DataSourceV2 with ReadSupport with WriteS
}
}
-class SimpleCSVReadTask(path: String, conf: SerializableConfiguration)
- extends ReadTask[Row] with DataReader[Row] {
+class SimpleCSVDataReaderFactory(path: String, conf: SerializableConfiguration)
+ extends DataReaderFactory[Row] with DataReader[Row] {
@transient private var lines: Iterator[String] = _
@transient private var currentLine: String = _
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala
index d4f8bae96695d..dc8c857018457 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala
@@ -26,7 +26,7 @@ import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider}
import org.apache.spark.sql.sources.v2.DataSourceV2Options
-import org.apache.spark.sql.sources.v2.reader.ReadTask
+import org.apache.spark.sql.sources.v2.reader.DataReaderFactory
import org.apache.spark.sql.sources.v2.streaming._
import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, MicroBatchReader, Offset, PartitionOffset}
import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter
@@ -45,7 +45,7 @@ case class FakeReader() extends MicroBatchReader with ContinuousReader {
def mergeOffsets(offsets: Array[PartitionOffset]): Offset = RateStreamOffset(Map())
def setOffset(start: Optional[Offset]): Unit = {}
- def createReadTasks(): java.util.ArrayList[ReadTask[Row]] = {
+ def createDataReaderFactories(): java.util.ArrayList[DataReaderFactory[Row]] = {
throw new IllegalStateException("fake source - cannot actually read")
}
}

From 4059454f979874caa9745861a2bcc60cac0bbffd Mon Sep 17 00:00:00 2001
From: caoxuewen
Date: Mon, 29 Jan 2018 08:56:42 -0800
Subject: [PATCH 205/566] [SPARK-23199][SQL] improved Removes repetition from group expressions in Aggregate

## What changes were proposed in this pull request?

Currently, all Aggregate operations go through RemoveRepetitionFromGroupExpressions, but when there are no grouping expressions, or no duplicate grouping expressions, there is no need to copy the logical plan.

## How was this patch tested?

The existing test cases.

Author: caoxuewen

Closes #20375 from heary-cao/RepetitionGroupExpressions.
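For context, a minimal PySpark sketch (not part of this patch; the DataFrame and column names are made up) of a query whose grouping expressions contain semantically duplicate entries, the only case in which the rule now needs to copy the plan:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, sum as sum_

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 2, 3), (4, 5, 6)], ["a", "b", "c"])

# "a + 1" and "1 + a" are semantically equal grouping expressions; the
# optimizer rule keeps only one of them. After this change the Aggregate
# plan is copied only when such duplicates are actually removed.
deduped = df.groupBy(df.a + 1, lit(1) + df.a).agg(sum_("c"))
deduped.explain(extended=True)
```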
(cherry picked from commit 54dd7cf4ef921bc9dc12f99cfb90d1da57939901)
Signed-off-by: gatorsmile
---
.../apache/spark/sql/catalyst/optimizer/Optimizer.scala | 8 ++++++--
.../sql/catalyst/optimizer/AggregateOptimizeSuite.scala | 5 ++---
2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 8d207708c12ad..a28b6a0feb8f9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -1302,8 +1302,12 @@ object RemoveLiteralFromGroupExpressions extends Rule[LogicalPlan] {
*/
object RemoveRepetitionFromGroupExpressions extends Rule[LogicalPlan] {
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
- case a @ Aggregate(grouping, _, _) =>
+ case a @ Aggregate(grouping, _, _) if grouping.size > 1 =>
val newGrouping = ExpressionSet(grouping).toSeq
- a.copy(groupingExpressions = newGrouping)
+ if (newGrouping.size == grouping.size) {
+ a
+ } else {
+ a.copy(groupingExpressions = newGrouping)
+ }
}
}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/AggregateOptimizeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/AggregateOptimizeSuite.scala
index a3184a4266c7c..f8ddc93597070 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/AggregateOptimizeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/AggregateOptimizeSuite.scala
@@ -67,10 +67,9 @@ class AggregateOptimizeSuite extends PlanTest {
}
test("remove repetition in grouping expression") {
- val input = LocalRelation('a.int, 'b.int, 'c.int)
- val query = input.groupBy('a + 1, 'b + 2, Literal(1) + 'A, Literal(2) + 'B)(sum('c))
+ val query = testRelation.groupBy('a + 1, 'b + 2, Literal(1) + 'A, Literal(2) + 'B)(sum('c))
val optimized = Optimize.execute(analyzer.execute(query))
- val correctAnswer = input.groupBy('a + 1, 'b + 2)(sum('c)).analyze
+ val correctAnswer = testRelation.groupBy('a + 1, 'b + 2)(sum('c)).analyze
comparePlans(optimized, correctAnswer)
}

From d68198d26e32ce98cbf0d3f8755d21dc72b3756d Mon Sep 17 00:00:00 2001
From: Herman van Hovell
Date: Mon, 29 Jan 2018 09:00:54 -0800
Subject: [PATCH 206/566] [SPARK-23223][SQL] Make stacking dataset transforms more performant

## What changes were proposed in this pull request?

It is a common pattern to apply multiple transforms to a `Dataset` (using `Dataset.withColumn`, for example). This is currently quite expensive because we run `CheckAnalysis` on the full plan and create an encoder for each intermediate `Dataset`.

This PR extends the usage of the `AnalysisBarrier` to include `CheckAnalysis`. By doing this we hide the already analyzed plan from `CheckAnalysis` because the barrier is a `LeafNode`. The `AnalysisBarrier` is in the `FinishAnalysis` phase of the optimizer.

We also make binding the `Dataset` encoder lazy. The bound encoder is only needed when we materialize the dataset.

## How was this patch tested?

Existing tests should cover this.

Author: Herman van Hovell

Closes #20402 from hvanhovell/SPARK-23223.
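As a rough, hypothetical illustration (not taken from the patch) of the transform-stacking pattern this change targets; before the change, each `withColumn` call below re-ran `CheckAnalysis` over the whole accumulated plan:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

# Build up a plan by stacking many small transforms on one DataFrame.
# With the AnalysisBarrier also covering CheckAnalysis, the already
# analyzed part of the plan is hidden from the check on each step.
df = spark.range(1000).toDF("id")
for i in range(50):
    df = df.withColumn("c{}".format(i), col("id") + i)

df.count()
```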
(cherry picked from commit 2d903cf9d3a827e54217dfc9f1e4be99d8204387) Signed-off-by: Herman van Hovell --- .../spark/sql/catalyst/analysis/Analyzer.scala | 16 ++++++++++++++-- .../sql/catalyst/analysis/CheckAnalysis.scala | 1 + .../sql/catalyst/analysis/AnalysisTest.scala | 3 +-- .../scala/org/apache/spark/sql/Dataset.scala | 8 ++++++-- .../spark/sql/execution/QueryExecution.scala | 16 ++-------------- .../apache/spark/sql/hive/test/TestHive.scala | 2 +- 6 files changed, 25 insertions(+), 21 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 2b14c8220d43b..91cb0365a0856 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -98,6 +98,19 @@ class Analyzer( this(catalog, conf, conf.optimizerMaxIterations) } + def executeAndCheck(plan: LogicalPlan): LogicalPlan = { + val analyzed = execute(plan) + try { + checkAnalysis(analyzed) + EliminateBarriers(analyzed) + } catch { + case e: AnalysisException => + val ae = new AnalysisException(e.message, e.line, e.startPosition, Option(analyzed)) + ae.setStackTrace(e.getStackTrace) + throw ae + } + } + override def execute(plan: LogicalPlan): LogicalPlan = { AnalysisContext.reset() try { @@ -178,8 +191,7 @@ class Analyzer( Batch("Subquery", Once, UpdateOuterReferences), Batch("Cleanup", fixedPoint, - CleanupAliases, - EliminateBarriers) + CleanupAliases) ) /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index bbcec5627bd49..0d189b4fd7743 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -348,6 +348,7 @@ trait CheckAnalysis extends PredicateHelper { } extendedCheckRules.foreach(_(plan)) plan.foreachUp { + case AnalysisBarrier(child) if !child.resolved => checkAnalysis(child) case o if !o.resolved => failAnalysis(s"unresolved operator ${o.simpleString}") case _ => } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala index 549a4355dfba3..3d7c91870133b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala @@ -54,8 +54,7 @@ trait AnalysisTest extends PlanTest { expectedPlan: LogicalPlan, caseSensitive: Boolean = true): Unit = { val analyzer = getAnalyzer(caseSensitive) - val actualPlan = analyzer.execute(inputPlan) - analyzer.checkAnalysis(actualPlan) + val actualPlan = analyzer.executeAndCheck(inputPlan) comparePlans(actualPlan, expectedPlan) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index edb6644ed5ac0..cc5b647b3f037 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -62,7 +62,11 @@ import org.apache.spark.util.Utils private[sql] object Dataset { def apply[T: Encoder](sparkSession: SparkSession, logicalPlan: LogicalPlan): Dataset[T] = { - new 
Dataset(sparkSession, logicalPlan, implicitly[Encoder[T]]) + val dataset = new Dataset(sparkSession, logicalPlan, implicitly[Encoder[T]]) + // Eagerly bind the encoder so we verify that the encoder matches the underlying + // schema. The user will get an error if this is not the case. + dataset.deserializer + dataset } def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = { @@ -204,7 +208,7 @@ class Dataset[T] private[sql]( // The deserializer expression which can be used to build a projection and turn rows to objects // of type T, after collecting rows to the driver side. - private val deserializer = + private lazy val deserializer = exprEnc.resolveAndBind(logicalPlan.output, sparkSession.sessionState.analyzer).deserializer private implicit def classTag = exprEnc.clsTag diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index 8bfe3eff0c3b3..7cae24bf5976c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -44,19 +44,7 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { // TODO: Move the planner an optimizer into here from SessionState. protected def planner = sparkSession.sessionState.planner - def assertAnalyzed(): Unit = { - // Analyzer is invoked outside the try block to avoid calling it again from within the - // catch block below. - analyzed - try { - sparkSession.sessionState.analyzer.checkAnalysis(analyzed) - } catch { - case e: AnalysisException => - val ae = new AnalysisException(e.message, e.line, e.startPosition, Option(analyzed)) - ae.setStackTrace(e.getStackTrace) - throw ae - } - } + def assertAnalyzed(): Unit = analyzed def assertSupported(): Unit = { if (sparkSession.sessionState.conf.isUnsupportedOperationCheckEnabled) { @@ -66,7 +54,7 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { lazy val analyzed: LogicalPlan = { SparkSession.setActiveSession(sparkSession) - sparkSession.sessionState.analyzer.execute(logical) + sparkSession.sessionState.analyzer.executeAndCheck(logical) } lazy val withCachedData: LogicalPlan = { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index 7287e20d55bbe..59708e7a0f2ff 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -575,7 +575,7 @@ private[hive] class TestHiveQueryExecution( logDebug(s"Query references test tables: ${referencedTestTables.mkString(", ")}") referencedTestTables.foreach(sparkSession.loadTestTable) // Proceed with analysis. - sparkSession.sessionState.analyzer.execute(logical) + sparkSession.sessionState.analyzer.executeAndCheck(logical) } } From 6588e007e8e10da7cc9771451eeb4d3a2bdc6e0e Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 29 Jan 2018 10:25:25 -0800 Subject: [PATCH 207/566] [SPARK-22221][DOCS] Adding User Documentation for Arrow ## What changes were proposed in this pull request? Adding user facing documentation for working with Arrow in Spark Author: Bryan Cutler Author: Li Jin Author: hyukjinkwon Closes #19575 from BryanCutler/arrow-user-docs-SPARK-2221. 
(cherry picked from commit 0d60b3213fe9a7ae5e9b208639f92011fdb2ca32) Signed-off-by: gatorsmile --- docs/sql-programming-guide.md | 134 +++++++++++++++++++++++++- examples/src/main/python/sql/arrow.py | 129 +++++++++++++++++++++++++ 2 files changed, 262 insertions(+), 1 deletion(-) create mode 100644 examples/src/main/python/sql/arrow.py diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 502c0a8c37e01..d49c8d869cba6 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1640,6 +1640,138 @@ Configuration of Hive is done by placing your `hive-site.xml`, `core-site.xml` a You may run `./bin/spark-sql --help` for a complete list of all available options. +# PySpark Usage Guide for Pandas with Apache Arrow + +## Apache Arrow in Spark + +Apache Arrow is an in-memory columnar data format that is used in Spark to efficiently transfer +data between JVM and Python processes. This currently is most beneficial to Python users that +work with Pandas/NumPy data. Its usage is not automatic and might require some minor +changes to configuration or code to take full advantage and ensure compatibility. This guide will +give a high-level description of how to use Arrow in Spark and highlight any differences when +working with Arrow-enabled data. + +### Ensure PyArrow Installed + +If you install PySpark using pip, then PyArrow can be brought in as an extra dependency of the +SQL module with the command `pip install pyspark[sql]`. Otherwise, you must ensure that PyArrow +is installed and available on all cluster nodes. The current supported version is 0.8.0. +You can install using pip or conda from the conda-forge channel. See PyArrow +[installation](https://arrow.apache.org/docs/python/install.html) for details. + +## Enabling for Conversion to/from Pandas + +Arrow is available as an optimization when converting a Spark DataFrame to a Pandas DataFrame +using the call `toPandas()` and when creating a Spark DataFrame from a Pandas DataFrame with +`createDataFrame(pandas_df)`. To use Arrow when executing these calls, users need to first set +the Spark configuration 'spark.sql.execution.arrow.enabled' to 'true'. This is disabled by default. + +
+<div data-lang="python" markdown="1">
+{% include_example dataframe_with_arrow python/sql/arrow.py %}
+</div>
+ +Using the above optimizations with Arrow will produce the same results as when Arrow is not +enabled. Note that even with Arrow, `toPandas()` results in the collection of all records in the +DataFrame to the driver program and should be done on a small subset of the data. Not all Spark +data types are currently supported and an error can be raised if a column has an unsupported type, +see [Supported Types](#supported-sql-arrow-types). If an error occurs during `createDataFrame()`, +Spark will fall back to create the DataFrame without Arrow. + +## Pandas UDFs (a.k.a. Vectorized UDFs) + +Pandas UDFs are user defined functions that are executed by Spark using Arrow to transfer data and +Pandas to work with the data. A Pandas UDF is defined using the keyword `pandas_udf` as a decorator +or to wrap the function, no additional configuration is required. Currently, there are two types of +Pandas UDF: Scalar and Group Map. + +### Scalar + +Scalar Pandas UDFs are used for vectorizing scalar operations. They can be used with functions such +as `select` and `withColumn`. The Python function should take `pandas.Series` as inputs and return +a `pandas.Series` of the same length. Internally, Spark will execute a Pandas UDF by splitting +columns into batches and calling the function for each batch as a subset of the data, then +concatenating the results together. + +The following example shows how to create a scalar Pandas UDF that computes the product of 2 columns. + +
+<div data-lang="python" markdown="1">
+{% include_example scalar_pandas_udf python/sql/arrow.py %}
+</div>
+</div>
+
+### Group Map
+Group map Pandas UDFs are used with `groupBy().apply()` which implements the "split-apply-combine" pattern.
+Split-apply-combine consists of three steps:
+* Split the data into groups by using `DataFrame.groupBy`.
+* Apply a function on each group. The input and output of the function are both `pandas.DataFrame`. The
+  input data contains all the rows and columns for each group.
+* Combine the results into a new `DataFrame`.
+
+To use `groupBy().apply()`, the user needs to define the following:
+* A Python function that defines the computation for each group.
+* A `StructType` object or a string that defines the schema of the output `DataFrame`.
+
+Note that all data for a group will be loaded into memory before the function is applied. This can
+lead to out of memory exceptions, especially if the group sizes are skewed. The configuration for
+[maxRecordsPerBatch](#setting-arrow-batch-size) is not applied on groups and it is up to the user
+to ensure that the grouped data will fit into the available memory.
+
+The following example shows how to use `groupby().apply()` to subtract the mean from each value in the group.
+
+<div class="codetabs">
+<div data-lang="python" markdown="1">
+{% include_example group_map_pandas_udf python/sql/arrow.py %}
+</div>
+ +For detailed usage, please see [`pyspark.sql.functions.pandas_udf`](api/python/pyspark.sql.html#pyspark.sql.functions.pandas_udf) and +[`pyspark.sql.GroupedData.apply`](api/python/pyspark.sql.html#pyspark.sql.GroupedData.apply). + +## Usage Notes + +### Supported SQL Types + +Currently, all Spark SQL data types are supported by Arrow-based conversion except `MapType`, +`ArrayType` of `TimestampType`, and nested `StructType`. + +### Setting Arrow Batch Size + +Data partitions in Spark are converted into Arrow record batches, which can temporarily lead to +high memory usage in the JVM. To avoid possible out of memory exceptions, the size of the Arrow +record batches can be adjusted by setting the conf "spark.sql.execution.arrow.maxRecordsPerBatch" +to an integer that will determine the maximum number of rows for each batch. The default value is +10,000 records per batch. If the number of columns is large, the value should be adjusted +accordingly. Using this limit, each data partition will be made into 1 or more record batches for +processing. + +### Timestamp with Time Zone Semantics + +Spark internally stores timestamps as UTC values, and timestamp data that is brought in without +a specified time zone is converted as local time to UTC with microsecond resolution. When timestamp +data is exported or displayed in Spark, the session time zone is used to localize the timestamp +values. The session time zone is set with the configuration 'spark.sql.session.timeZone' and will +default to the JVM system local time zone if not set. Pandas uses a `datetime64` type with nanosecond +resolution, `datetime64[ns]`, with optional time zone on a per-column basis. + +When timestamp data is transferred from Spark to Pandas it will be converted to nanoseconds +and each column will be converted to the Spark session time zone then localized to that time +zone, which removes the time zone and displays values as local time. This will occur +when calling `toPandas()` or `pandas_udf` with timestamp columns. + +When timestamp data is transferred from Pandas to Spark, it will be converted to UTC microseconds. This +occurs when calling `createDataFrame` with a Pandas DataFrame or when returning a timestamp from a +`pandas_udf`. These conversions are done automatically to ensure Spark will have data in the +expected format, so it is not necessary to do any of these conversions yourself. Any nanosecond +values will be truncated. + +Note that a standard UDF (non-Pandas) will load timestamp data as Python datetime objects, which is +different than a Pandas timestamp. It is recommended to use Pandas time series functionality when +working with timestamps in `pandas_udf`s to get the best performance, see +[here](https://pandas.pydata.org/pandas-docs/stable/timeseries.html) for details. + # Migration Guide ## Upgrading From Spark SQL 2.2 to 2.3 @@ -1788,7 +1920,7 @@ options. Note that, for DecimalType(38,0)*, the table above intentionally does not cover all other combinations of scales and precisions because currently we only infer decimal type like `BigInteger`/`BigInt`. For example, 1.1 is inferred as double type. - In PySpark, now we need Pandas 0.19.2 or upper if you want to use Pandas related functionalities, such as `toPandas`, `createDataFrame` from Pandas DataFrame, etc. - In PySpark, the behavior of timestamp values for Pandas related functionalities was changed to respect session timezone. 
If you want to use the old behavior, you need to set a configuration `spark.sql.execution.pandas.respectSessionTimeZone` to `False`. See [SPARK-22395](https://issues.apache.org/jira/browse/SPARK-22395) for details. - - In PySpark, `na.fill()` or `fillna` also accepts boolean and replaces nulls with booleans. In prior Spark versions, PySpark just ignores it and returns the original Dataset/DataFrame. + - In PySpark, `na.fill()` or `fillna` also accepts boolean and replaces nulls with booleans. In prior Spark versions, PySpark just ignores it and returns the original Dataset/DataFrame. - Since Spark 2.3, when either broadcast hash join or broadcast nested loop join is applicable, we prefer to broadcasting the table that is explicitly specified in a broadcast hint. For details, see the section [Broadcast Hint](#broadcast-hint-for-sql-queries) and [SPARK-22489](https://issues.apache.org/jira/browse/SPARK-22489). - Since Spark 2.3, when all inputs are binary, `functions.concat()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.concatBinaryAsString` to `true`. - Since Spark 2.3, when all inputs are binary, SQL `elt()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.eltOutputAsString` to `true`. diff --git a/examples/src/main/python/sql/arrow.py b/examples/src/main/python/sql/arrow.py new file mode 100644 index 0000000000000..6c0028b3f1c1f --- /dev/null +++ b/examples/src/main/python/sql/arrow.py @@ -0,0 +1,129 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +A simple example demonstrating Arrow in Spark. 
+Run with: + ./bin/spark-submit examples/src/main/python/sql/arrow.py +""" + +from __future__ import print_function + +from pyspark.sql import SparkSession +from pyspark.sql.utils import require_minimum_pandas_version, require_minimum_pyarrow_version + +require_minimum_pandas_version() +require_minimum_pyarrow_version() + + +def dataframe_with_arrow_example(spark): + # $example on:dataframe_with_arrow$ + import numpy as np + import pandas as pd + + # Enable Arrow-based columnar data transfers + spark.conf.set("spark.sql.execution.arrow.enabled", "true") + + # Generate a Pandas DataFrame + pdf = pd.DataFrame(np.random.rand(100, 3)) + + # Create a Spark DataFrame from a Pandas DataFrame using Arrow + df = spark.createDataFrame(pdf) + + # Convert the Spark DataFrame back to a Pandas DataFrame using Arrow + result_pdf = df.select("*").toPandas() + # $example off:dataframe_with_arrow$ + print("Pandas DataFrame result statistics:\n%s\n" % str(result_pdf.describe())) + + +def scalar_pandas_udf_example(spark): + # $example on:scalar_pandas_udf$ + import pandas as pd + + from pyspark.sql.functions import col, pandas_udf + from pyspark.sql.types import LongType + + # Declare the function and create the UDF + def multiply_func(a, b): + return a * b + + multiply = pandas_udf(multiply_func, returnType=LongType()) + + # The function for a pandas_udf should be able to execute with local Pandas data + x = pd.Series([1, 2, 3]) + print(multiply_func(x, x)) + # 0 1 + # 1 4 + # 2 9 + # dtype: int64 + + # Create a Spark DataFrame, 'spark' is an existing SparkSession + df = spark.createDataFrame(pd.DataFrame(x, columns=["x"])) + + # Execute function as a Spark vectorized UDF + df.select(multiply(col("x"), col("x"))).show() + # +-------------------+ + # |multiply_func(x, x)| + # +-------------------+ + # | 1| + # | 4| + # | 9| + # +-------------------+ + # $example off:scalar_pandas_udf$ + + +def group_map_pandas_udf_example(spark): + # $example on:group_map_pandas_udf$ + from pyspark.sql.functions import pandas_udf, PandasUDFType + + df = spark.createDataFrame( + [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], + ("id", "v")) + + @pandas_udf("id long, v double", PandasUDFType.GROUP_MAP) + def substract_mean(pdf): + # pdf is a pandas.DataFrame + v = pdf.v + return pdf.assign(v=v - v.mean()) + + df.groupby("id").apply(substract_mean).show() + # +---+----+ + # | id| v| + # +---+----+ + # | 1|-0.5| + # | 1| 0.5| + # | 2|-3.0| + # | 2|-1.0| + # | 2| 4.0| + # +---+----+ + # $example off:group_map_pandas_udf$ + + +if __name__ == "__main__": + spark = SparkSession \ + .builder \ + .appName("Python Arrow-in-Spark example") \ + .getOrCreate() + + print("Running Pandas to/from conversion example") + dataframe_with_arrow_example(spark) + print("Running pandas_udf scalar example") + scalar_pandas_udf_example(spark) + print("Running pandas_udf group map example") + group_map_pandas_udf_example(spark) + + spark.stop() From 438631031b2a7d79f8c639ef8ef0de1303bb9f2b Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Mon, 29 Jan 2018 10:29:42 -0800 Subject: [PATCH 208/566] [SPARK-22916][SQL][FOLLOW-UP] Update the Description of Join Selection ## What changes were proposed in this pull request? This PR is to update the description of the join algorithm changes. ## How was this patch tested? N/A Author: gatorsmile Closes #20420 from gatorsmile/followUp22916. 
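The revised scaladoc below describes how the broadcast hint and the `spark.sql.autoBroadcastJoinThreshold` setting steer join selection. As a rough, user-facing sketch of that behavior (not part of this patch; the DataFrames, sizes, and app name are invented for illustration):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

spark = SparkSession.builder.appName("broadcast-hint-sketch").getOrCreate()

# Two toy DataFrames; in practice the "small" side would be a dimension table.
large = spark.range(0, 1000000).withColumnRenamed("id", "key")
small = spark.range(0, 100).withColumnRenamed("id", "key")

# Explicit hint: ask Spark to broadcast the small side regardless of the
# autoBroadcastJoinThreshold estimate.
hinted = large.join(broadcast(small), "key")
hinted.explain()  # plan should contain BroadcastHashJoin

# Without a hint, the threshold decides; setting it to -1 disables automatic
# broadcasting, so this equi-join falls back to sort-merge join.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
unhinted = large.join(small, "key")
unhinted.explain()  # plan should contain SortMergeJoin
```
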
(cherry picked from commit e30b34f7bd9a687eb43d636fffeb98fe235fcbf4) Signed-off-by: gatorsmile --- .../spark/sql/execution/SparkStrategies.scala | 60 +++++++++++++++---- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 910294853c318..25436e1c8dd45 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -90,23 +90,58 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { * Select the proper physical plan for join based on joining keys and size of logical plan. * * At first, uses the [[ExtractEquiJoinKeys]] pattern to find joins where at least some of the - * predicates can be evaluated by matching join keys. If found, Join implementations are chosen + * predicates can be evaluated by matching join keys. If found, join implementations are chosen * with the following precedence: * - * - Broadcast: We prefer to broadcast the join side with an explicit broadcast hint(e.g. the - * user applied the [[org.apache.spark.sql.functions.broadcast()]] function to a DataFrame). - * If both sides have the broadcast hint, we prefer to broadcast the side with a smaller - * estimated physical size. If neither one of the sides has the broadcast hint, - * we only broadcast the join side if its estimated physical size that is smaller than - * the user-configurable [[SQLConf.AUTO_BROADCASTJOIN_THRESHOLD]] threshold. + * - Broadcast hash join (BHJ): + * BHJ is not supported for full outer join. For right outer join, we only can broadcast the + * left side. For left outer, left semi, left anti and the internal join type ExistenceJoin, + * we only can broadcast the right side. For inner like join, we can broadcast both sides. + * Normally, BHJ can perform faster than the other join algorithms when the broadcast side is + * small. However, broadcasting tables is a network-intensive operation. It could cause OOM + * or perform worse than the other join algorithms, especially when the build/broadcast side + * is big. + * + * For the supported cases, users can specify the broadcast hint (e.g. the user applied the + * [[org.apache.spark.sql.functions.broadcast()]] function to a DataFrame) and session-based + * [[SQLConf.AUTO_BROADCASTJOIN_THRESHOLD]] threshold to adjust whether BHJ is used and + * which join side is broadcast. + * + * 1) Broadcast the join side with the broadcast hint, even if the size is larger than + * [[SQLConf.AUTO_BROADCASTJOIN_THRESHOLD]]. If both sides have the hint (only when the type + * is inner like join), the side with a smaller estimated physical size will be broadcast. + * 2) Respect the [[SQLConf.AUTO_BROADCASTJOIN_THRESHOLD]] threshold and broadcast the side + * whose estimated physical size is smaller than the threshold. If both sides are below the + * threshold, broadcast the smaller side. If neither is smaller, BHJ is not used. + * * - Shuffle hash join: if the average size of a single partition is small enough to build a hash * table. + * * - Sort merge: if the matching join keys are sortable. 
* * If there is no joining keys, Join implementations are chosen with the following precedence: - * - BroadcastNestedLoopJoin: if one side of the join could be broadcasted - * - CartesianProduct: for Inner join - * - BroadcastNestedLoopJoin + * - BroadcastNestedLoopJoin (BNLJ): + * BNLJ supports all the join types but the impl is OPTIMIZED for the following scenarios: + * For right outer join, the left side is broadcast. For left outer, left semi, left anti + * and the internal join type ExistenceJoin, the right side is broadcast. For inner like + * joins, either side is broadcast. + * + * Like BHJ, users still can specify the broadcast hint and session-based + * [[SQLConf.AUTO_BROADCASTJOIN_THRESHOLD]] threshold to impact which side is broadcast. + * + * 1) Broadcast the join side with the broadcast hint, even if the size is larger than + * [[SQLConf.AUTO_BROADCASTJOIN_THRESHOLD]]. If both sides have the hint (i.e., just for + * inner-like join), the side with a smaller estimated physical size will be broadcast. + * 2) Respect the [[SQLConf.AUTO_BROADCASTJOIN_THRESHOLD]] threshold and broadcast the side + * whose estimated physical size is smaller than the threshold. If both sides are below the + * threshold, broadcast the smaller side. If neither is smaller, BNLJ is not used. + * + * - CartesianProduct: for inner like join, CartesianProduct is the fallback option. + * + * - BroadcastNestedLoopJoin (BNLJ): + * For the other join types, BNLJ is the fallback option. Here, we just pick the broadcast + * side with the broadcast hint. If neither side has a hint, we broadcast the side with + * the smaller estimated physical size. */ object JoinSelection extends Strategy with PredicateHelper { @@ -139,8 +174,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { } private def canBuildRight(joinType: JoinType): Boolean = joinType match { - case _: InnerLike | LeftOuter | LeftSemi | LeftAnti => true - case j: ExistenceJoin => true + case _: InnerLike | LeftOuter | LeftSemi | LeftAnti | _: ExistenceJoin => true case _ => false } @@ -243,7 +277,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { // --- Without joining keys ------------------------------------------------------------ - // Pick BroadcastNestedLoopJoin if one side could be broadcasted + // Pick BroadcastNestedLoopJoin if one side could be broadcast case j @ logical.Join(left, right, joinType, condition) if canBroadcastByHints(joinType, left, right) => val buildSide = broadcastSideByHints(joinType, left, right) From 75131ee867bca17ca3aade5f832f1d49b7cfcff5 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 29 Jan 2018 16:09:14 -0600 Subject: [PATCH 209/566] [SPARK-23209][core] Allow credential manager to work when Hive not available. The JVM seems to be doing early binding of classes that the Hive provider depends on, causing an error to be thrown before it was caught by the code in the class. The fix wraps the creation of the provider in a try..catch so that the provider can be ignored when dependencies are missing. Added a unit test (which fails without the fix), and also tested that getting tokens still works in a real cluster. Author: Marcelo Vanzin Closes #20399 from vanzin/SPARK-23209. 
(cherry picked from commit b834446ec1338349f6d974afd96f677db3e8fd1a) Signed-off-by: Imran Rashid --- .../HadoopDelegationTokenManager.scala | 17 +++++- .../HadoopDelegationTokenManagerSuite.scala | 58 +++++++++++++++++++ 2 files changed, 72 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala b/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala index 116a686fe1480..5151df00476f9 100644 --- a/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala +++ b/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala @@ -64,9 +64,9 @@ private[spark] class HadoopDelegationTokenManager( } private def getDelegationTokenProviders: Map[String, HadoopDelegationTokenProvider] = { - val providers = List(new HadoopFSDelegationTokenProvider(fileSystems), - new HiveDelegationTokenProvider, - new HBaseDelegationTokenProvider) + val providers = Seq(new HadoopFSDelegationTokenProvider(fileSystems)) ++ + safeCreateProvider(new HiveDelegationTokenProvider) ++ + safeCreateProvider(new HBaseDelegationTokenProvider) // Filter out providers for which spark.security.credentials.{service}.enabled is false. providers @@ -75,6 +75,17 @@ private[spark] class HadoopDelegationTokenManager( .toMap } + private def safeCreateProvider( + createFn: => HadoopDelegationTokenProvider): Option[HadoopDelegationTokenProvider] = { + try { + Some(createFn) + } catch { + case t: Throwable => + logDebug(s"Failed to load built in provider.", t) + None + } + } + def isServiceEnabled(serviceName: String): Boolean = { val key = providerEnabledConfig.format(serviceName) diff --git a/core/src/test/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManagerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManagerSuite.scala index eeffc36070b44..2849a10a2c81e 100644 --- a/core/src/test/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManagerSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.deploy.security +import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.security.Credentials @@ -110,7 +111,64 @@ class HadoopDelegationTokenManagerSuite extends SparkFunSuite with Matchers { creds.getAllTokens.size should be (0) } + test("SPARK-23209: obtain tokens when Hive classes are not available") { + // This test needs a custom class loader to hide Hive classes which are in the classpath. + // Because the manager code loads the Hive provider directly instead of using reflection, we + // need to drive the test through the custom class loader so a new copy that cannot find + // Hive classes is loaded. 
+ val currentLoader = Thread.currentThread().getContextClassLoader() + val noHive = new ClassLoader() { + override def loadClass(name: String, resolve: Boolean): Class[_] = { + if (name.startsWith("org.apache.hive") || name.startsWith("org.apache.hadoop.hive")) { + throw new ClassNotFoundException(name) + } + + if (name.startsWith("java") || name.startsWith("scala")) { + currentLoader.loadClass(name) + } else { + val classFileName = name.replaceAll("\\.", "/") + ".class" + val in = currentLoader.getResourceAsStream(classFileName) + if (in != null) { + val bytes = IOUtils.toByteArray(in) + defineClass(name, bytes, 0, bytes.length) + } else { + throw new ClassNotFoundException(name) + } + } + } + } + + try { + Thread.currentThread().setContextClassLoader(noHive) + val test = noHive.loadClass(NoHiveTest.getClass.getName().stripSuffix("$")) + test.getMethod("runTest").invoke(null) + } finally { + Thread.currentThread().setContextClassLoader(currentLoader) + } + } + private[spark] def hadoopFSsToAccess(hadoopConf: Configuration): Set[FileSystem] = { Set(FileSystem.get(hadoopConf)) } } + +/** Test code for SPARK-23209 to avoid using too much reflection above. */ +private object NoHiveTest extends Matchers { + + def runTest(): Unit = { + try { + val manager = new HadoopDelegationTokenManager(new SparkConf(), new Configuration(), + _ => Set()) + manager.getServiceDelegationTokenProvider("hive") should be (None) + } catch { + case e: Throwable => + // Throw a better exception in case the test fails, since there may be a lot of nesting. + var cause = e + while (cause.getCause() != null) { + cause = cause.getCause() + } + throw cause + } + } + +} From 2858eaafaf06d3b8c55a8a5ed7831260244932cd Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 29 Jan 2018 17:37:55 -0800 Subject: [PATCH 210/566] [SPARK-22221][SQL][FOLLOWUP] Externalize spark.sql.execution.arrow.maxRecordsPerBatch ## What changes were proposed in this pull request? This is a followup to #19575 which added a section on setting max Arrow record batches and this will externalize the conf that was referenced in the docs. ## How was this patch tested? NA Author: Bryan Cutler Closes #20423 from BryanCutler/arrow-user-doc-externalize-maxRecordsPerBatch-SPARK-22221. (cherry picked from commit f235df66a4754cbb64d5b7b5cfd5a52bdd243b8a) Signed-off-by: gatorsmile --- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 61ea03d395afc..54a35594f505e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1051,7 +1051,6 @@ object SQLConf { val ARROW_EXECUTION_MAX_RECORDS_PER_BATCH = buildConf("spark.sql.execution.arrow.maxRecordsPerBatch") - .internal() .doc("When using Apache Arrow, limit the maximum number of records that can be written " + "to a single ArrowRecordBatch in memory. If set to zero or negative there is no limit.") .intConf From a81ace196ec41b1b3f739294c965df270ee2ddd2 Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Tue, 30 Jan 2018 11:40:42 +0800 Subject: [PATCH 211/566] [SPARK-23207][SQL][FOLLOW-UP] Don't perform local sort for DataFrame.repartition(1) ## What changes were proposed in this pull request? 
In `ShuffleExchangeExec`, we don't need to insert extra local sort before round-robin partitioning, if the new partitioning has only 1 partition, because under that case all output rows go to the same partition. ## How was this patch tested? The existing test cases. Author: Xingbo Jiang Closes #20426 from jiangxb1987/repartition1. (cherry picked from commit b375397b1678b7fe20a0b7f87a7e8b37ae5646ef) Signed-off-by: Wenchen Fan --- .../spark/sql/execution/exchange/ShuffleExchangeExec.scala | 4 ++++ .../spark/sql/execution/streaming/ForeachSinkSuite.scala | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala index 76c1fa65f924b..4d95ee34f30de 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala @@ -257,7 +257,11 @@ object ShuffleExchangeExec { // // Currently we following the most straight-forward way that perform a local sort before // partitioning. + // + // Note that we don't perform local sort if the new partitioning has only 1 partition, under + // that case all output rows go to the same partition. val newRdd = if (SQLConf.get.sortBeforeRepartition && + newPartitioning.numPartitions > 1 && newPartitioning.isInstanceOf[RoundRobinPartitioning]) { rdd.mapPartitionsInternal { iter => val recordComparatorSupplier = new Supplier[RecordComparator] { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/ForeachSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/ForeachSinkSuite.scala index 1248c670df45c..41434e6d8b974 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/ForeachSinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/ForeachSinkSuite.scala @@ -162,7 +162,7 @@ class ForeachSinkSuite extends StreamTest with SharedSQLContext with BeforeAndAf val allEvents = ForeachSinkSuite.allEvents() assert(allEvents.size === 1) assert(allEvents(0)(0) === ForeachSinkSuite.Open(partition = 0, version = 0)) - assert(allEvents(0)(1) === ForeachSinkSuite.Process(value = 2)) + assert(allEvents(0)(1) === ForeachSinkSuite.Process(value = 1)) // `close` should be called with the error val errorEvent = allEvents(0)(2).asInstanceOf[ForeachSinkSuite.Close] From bb7502f9a506d52365d7532b3b0281098dd85763 Mon Sep 17 00:00:00 2001 From: Henry Robinson Date: Mon, 29 Jan 2018 22:19:59 -0800 Subject: [PATCH 212/566] [SPARK-23157][SQL] Explain restriction on column expression in withColumn() ## What changes were proposed in this pull request? It's not obvious from the comments that any added column must be a function of the dataset that we are adding it to. Add a comment to that effect to Scala, Python and R Data* methods. Author: Henry Robinson Closes #20429 from henryr/SPARK-23157. 
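The restriction being documented here is easy to show with a short PySpark sketch (illustrative only, not part of the patch; the DataFrame and column names are made up):

```python
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

spark = SparkSession.builder.appName("withColumn-restriction-sketch").getOrCreate()

people = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
scores = spark.createDataFrame([(1, 10.0), (2, 20.0)], ["id", "score"])

# Fine: the new column is an expression over `people` itself.
people.withColumn("id_plus_one", people["id"] + 1).show()

# Error: the expression refers to a different DataFrame, so analysis fails.
try:
    people.withColumn("score", scores["score"]).show()
except AnalysisException as err:
    print("Failed as expected: %s" % err)

# When the value lives in another DataFrame, join instead.
people.join(scores, "id").show()
```
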
(cherry picked from commit 8b983243e45dfe2617c043a3229a7d87f4c4b44b) Signed-off-by: gatorsmile --- R/pkg/R/DataFrame.R | 3 ++- python/pyspark/sql/dataframe.py | 4 ++++ sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 3 +++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 29f3e986eaab6..547b5ea48a555 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2090,7 +2090,8 @@ setMethod("selectExpr", #' #' @param x a SparkDataFrame. #' @param colName a column name. -#' @param col a Column expression, or an atomic vector in the length of 1 as literal value. +#' @param col a Column expression (which must refer only to this DataFrame), or an atomic vector in +#' the length of 1 as literal value. #' @return A SparkDataFrame with the new column added or the existing column replaced. #' @family SparkDataFrame functions #' @aliases withColumn,SparkDataFrame,character-method diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index ac403080acfdf..055b2c4a0ffec 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1829,11 +1829,15 @@ def withColumn(self, colName, col): Returns a new :class:`DataFrame` by adding a column or replacing the existing column that has the same name. + The column expression must be an expression over this dataframe; attempting to add + a column from some other dataframe will raise an error. + :param colName: string, name of the new column. :param col: a :class:`Column` expression for the new column. >>> df.withColumn('age2', df.age + 2).collect() [Row(age=2, name=u'Alice', age2=4), Row(age=5, name=u'Bob', age2=7)] + """ assert isinstance(col, Column), "col should be Column" return DataFrame(self._jdf.withColumn(colName, col._jc), self.sql_ctx) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index cc5b647b3f037..d47cd0aecf56a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -2150,6 +2150,9 @@ class Dataset[T] private[sql]( * Returns a new Dataset by adding a column or replacing the existing column that has * the same name. * + * `column`'s expression must only refer to attributes supplied by this Dataset. It is an + * error to add a column that refers to some other Dataset. + * * @group untypedrel * @since 2.0.0 */ From 107d4e293867af42c87cbc1a93d14c5492c2ba84 Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 30 Jan 2018 09:02:16 +0200 Subject: [PATCH 213/566] [SPARK-23138][ML][DOC] Multiclass logistic regression summary example and user guide ## What changes were proposed in this pull request? User guide and examples are updated to reflect multiclass logistic regression summary which was added in [SPARK-17139](https://issues.apache.org/jira/browse/SPARK-17139). I did not make a separate summary example, but added the summary code to the multiclass example that already existed. I don't see the need for a separate example for the summary. ## How was this patch tested? Docs and examples only. Ran all examples locally using spark-submit. Author: sethah Closes #20332 from sethah/multiclass_summary_example. 
(cherry picked from commit 5056877e8bea56dd0f4dc9e3385669e1e78b2925) Signed-off-by: Nick Pentreath --- docs/ml-classification-regression.md | 22 +++---- .../JavaLogisticRegressionSummaryExample.java | 17 ++--- ...gisticRegressionWithElasticNetExample.java | 62 +++++++++++++++++++ ...ss_logistic_regression_with_elastic_net.py | 38 ++++++++++++ .../ml/LogisticRegressionSummaryExample.scala | 15 ++--- ...isticRegressionWithElasticNetExample.scala | 43 +++++++++++++ 6 files changed, 164 insertions(+), 33 deletions(-) diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md index bf979f3c73a52..ddd2f4b49ca07 100644 --- a/docs/ml-classification-regression.md +++ b/docs/ml-classification-regression.md @@ -87,7 +87,7 @@ More details on parameters can be found in the [R API documentation](api/R/spark The `spark.ml` implementation of logistic regression also supports extracting a summary of the model over the training set. Note that the predictions and metrics which are stored as `DataFrame` in -`BinaryLogisticRegressionSummary` are annotated `@transient` and hence +`LogisticRegressionSummary` are annotated `@transient` and hence only available on the driver.
@@ -97,10 +97,9 @@ only available on the driver. [`LogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionTrainingSummary) provides a summary for a [`LogisticRegressionModel`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionModel). -Currently, only binary classification is supported and the -summary must be explicitly cast to -[`BinaryLogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary). -This will likely change when multiclass classification is supported. +In the case of binary classification, certain additional metrics are +available, e.g. ROC curve. The binary summary can be accessed via the +`binarySummary` method. See [`BinaryLogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary). Continuing the earlier example: @@ -111,10 +110,9 @@ Continuing the earlier example: [`LogisticRegressionTrainingSummary`](api/java/org/apache/spark/ml/classification/LogisticRegressionTrainingSummary.html) provides a summary for a [`LogisticRegressionModel`](api/java/org/apache/spark/ml/classification/LogisticRegressionModel.html). -Currently, only binary classification is supported and the -summary must be explicitly cast to -[`BinaryLogisticRegressionTrainingSummary`](api/java/org/apache/spark/ml/classification/BinaryLogisticRegressionTrainingSummary.html). -Support for multiclass model summaries will be added in the future. +In the case of binary classification, certain additional metrics are +available, e.g. ROC curve. The binary summary can be accessed via the +`binarySummary` method. See [`BinaryLogisticRegressionTrainingSummary`](api/java/org/apache/spark/ml/classification/BinaryLogisticRegressionTrainingSummary.html). Continuing the earlier example: @@ -125,7 +123,8 @@ Continuing the earlier example: [`LogisticRegressionTrainingSummary`](api/python/pyspark.ml.html#pyspark.ml.classification.LogisticRegressionSummary) provides a summary for a [`LogisticRegressionModel`](api/python/pyspark.ml.html#pyspark.ml.classification.LogisticRegressionModel). -Currently, only binary classification is supported. Support for multiclass model summaries will be added in the future. +In the case of binary classification, certain additional metrics are +available, e.g. ROC curve. See [`BinaryLogisticRegressionTrainingSummary`](api/python/pyspark.ml.html#pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary). Continuing the earlier example: @@ -162,7 +161,8 @@ For a detailed derivation please see [here](https://en.wikipedia.org/wiki/Multin **Examples** The following example shows how to train a multiclass logistic regression -model with elastic net regularization. +model with elastic net regularization, as well as extract the multiclass +training summary for evaluating the model.
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java index dee56799d8aee..1529da16f051f 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java @@ -18,10 +18,9 @@ package org.apache.spark.examples.ml; // $example on$ -import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary; +import org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary; import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.classification.LogisticRegressionModel; -import org.apache.spark.ml.classification.LogisticRegressionTrainingSummary; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; @@ -50,7 +49,7 @@ public static void main(String[] args) { // $example on$ // Extract the summary from the returned LogisticRegressionModel instance trained in the earlier // example - LogisticRegressionTrainingSummary trainingSummary = lrModel.summary(); + BinaryLogisticRegressionTrainingSummary trainingSummary = lrModel.binarySummary(); // Obtain the loss per iteration. double[] objectiveHistory = trainingSummary.objectiveHistory(); @@ -58,21 +57,15 @@ public static void main(String[] args) { System.out.println(lossPerIteration); } - // Obtain the metrics useful to judge performance on test data. - // We cast the summary to a BinaryLogisticRegressionSummary since the problem is a binary - // classification problem. - BinaryLogisticRegressionSummary binarySummary = - (BinaryLogisticRegressionSummary) trainingSummary; - // Obtain the receiver-operating characteristic as a dataframe and areaUnderROC. - Dataset roc = binarySummary.roc(); + Dataset roc = trainingSummary.roc(); roc.show(); roc.select("FPR").show(); - System.out.println(binarySummary.areaUnderROC()); + System.out.println(trainingSummary.areaUnderROC()); // Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with // this selected threshold. 
- Dataset fMeasure = binarySummary.fMeasureByThreshold(); + Dataset fMeasure = trainingSummary.fMeasureByThreshold(); double maxFMeasure = fMeasure.select(functions.max("F-Measure")).head().getDouble(0); double bestThreshold = fMeasure.where(fMeasure.col("F-Measure").equalTo(maxFMeasure)) .select("threshold").head().getDouble(0); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMulticlassLogisticRegressionWithElasticNetExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMulticlassLogisticRegressionWithElasticNetExample.java index da410cba2b3f1..801a82cd2f24f 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMulticlassLogisticRegressionWithElasticNetExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMulticlassLogisticRegressionWithElasticNetExample.java @@ -20,6 +20,7 @@ // $example on$ import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.classification.LogisticRegressionModel; +import org.apache.spark.ml.classification.LogisticRegressionTrainingSummary; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; @@ -48,6 +49,67 @@ public static void main(String[] args) { // Print the coefficients and intercept for multinomial logistic regression System.out.println("Coefficients: \n" + lrModel.coefficientMatrix() + " \nIntercept: " + lrModel.interceptVector()); + LogisticRegressionTrainingSummary trainingSummary = lrModel.summary(); + + // Obtain the loss per iteration. + double[] objectiveHistory = trainingSummary.objectiveHistory(); + for (double lossPerIteration : objectiveHistory) { + System.out.println(lossPerIteration); + } + + // for multiclass, we can inspect metrics on a per-label basis + System.out.println("False positive rate by label:"); + int i = 0; + double[] fprLabel = trainingSummary.falsePositiveRateByLabel(); + for (double fpr : fprLabel) { + System.out.println("label " + i + ": " + fpr); + i++; + } + + System.out.println("True positive rate by label:"); + i = 0; + double[] tprLabel = trainingSummary.truePositiveRateByLabel(); + for (double tpr : tprLabel) { + System.out.println("label " + i + ": " + tpr); + i++; + } + + System.out.println("Precision by label:"); + i = 0; + double[] precLabel = trainingSummary.precisionByLabel(); + for (double prec : precLabel) { + System.out.println("label " + i + ": " + prec); + i++; + } + + System.out.println("Recall by label:"); + i = 0; + double[] recLabel = trainingSummary.recallByLabel(); + for (double rec : recLabel) { + System.out.println("label " + i + ": " + rec); + i++; + } + + System.out.println("F-measure by label:"); + i = 0; + double[] fLabel = trainingSummary.fMeasureByLabel(); + for (double f : fLabel) { + System.out.println("label " + i + ": " + f); + i++; + } + + double accuracy = trainingSummary.accuracy(); + double falsePositiveRate = trainingSummary.weightedFalsePositiveRate(); + double truePositiveRate = trainingSummary.weightedTruePositiveRate(); + double fMeasure = trainingSummary.weightedFMeasure(); + double precision = trainingSummary.weightedPrecision(); + double recall = trainingSummary.weightedRecall(); + System.out.println("Accuracy: " + accuracy); + System.out.println("FPR: " + falsePositiveRate); + System.out.println("TPR: " + truePositiveRate); + System.out.println("F-measure: " + fMeasure); + System.out.println("Precision: " + precision); + System.out.println("Recall: " + recall); // $example off$ spark.stop(); diff --git 
a/examples/src/main/python/ml/multiclass_logistic_regression_with_elastic_net.py b/examples/src/main/python/ml/multiclass_logistic_regression_with_elastic_net.py index bb9cd82d6ba27..bec9860c79a2d 100644 --- a/examples/src/main/python/ml/multiclass_logistic_regression_with_elastic_net.py +++ b/examples/src/main/python/ml/multiclass_logistic_regression_with_elastic_net.py @@ -43,6 +43,44 @@ # Print the coefficients and intercept for multinomial logistic regression print("Coefficients: \n" + str(lrModel.coefficientMatrix)) print("Intercept: " + str(lrModel.interceptVector)) + + trainingSummary = lrModel.summary + + # Obtain the objective per iteration + objectiveHistory = trainingSummary.objectiveHistory + print("objectiveHistory:") + for objective in objectiveHistory: + print(objective) + + # for multiclass, we can inspect metrics on a per-label basis + print("False positive rate by label:") + for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel): + print("label %d: %s" % (i, rate)) + + print("True positive rate by label:") + for i, rate in enumerate(trainingSummary.truePositiveRateByLabel): + print("label %d: %s" % (i, rate)) + + print("Precision by label:") + for i, prec in enumerate(trainingSummary.precisionByLabel): + print("label %d: %s" % (i, prec)) + + print("Recall by label:") + for i, rec in enumerate(trainingSummary.recallByLabel): + print("label %d: %s" % (i, rec)) + + print("F-measure by label:") + for i, f in enumerate(trainingSummary.fMeasureByLabel()): + print("label %d: %s" % (i, f)) + + accuracy = trainingSummary.accuracy + falsePositiveRate = trainingSummary.weightedFalsePositiveRate + truePositiveRate = trainingSummary.weightedTruePositiveRate + fMeasure = trainingSummary.weightedFMeasure() + precision = trainingSummary.weightedPrecision + recall = trainingSummary.weightedRecall + print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s" + % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall)) # $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala index 1740a0d3f9d12..0368dcba460b5 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala @@ -19,7 +19,7 @@ package org.apache.spark.examples.ml // $example on$ -import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression} +import org.apache.spark.ml.classification.LogisticRegression // $example off$ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions.max @@ -47,25 +47,20 @@ object LogisticRegressionSummaryExample { // $example on$ // Extract the summary from the returned LogisticRegressionModel instance trained in the earlier // example - val trainingSummary = lrModel.summary + val trainingSummary = lrModel.binarySummary // Obtain the objective per iteration. val objectiveHistory = trainingSummary.objectiveHistory println("objectiveHistory:") objectiveHistory.foreach(loss => println(loss)) - // Obtain the metrics useful to judge performance on test data. - // We cast the summary to a BinaryLogisticRegressionSummary since the problem is a - // binary classification problem. 
- val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary] - // Obtain the receiver-operating characteristic as a dataframe and areaUnderROC. - val roc = binarySummary.roc + val roc = trainingSummary.roc roc.show() - println(s"areaUnderROC: ${binarySummary.areaUnderROC}") + println(s"areaUnderROC: ${trainingSummary.areaUnderROC}") // Set the model threshold to maximize F-Measure - val fMeasure = binarySummary.fMeasureByThreshold + val fMeasure = trainingSummary.fMeasureByThreshold val maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0) val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure) .select("threshold").head().getDouble(0) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MulticlassLogisticRegressionWithElasticNetExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MulticlassLogisticRegressionWithElasticNetExample.scala index 3e61dbe628c20..1f7dbddd454e8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MulticlassLogisticRegressionWithElasticNetExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MulticlassLogisticRegressionWithElasticNetExample.scala @@ -49,6 +49,49 @@ object MulticlassLogisticRegressionWithElasticNetExample { // Print the coefficients and intercept for multinomial logistic regression println(s"Coefficients: \n${lrModel.coefficientMatrix}") println(s"Intercepts: \n${lrModel.interceptVector}") + + val trainingSummary = lrModel.summary + + // Obtain the objective per iteration + val objectiveHistory = trainingSummary.objectiveHistory + println("objectiveHistory:") + objectiveHistory.foreach(println) + + // for multiclass, we can inspect metrics on a per-label basis + println("False positive rate by label:") + trainingSummary.falsePositiveRateByLabel.zipWithIndex.foreach { case (rate, label) => + println(s"label $label: $rate") + } + + println("True positive rate by label:") + trainingSummary.truePositiveRateByLabel.zipWithIndex.foreach { case (rate, label) => + println(s"label $label: $rate") + } + + println("Precision by label:") + trainingSummary.precisionByLabel.zipWithIndex.foreach { case (prec, label) => + println(s"label $label: $prec") + } + + println("Recall by label:") + trainingSummary.recallByLabel.zipWithIndex.foreach { case (rec, label) => + println(s"label $label: $rec") + } + + + println("F-measure by label:") + trainingSummary.fMeasureByLabel.zipWithIndex.foreach { case (f, label) => + println(s"label $label: $f") + } + + val accuracy = trainingSummary.accuracy + val falsePositiveRate = trainingSummary.weightedFalsePositiveRate + val truePositiveRate = trainingSummary.weightedTruePositiveRate + val fMeasure = trainingSummary.weightedFMeasure + val precision = trainingSummary.weightedPrecision + val recall = trainingSummary.weightedRecall + println(s"Accuracy: $accuracy\nFPR: $falsePositiveRate\nTPR: $truePositiveRate\n" + + s"F-measure: $fMeasure\nPrecision: $precision\nRecall: $recall") // $example off$ spark.stop() From d3e623b19231e6d59793b86afa01f169fb2dedb2 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 30 Jan 2018 19:43:17 +0800 Subject: [PATCH 214/566] [SPARK-23260][SPARK-23262][SQL] several data source v2 naming cleanup ## What changes were proposed in this pull request? All other classes in the reader/writer package doesn't have `V2` in their names, and the streaming reader/writer don't have `V2` either. It's more consistent to remove `V2` from `DataSourceV2Reader` and `DataSourceVWriter`. 
Also rename `DataSourceV2Option` to remote the `V2`, we should only have `V2` in the root interface: `DataSourceV2`. This PR also fixes some places that the mix-in interface doesn't extend the interface it aimed to mix in. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #20427 from cloud-fan/ds-v2. (cherry picked from commit 0a9ac0248b6514a1e83ff7e4c522424f01b8b78d) Signed-off-by: Wenchen Fan --- .../sql/kafka010/KafkaContinuousReader.scala | 2 +- .../sql/kafka010/KafkaSourceProvider.scala | 6 ++--- ...eV2Options.java => DataSourceOptions.java} | 8 +++---- .../spark/sql/sources/v2/ReadSupport.java | 8 +++---- .../sql/sources/v2/ReadSupportWithSchema.java | 8 +++---- .../sql/sources/v2/SessionConfigSupport.java | 2 +- .../spark/sql/sources/v2/WriteSupport.java | 12 +++++----- .../sources/v2/reader/DataReaderFactory.java | 2 +- ...rceV2Reader.java => DataSourceReader.java} | 11 +++++---- .../SupportsPushDownCatalystFilters.java | 4 ++-- .../v2/reader/SupportsPushDownFilters.java | 4 ++-- .../SupportsPushDownRequiredColumns.java | 6 ++--- .../v2/reader/SupportsReportPartitioning.java | 4 ++-- .../v2/reader/SupportsReportStatistics.java | 4 ++-- .../v2/reader/SupportsScanColumnarBatch.java | 6 ++--- .../v2/reader/SupportsScanUnsafeRow.java | 6 ++--- .../v2/streaming/ContinuousReadSupport.java | 4 ++-- .../v2/streaming/MicroBatchReadSupport.java | 4 ++-- .../v2/streaming/StreamWriteSupport.java | 10 ++++---- .../v2/streaming/reader/ContinuousReader.java | 6 ++--- .../v2/streaming/reader/MicroBatchReader.java | 6 ++--- .../v2/streaming/writer/StreamWriter.java | 6 ++--- ...rceV2Writer.java => DataSourceWriter.java} | 8 +++---- .../sql/sources/v2/writer/DataWriter.java | 12 +++++----- .../sources/v2/writer/DataWriterFactory.java | 2 +- .../v2/writer/SupportsWriteInternalRow.java | 4 ++-- .../v2/writer/WriterCommitMessage.java | 4 ++-- .../apache/spark/sql/DataFrameReader.scala | 2 +- .../apache/spark/sql/DataFrameWriter.scala | 2 +- .../v2/DataSourceReaderHolder.scala | 2 +- .../datasources/v2/DataSourceV2Relation.scala | 6 ++--- .../datasources/v2/DataSourceV2ScanExec.scala | 2 +- .../datasources/v2/WriteToDataSourceV2.scala | 4 ++-- .../streaming/MicroBatchExecution.scala | 6 ++--- .../streaming/RateSourceProvider.scala | 2 +- .../sql/execution/streaming/console.scala | 4 ++-- .../continuous/ContinuousExecution.scala | 6 ++--- .../ContinuousRateStreamSource.scala | 7 +++--- .../streaming/sources/ConsoleWriter.scala | 4 ++-- .../streaming/sources/MicroBatchWriter.scala | 8 +++---- .../sources/PackedRowWriterFactory.scala | 4 ++-- .../sources/RateStreamSourceV2.scala | 6 ++--- .../streaming/sources/memoryV2.scala | 6 ++--- .../sql/streaming/DataStreamReader.scala | 4 ++-- .../sources/v2/JavaAdvancedDataSourceV2.java | 6 ++--- .../sql/sources/v2/JavaBatchDataSourceV2.java | 6 ++--- .../v2/JavaPartitionAwareDataSource.java | 6 ++--- .../v2/JavaSchemaRequiredDataSource.java | 8 +++---- .../sources/v2/JavaSimpleDataSourceV2.java | 8 +++---- .../sources/v2/JavaUnsafeRowDataSourceV2.java | 6 ++--- .../streaming/RateSourceV2Suite.scala | 18 +++++++------- ...ite.scala => DataSourceOptionsSuite.scala} | 16 ++++++------- .../sql/sources/v2/DataSourceV2Suite.scala | 24 +++++++++---------- .../sources/v2/SimpleWritableDataSource.scala | 12 +++++----- .../sources/StreamingDataSourceV2Suite.scala | 8 +++---- 55 files changed, 176 insertions(+), 176 deletions(-) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{DataSourceV2Options.java => DataSourceOptions.java} (94%) 
rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/{DataSourceV2Reader.java => DataSourceReader.java} (91%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/{DataSourceV2Writer.java => DataSourceWriter.java} (96%) rename sql/core/src/test/scala/org/apache/spark/sql/sources/v2/{DataSourceV2OptionsSuite.scala => DataSourceOptionsSuite.scala} (80%) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala index 9125cf5799d74..8c733426b256f 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala @@ -41,7 +41,7 @@ import org.apache.spark.unsafe.types.UTF8String * @param offsetReader a reader used to get kafka offsets. Note that the actual data will be * read by per-task consumers generated later. * @param kafkaParams String params for per-task Kafka consumers. - * @param sourceOptions The [[org.apache.spark.sql.sources.v2.DataSourceV2Options]] params which + * @param sourceOptions The [[org.apache.spark.sql.sources.v2.DataSourceOptions]] params which * are not Kafka consumer params. * @param metadataPath Path to a directory this reader can use for writing metadata. * @param initialOffsets The Kafka offsets to start reading data at. diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala index 2deb7fa2cdf1e..85e96b6783327 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala @@ -30,7 +30,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SparkSession, SQLContext} import org.apache.spark.sql.execution.streaming.{Sink, Source} import org.apache.spark.sql.sources._ -import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.DataSourceOptions import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, StreamWriteSupport} import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter import org.apache.spark.sql.streaming.OutputMode @@ -109,7 +109,7 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister override def createContinuousReader( schema: Optional[StructType], metadataPath: String, - options: DataSourceV2Options): KafkaContinuousReader = { + options: DataSourceOptions): KafkaContinuousReader = { val parameters = options.asMap().asScala.toMap validateStreamOptions(parameters) // Each running query should use its own group id. 
Otherwise, the query may be only assigned @@ -227,7 +227,7 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister queryId: String, schema: StructType, mode: OutputMode, - options: DataSourceV2Options): StreamWriter = { + options: DataSourceOptions): StreamWriter = { import scala.collection.JavaConverters._ val spark = SparkSession.getActiveSession.get diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceV2Options.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceOptions.java similarity index 94% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceV2Options.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceOptions.java index ddc2acca693ac..c32053580f016 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceV2Options.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceOptions.java @@ -29,18 +29,18 @@ * data source options. */ @InterfaceStability.Evolving -public class DataSourceV2Options { +public class DataSourceOptions { private final Map keyLowerCasedMap; private String toLowerCase(String key) { return key.toLowerCase(Locale.ROOT); } - public static DataSourceV2Options empty() { - return new DataSourceV2Options(new HashMap<>()); + public static DataSourceOptions empty() { + return new DataSourceOptions(new HashMap<>()); } - public DataSourceV2Options(Map originalMap) { + public DataSourceOptions(Map originalMap) { keyLowerCasedMap = new HashMap<>(originalMap.size()); for (Map.Entry entry : originalMap.entrySet()) { keyLowerCasedMap.put(toLowerCase(entry.getKey()), entry.getValue()); diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupport.java index 948e20bacf4a2..0ea4dc6b5def3 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupport.java @@ -18,17 +18,17 @@ package org.apache.spark.sql.sources.v2; import org.apache.spark.annotation.InterfaceStability; -import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader; +import org.apache.spark.sql.sources.v2.reader.DataSourceReader; /** * A mix-in interface for {@link DataSourceV2}. Data sources can implement this interface to * provide data reading ability and scan the data from the data source. */ @InterfaceStability.Evolving -public interface ReadSupport { +public interface ReadSupport extends DataSourceV2 { /** - * Creates a {@link DataSourceV2Reader} to scan the data from this data source. + * Creates a {@link DataSourceReader} to scan the data from this data source. * * If this method fails (by throwing an exception), the action would fail and no Spark job was * submitted. @@ -36,5 +36,5 @@ public interface ReadSupport { * @param options the options for the returned data source reader, which is an immutable * case-insensitive string-to-string map. 
*/ - DataSourceV2Reader createReader(DataSourceV2Options options); + DataSourceReader createReader(DataSourceOptions options); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupportWithSchema.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupportWithSchema.java index b69c6bed8d1b5..3801402268af1 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupportWithSchema.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupportWithSchema.java @@ -18,7 +18,7 @@ package org.apache.spark.sql.sources.v2; import org.apache.spark.annotation.InterfaceStability; -import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader; +import org.apache.spark.sql.sources.v2.reader.DataSourceReader; import org.apache.spark.sql.types.StructType; /** @@ -30,10 +30,10 @@ * supports both schema inference and user-specified schema. */ @InterfaceStability.Evolving -public interface ReadSupportWithSchema { +public interface ReadSupportWithSchema extends DataSourceV2 { /** - * Create a {@link DataSourceV2Reader} to scan the data from this data source. + * Create a {@link DataSourceReader} to scan the data from this data source. * * If this method fails (by throwing an exception), the action would fail and no Spark job was * submitted. @@ -45,5 +45,5 @@ public interface ReadSupportWithSchema { * @param options the options for the returned data source reader, which is an immutable * case-insensitive string-to-string map. */ - DataSourceV2Reader createReader(StructType schema, DataSourceV2Options options); + DataSourceReader createReader(StructType schema, DataSourceOptions options); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/SessionConfigSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/SessionConfigSupport.java index 3cb020d2e0836..9d66805d79b9e 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/SessionConfigSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/SessionConfigSupport.java @@ -25,7 +25,7 @@ * session. */ @InterfaceStability.Evolving -public interface SessionConfigSupport { +public interface SessionConfigSupport extends DataSourceV2 { /** * Key prefix of the session configs to propagate. Spark will extract all session configs that diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/WriteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/WriteSupport.java index 1e3b644d8c4ae..cab56453816cc 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/WriteSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/WriteSupport.java @@ -21,7 +21,7 @@ import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer; +import org.apache.spark.sql.sources.v2.writer.DataSourceWriter; import org.apache.spark.sql.types.StructType; /** @@ -29,17 +29,17 @@ * provide data writing ability and save the data to the data source. */ @InterfaceStability.Evolving -public interface WriteSupport { +public interface WriteSupport extends DataSourceV2 { /** - * Creates an optional {@link DataSourceV2Writer} to save the data to this data source. Data + * Creates an optional {@link DataSourceWriter} to save the data to this data source. Data * sources can return None if there is no writing needed to be done according to the save mode. 
* * If this method fails (by throwing an exception), the action would fail and no Spark job was * submitted. * * @param jobId A unique string for the writing job. It's possible that there are many writing - * jobs running at the same time, and the returned {@link DataSourceV2Writer} can + * jobs running at the same time, and the returned {@link DataSourceWriter} can * use this job id to distinguish itself from other jobs. * @param schema the schema of the data to be written. * @param mode the save mode which determines what to do when the data are already in this data @@ -47,6 +47,6 @@ public interface WriteSupport { * @param options the options for the returned data source writer, which is an immutable * case-insensitive string-to-string map. */ - Optional createWriter( - String jobId, StructType schema, SaveMode mode, DataSourceV2Options options); + Optional createWriter( + String jobId, StructType schema, SaveMode mode, DataSourceOptions options); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReaderFactory.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReaderFactory.java index 077b95b837964..32e98e8f5d8bd 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReaderFactory.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReaderFactory.java @@ -22,7 +22,7 @@ import org.apache.spark.annotation.InterfaceStability; /** - * A reader factory returned by {@link DataSourceV2Reader#createDataReaderFactories()} and is + * A reader factory returned by {@link DataSourceReader#createDataReaderFactories()} and is * responsible for creating the actual data reader. The relationship between * {@link DataReaderFactory} and {@link DataReader} * is similar to the relationship between {@link Iterable} and {@link java.util.Iterator}. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceReader.java similarity index 91% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceReader.java index 0180cd9ea47f8..a470bccc5aad2 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceReader.java @@ -21,14 +21,15 @@ import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.Row; +import org.apache.spark.sql.sources.v2.DataSourceOptions; +import org.apache.spark.sql.sources.v2.ReadSupport; +import org.apache.spark.sql.sources.v2.ReadSupportWithSchema; import org.apache.spark.sql.types.StructType; /** * A data source reader that is returned by - * {@link org.apache.spark.sql.sources.v2.ReadSupport#createReader( - * org.apache.spark.sql.sources.v2.DataSourceV2Options)} or - * {@link org.apache.spark.sql.sources.v2.ReadSupportWithSchema#createReader( - * StructType, org.apache.spark.sql.sources.v2.DataSourceV2Options)}. + * {@link ReadSupport#createReader(DataSourceOptions)} or + * {@link ReadSupportWithSchema#createReader(StructType, DataSourceOptions)}. * It can mix in various query optimization interfaces to speed up the data scan. The actual scan * logic is delegated to {@link DataReaderFactory}s that are returned by * {@link #createDataReaderFactories()}. 
@@ -52,7 +53,7 @@ * issues the scan request and does the actual data reading. */ @InterfaceStability.Evolving -public interface DataSourceV2Reader { +public interface DataSourceReader { /** * Returns the actual schema of this data source reader, which may be different from the physical diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownCatalystFilters.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownCatalystFilters.java index f76c687f450c8..98224102374aa 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownCatalystFilters.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownCatalystFilters.java @@ -21,7 +21,7 @@ import org.apache.spark.sql.catalyst.expressions.Expression; /** - * A mix-in interface for {@link DataSourceV2Reader}. Data source readers can implement this + * A mix-in interface for {@link DataSourceReader}. Data source readers can implement this * interface to push down arbitrary expressions as predicates to the data source. * This is an experimental and unstable interface as {@link Expression} is not public and may get * changed in the future Spark versions. @@ -31,7 +31,7 @@ * process this interface. */ @InterfaceStability.Unstable -public interface SupportsPushDownCatalystFilters { +public interface SupportsPushDownCatalystFilters extends DataSourceReader { /** * Pushes down filters, and returns unsupported filters. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java index 6b0c9d417eeae..f35c711b0387a 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java @@ -21,7 +21,7 @@ import org.apache.spark.sql.sources.Filter; /** - * A mix-in interface for {@link DataSourceV2Reader}. Data source readers can implement this + * A mix-in interface for {@link DataSourceReader}. Data source readers can implement this * interface to push down filters to the data source and reduce the size of the data to be read. * * Note that, if data source readers implement both this interface and @@ -29,7 +29,7 @@ * {@link SupportsPushDownCatalystFilters}. */ @InterfaceStability.Evolving -public interface SupportsPushDownFilters { +public interface SupportsPushDownFilters extends DataSourceReader { /** * Pushes down filters, and returns unsupported filters. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownRequiredColumns.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownRequiredColumns.java index fe0ac8ee0ee32..427b4d00a1128 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownRequiredColumns.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownRequiredColumns.java @@ -21,12 +21,12 @@ import org.apache.spark.sql.types.StructType; /** - * A mix-in interface for {@link DataSourceV2Reader}. Data source readers can implement this + * A mix-in interface for {@link DataSourceReader}. Data source readers can implement this * interface to push down required columns to the data source and only read these columns during * scan to reduce the size of the data to be read. 
*/ @InterfaceStability.Evolving -public interface SupportsPushDownRequiredColumns { +public interface SupportsPushDownRequiredColumns extends DataSourceReader { /** * Applies column pruning w.r.t. the given requiredSchema. @@ -35,7 +35,7 @@ public interface SupportsPushDownRequiredColumns { * also OK to do the pruning partially, e.g., a data source may not be able to prune nested * fields, and only prune top-level columns. * - * Note that, data source readers should update {@link DataSourceV2Reader#readSchema()} after + * Note that, data source readers should update {@link DataSourceReader#readSchema()} after * applying column pruning. */ void pruneColumns(StructType requiredSchema); diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java index f786472ccf345..a2383a9d7d680 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java @@ -20,11 +20,11 @@ import org.apache.spark.annotation.InterfaceStability; /** - * A mix in interface for {@link DataSourceV2Reader}. Data source readers can implement this + * A mix in interface for {@link DataSourceReader}. Data source readers can implement this * interface to report data partitioning and try to avoid shuffle at Spark side. */ @InterfaceStability.Evolving -public interface SupportsReportPartitioning { +public interface SupportsReportPartitioning extends DataSourceReader { /** * Returns the output data partitioning that this reader guarantees. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportStatistics.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportStatistics.java index c019d2f819ab7..11bb13fd3b211 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportStatistics.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportStatistics.java @@ -20,11 +20,11 @@ import org.apache.spark.annotation.InterfaceStability; /** - * A mix in interface for {@link DataSourceV2Reader}. Data source readers can implement this + * A mix in interface for {@link DataSourceReader}. Data source readers can implement this * interface to report statistics to Spark. */ @InterfaceStability.Evolving -public interface SupportsReportStatistics { +public interface SupportsReportStatistics extends DataSourceReader { /** * Returns the basic statistics of this data source. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanColumnarBatch.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanColumnarBatch.java index 67da55554bbf3..2e5cfa78511f0 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanColumnarBatch.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanColumnarBatch.java @@ -24,11 +24,11 @@ import org.apache.spark.sql.vectorized.ColumnarBatch; /** - * A mix-in interface for {@link DataSourceV2Reader}. Data source readers can implement this + * A mix-in interface for {@link DataSourceReader}. Data source readers can implement this * interface to output {@link ColumnarBatch} and make the scan faster. 
*/ @InterfaceStability.Evolving -public interface SupportsScanColumnarBatch extends DataSourceV2Reader { +public interface SupportsScanColumnarBatch extends DataSourceReader { @Override default List> createDataReaderFactories() { throw new IllegalStateException( @@ -36,7 +36,7 @@ default List> createDataReaderFactories() { } /** - * Similar to {@link DataSourceV2Reader#createDataReaderFactories()}, but returns columnar data + * Similar to {@link DataSourceReader#createDataReaderFactories()}, but returns columnar data * in batches. */ List> createBatchDataReaderFactories(); diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java index 156af69520f77..9cd749e8e4ce9 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java @@ -24,13 +24,13 @@ import org.apache.spark.sql.catalyst.expressions.UnsafeRow; /** - * A mix-in interface for {@link DataSourceV2Reader}. Data source readers can implement this + * A mix-in interface for {@link DataSourceReader}. Data source readers can implement this * interface to output {@link UnsafeRow} directly and avoid the row copy at Spark side. * This is an experimental and unstable interface, as {@link UnsafeRow} is not public and may get * changed in the future Spark versions. */ @InterfaceStability.Unstable -public interface SupportsScanUnsafeRow extends DataSourceV2Reader { +public interface SupportsScanUnsafeRow extends DataSourceReader { @Override default List> createDataReaderFactories() { @@ -39,7 +39,7 @@ default List> createDataReaderFactories() { } /** - * Similar to {@link DataSourceV2Reader#createDataReaderFactories()}, + * Similar to {@link DataSourceReader#createDataReaderFactories()}, * but returns data in unsafe row format. 
*/ List> createUnsafeRowReaderFactories(); diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java index 9a93a806b0efc..f79424e036a52 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java @@ -21,7 +21,7 @@ import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.sources.v2.DataSourceV2; -import org.apache.spark.sql.sources.v2.DataSourceV2Options; +import org.apache.spark.sql.sources.v2.DataSourceOptions; import org.apache.spark.sql.sources.v2.streaming.reader.ContinuousReader; import org.apache.spark.sql.types.StructType; @@ -44,5 +44,5 @@ public interface ContinuousReadSupport extends DataSourceV2 { ContinuousReader createContinuousReader( Optional schema, String checkpointLocation, - DataSourceV2Options options); + DataSourceOptions options); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchReadSupport.java index 3b357c01a29fe..22660e42ad850 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchReadSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchReadSupport.java @@ -20,8 +20,8 @@ import java.util.Optional; import org.apache.spark.annotation.InterfaceStability; +import org.apache.spark.sql.sources.v2.DataSourceOptions; import org.apache.spark.sql.sources.v2.DataSourceV2; -import org.apache.spark.sql.sources.v2.DataSourceV2Options; import org.apache.spark.sql.sources.v2.streaming.reader.MicroBatchReader; import org.apache.spark.sql.types.StructType; @@ -50,5 +50,5 @@ public interface MicroBatchReadSupport extends DataSourceV2 { MicroBatchReader createMicroBatchReader( Optional schema, String checkpointLocation, - DataSourceV2Options options); + DataSourceOptions options); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/StreamWriteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/StreamWriteSupport.java index 6cd219c67109a..7c5f304425093 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/StreamWriteSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/StreamWriteSupport.java @@ -19,10 +19,10 @@ import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.execution.streaming.BaseStreamingSink; +import org.apache.spark.sql.sources.v2.DataSourceOptions; import org.apache.spark.sql.sources.v2.DataSourceV2; -import org.apache.spark.sql.sources.v2.DataSourceV2Options; import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter; -import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer; +import org.apache.spark.sql.sources.v2.writer.DataSourceWriter; import org.apache.spark.sql.streaming.OutputMode; import org.apache.spark.sql.types.StructType; @@ -31,7 +31,7 @@ * provide data writing ability for structured streaming. */ @InterfaceStability.Evolving -public interface StreamWriteSupport extends BaseStreamingSink { +public interface StreamWriteSupport extends DataSourceV2, BaseStreamingSink { /** * Creates an optional {@link StreamWriter} to save the data to this data source. 
Data @@ -39,7 +39,7 @@ public interface StreamWriteSupport extends BaseStreamingSink { * * @param queryId A unique string for the writing query. It's possible that there are many * writing queries running at the same time, and the returned - * {@link DataSourceV2Writer} can use this id to distinguish itself from others. + * {@link DataSourceWriter} can use this id to distinguish itself from others. * @param schema the schema of the data to be written. * @param mode the output mode which determines what successive epoch output means to this * sink, please refer to {@link OutputMode} for more details. @@ -50,5 +50,5 @@ StreamWriter createStreamWriter( String queryId, StructType schema, OutputMode mode, - DataSourceV2Options options); + DataSourceOptions options); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java index 3ac979cb0b7b4..6e5177ee83a62 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java @@ -19,12 +19,12 @@ import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.execution.streaming.BaseStreamingSource; -import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader; +import org.apache.spark.sql.sources.v2.reader.DataSourceReader; import java.util.Optional; /** - * A mix-in interface for {@link DataSourceV2Reader}. Data source readers can implement this + * A mix-in interface for {@link DataSourceReader}. Data source readers can implement this * interface to allow reading in a continuous processing mode stream. * * Implementations must ensure each reader factory output is a {@link ContinuousDataReader}. @@ -33,7 +33,7 @@ * DataSource V1 APIs. This extension will be removed once we get rid of V1 completely. */ @InterfaceStability.Evolving -public interface ContinuousReader extends BaseStreamingSource, DataSourceV2Reader { +public interface ContinuousReader extends BaseStreamingSource, DataSourceReader { /** * Merge partitioned offsets coming from {@link ContinuousDataReader} instances for each * partition to a single global offset. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java index 68887e569fc1d..fcec446d892f5 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java @@ -18,20 +18,20 @@ package org.apache.spark.sql.sources.v2.streaming.reader; import org.apache.spark.annotation.InterfaceStability; -import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader; +import org.apache.spark.sql.sources.v2.reader.DataSourceReader; import org.apache.spark.sql.execution.streaming.BaseStreamingSource; import java.util.Optional; /** - * A mix-in interface for {@link DataSourceV2Reader}. Data source readers can implement this + * A mix-in interface for {@link DataSourceReader}. Data source readers can implement this * interface to indicate they allow micro-batch streaming reads. * * Note: This class currently extends {@link BaseStreamingSource} to maintain compatibility with * DataSource V1 APIs. 
This extension will be removed once we get rid of V1 completely. */ @InterfaceStability.Evolving -public interface MicroBatchReader extends DataSourceV2Reader, BaseStreamingSource { +public interface MicroBatchReader extends DataSourceReader, BaseStreamingSource { /** * Set the desired offset range for reader factories created from this reader. Reader factories * will generate only data within (`start`, `end`]; that is, from the first record after `start` diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/writer/StreamWriter.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/writer/StreamWriter.java index 3156c88933e5e..915ee6c4fb390 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/writer/StreamWriter.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/writer/StreamWriter.java @@ -18,19 +18,19 @@ package org.apache.spark.sql.sources.v2.streaming.writer; import org.apache.spark.annotation.InterfaceStability; -import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer; +import org.apache.spark.sql.sources.v2.writer.DataSourceWriter; import org.apache.spark.sql.sources.v2.writer.DataWriter; import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage; /** - * A {@link DataSourceV2Writer} for use with structured streaming. This writer handles commits and + * A {@link DataSourceWriter} for use with structured streaming. This writer handles commits and * aborts relative to an epoch ID determined by the execution engine. * * {@link DataWriter} implementations generated by a StreamWriter may be reused for multiple epochs, * and so must reset any internal state after a successful commit. */ @InterfaceStability.Evolving -public interface StreamWriter extends DataSourceV2Writer { +public interface StreamWriter extends DataSourceWriter { /** * Commits this writing job for the specified epoch with a list of commit messages. The commit * messages are collected from successful data writers and are produced by diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceWriter.java similarity index 96% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceWriter.java index 8048f507a1dca..d89d27d0e5b1b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceWriter.java @@ -20,16 +20,16 @@ import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.Row; import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.sources.v2.DataSourceV2Options; +import org.apache.spark.sql.sources.v2.DataSourceOptions; import org.apache.spark.sql.sources.v2.WriteSupport; import org.apache.spark.sql.streaming.OutputMode; import org.apache.spark.sql.types.StructType; /** * A data source writer that is returned by - * {@link WriteSupport#createWriter(String, StructType, SaveMode, DataSourceV2Options)}/ + * {@link WriteSupport#createWriter(String, StructType, SaveMode, DataSourceOptions)}/ * {@link org.apache.spark.sql.sources.v2.streaming.StreamWriteSupport#createStreamWriter( - * String, StructType, OutputMode, DataSourceV2Options)}. + * String, StructType, OutputMode, DataSourceOptions)}. 
* It can mix in various writing optimization interfaces to speed up the data saving. The actual * writing logic is delegated to {@link DataWriter}. * @@ -52,7 +52,7 @@ * Please refer to the documentation of commit/abort methods for detailed specifications. */ @InterfaceStability.Evolving -public interface DataSourceV2Writer { +public interface DataSourceWriter { /** * Creates a writer factory which will be serialized and sent to executors. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriter.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriter.java index 04b03e63de500..53941a89ba94e 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriter.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriter.java @@ -33,11 +33,11 @@ * * If this data writer succeeds(all records are successfully written and {@link #commit()} * succeeds), a {@link WriterCommitMessage} will be sent to the driver side and pass to - * {@link DataSourceV2Writer#commit(WriterCommitMessage[])} with commit messages from other data + * {@link DataSourceWriter#commit(WriterCommitMessage[])} with commit messages from other data * writers. If this data writer fails(one record fails to write or {@link #commit()} fails), an * exception will be sent to the driver side, and Spark will retry this writing task for some times, * each time {@link DataWriterFactory#createDataWriter(int, int)} gets a different `attemptNumber`, - * and finally call {@link DataSourceV2Writer#abort(WriterCommitMessage[])} if all retry fail. + * and finally call {@link DataSourceWriter#abort(WriterCommitMessage[])} if all retry fail. * * Besides the retry mechanism, Spark may launch speculative tasks if the existing writing task * takes too long to finish. Different from retried tasks, which are launched one by one after the @@ -69,11 +69,11 @@ public interface DataWriter { /** * Commits this writer after all records are written successfully, returns a commit message which * will be sent back to driver side and passed to - * {@link DataSourceV2Writer#commit(WriterCommitMessage[])}. + * {@link DataSourceWriter#commit(WriterCommitMessage[])}. * * The written data should only be visible to data source readers after - * {@link DataSourceV2Writer#commit(WriterCommitMessage[])} succeeds, which means this method - * should still "hide" the written data and ask the {@link DataSourceV2Writer} at driver side to + * {@link DataSourceWriter#commit(WriterCommitMessage[])} succeeds, which means this method + * should still "hide" the written data and ask the {@link DataSourceWriter} at driver side to * do the final commit via {@link WriterCommitMessage}. * * If this method fails (by throwing an exception), {@link #abort()} will be called and this @@ -91,7 +91,7 @@ public interface DataWriter { * failed. * * If this method fails(by throwing an exception), the underlying data source may have garbage - * that need to be cleaned by {@link DataSourceV2Writer#abort(WriterCommitMessage[])} or manually, + * that need to be cleaned by {@link DataSourceWriter#abort(WriterCommitMessage[])} or manually, * but these garbage should not be visible to data source readers. * * @throws IOException if failure happens during disk/network IO like writing files. 
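To make the commit protocol described in the `DataWriter` javadoc above concrete, here is a minimal Scala sketch built against the renamed writer-side interfaces (`WriteSupport`, `DataSourceWriter`, `DataWriterFactory`, `DataWriter`, `WriterCommitMessage`). The `CountingDataSource`, `CountingWriter`, `CountingWriterFactory` and `CountCommitMessage` names are invented for illustration and are not part of this patch:

```scala
import java.util.Optional

import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, WriteSupport}
import org.apache.spark.sql.sources.v2.writer._
import org.apache.spark.sql.types.StructType

// Hypothetical sink that just counts records; names are illustrative only.
class CountingDataSource extends DataSourceV2 with WriteSupport {
  override def createWriter(
      jobId: String,
      schema: StructType,
      mode: SaveMode,
      options: DataSourceOptions): Optional[DataSourceWriter] = {
    Optional.of(new CountingWriter)
  }
}

class CountingWriter extends DataSourceWriter {
  override def createWriterFactory(): DataWriterFactory[Row] = new CountingWriterFactory

  // Driver-side commit: aggregates the per-partition messages; only here does
  // the write become "visible" in the sense described by the javadoc above.
  override def commit(messages: Array[WriterCommitMessage]): Unit = {
    val total = messages.collect { case CountCommitMessage(n) => n }.sum
    println(s"committed $total records")
  }

  override def abort(messages: Array[WriterCommitMessage]): Unit = ()
}

case class CountCommitMessage(count: Long) extends WriterCommitMessage

class CountingWriterFactory extends DataWriterFactory[Row] {
  // Called on executors, once per partition/attempt.
  override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[Row] =
    new DataWriter[Row] {
      private var count = 0L
      override def write(record: Row): Unit = { count += 1 }
      override def commit(): WriterCommitMessage = CountCommitMessage(count)
      override def abort(): Unit = ()
    }
}
```

Each partition-level `DataWriter` stays free of externally visible effects until the driver-side `DataSourceWriter.commit` receives all `WriterCommitMessage`s, which is exactly the contract the javadoc spells out.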
diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriterFactory.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriterFactory.java index 18ec792f5a2c9..ea95442511ce5 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriterFactory.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriterFactory.java @@ -22,7 +22,7 @@ import org.apache.spark.annotation.InterfaceStability; /** - * A factory of {@link DataWriter} returned by {@link DataSourceV2Writer#createWriterFactory()}, + * A factory of {@link DataWriter} returned by {@link DataSourceWriter#createWriterFactory()}, * which is responsible for creating and initializing the actual data writer at executor side. * * Note that, the writer factory will be serialized and sent to executors, then the data writer diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/SupportsWriteInternalRow.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/SupportsWriteInternalRow.java index 3e0518814f458..d2cf7e01c08c8 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/SupportsWriteInternalRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/SupportsWriteInternalRow.java @@ -22,14 +22,14 @@ import org.apache.spark.sql.catalyst.InternalRow; /** - * A mix-in interface for {@link DataSourceV2Writer}. Data source writers can implement this + * A mix-in interface for {@link DataSourceWriter}. Data source writers can implement this * interface to write {@link InternalRow} directly and avoid the row conversion at Spark side. * This is an experimental and unstable interface, as {@link InternalRow} is not public and may get * changed in the future Spark versions. */ @InterfaceStability.Unstable -public interface SupportsWriteInternalRow extends DataSourceV2Writer { +public interface SupportsWriteInternalRow extends DataSourceWriter { @Override default DataWriterFactory createWriterFactory() { diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/WriterCommitMessage.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/WriterCommitMessage.java index 082d6b5dc409f..9e38836c0edf9 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/WriterCommitMessage.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/WriterCommitMessage.java @@ -23,10 +23,10 @@ /** * A commit message returned by {@link DataWriter#commit()} and will be sent back to the driver side - * as the input parameter of {@link DataSourceV2Writer#commit(WriterCommitMessage[])}. + * as the input parameter of {@link DataSourceWriter#commit(WriterCommitMessage[])}. * * This is an empty interface, data sources should define their own message class and use it in - * their {@link DataWriter#commit()} and {@link DataSourceV2Writer#commit(WriterCommitMessage[])} + * their {@link DataWriter#commit()} and {@link DataSourceWriter#commit(WriterCommitMessage[])} * implementations. 
*/ @InterfaceStability.Evolving diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index b714a46b5f786..46b5f54a33f74 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -186,7 +186,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { val cls = DataSource.lookupDataSource(source, sparkSession.sessionState.conf) if (classOf[DataSourceV2].isAssignableFrom(cls)) { val ds = cls.newInstance() - val options = new DataSourceV2Options((extraOptions ++ + val options = new DataSourceOptions((extraOptions ++ DataSourceV2Utils.extractSessionConfigs( ds = ds.asInstanceOf[DataSourceV2], conf = sparkSession.sessionState.conf)).asJava) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 5c02eae05304b..ed7a9100cc7f1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -243,7 +243,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { val ds = cls.newInstance() ds match { case ws: WriteSupport => - val options = new DataSourceV2Options((extraOptions ++ + val options = new DataSourceOptions((extraOptions ++ DataSourceV2Utils.extractSessionConfigs( ds = ds.asInstanceOf[DataSourceV2], conf = df.sparkSession.sessionState.conf)).asJava) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala index 6093df26630cd..6460c97abe344 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala @@ -35,7 +35,7 @@ trait DataSourceReaderHolder { /** * The held data source reader. */ - def reader: DataSourceV2Reader + def reader: DataSourceReader /** * The metadata of this data source reader that can be used for equality test. 
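For the read path, a minimal Scala sketch of a source built against the renamed interfaces (`ReadSupport`, `DataSourceReader`, `DataReaderFactory`, `DataReader`, `DataSourceOptions`); the `RangeDataSource` names and the fixed two-partition split are hypothetical:

```scala
import java.util.{ArrayList => JArrayList, List => JList}

import org.apache.spark.sql.Row
import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, ReadSupport}
import org.apache.spark.sql.sources.v2.reader.{DataReader, DataReaderFactory, DataSourceReader}
import org.apache.spark.sql.types.StructType

// Hypothetical source that emits the integers 0..9 across two partitions.
class RangeDataSource extends DataSourceV2 with ReadSupport {
  override def createReader(options: DataSourceOptions): DataSourceReader = new RangeReader
}

class RangeReader extends DataSourceReader {
  override def readSchema(): StructType = new StructType().add("i", "int")

  // One factory per partition; each factory is serialized and sent to an executor.
  override def createDataReaderFactories(): JList[DataReaderFactory[Row]] = {
    val factories = new JArrayList[DataReaderFactory[Row]]()
    factories.add(new RangeDataReaderFactory(0, 5))
    factories.add(new RangeDataReaderFactory(5, 10))
    factories
  }
}

class RangeDataReaderFactory(start: Int, end: Int) extends DataReaderFactory[Row] {
  override def createDataReader(): DataReader[Row] = new DataReader[Row] {
    private var current = start - 1
    override def next(): Boolean = { current += 1; current < end }
    override def get(): Row = Row(current)
    override def close(): Unit = ()
  }
}
```

Such a source would be loaded with `spark.read.format(classOf[RangeDataSource].getName).load()`, which follows the `DataFrameReader` path in the hunk above once it has built a `DataSourceOptions` from the user-supplied options.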
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala index cba20dd902007..3d4c64981373d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.sources.v2.reader._ case class DataSourceV2Relation( fullOutput: Seq[AttributeReference], - reader: DataSourceV2Reader) extends LeafNode with DataSourceReaderHolder { + reader: DataSourceReader) extends LeafNode with DataSourceReaderHolder { override def canEqual(other: Any): Boolean = other.isInstanceOf[DataSourceV2Relation] @@ -41,12 +41,12 @@ case class DataSourceV2Relation( */ class StreamingDataSourceV2Relation( fullOutput: Seq[AttributeReference], - reader: DataSourceV2Reader) extends DataSourceV2Relation(fullOutput, reader) { + reader: DataSourceReader) extends DataSourceV2Relation(fullOutput, reader) { override def isStreaming: Boolean = true } object DataSourceV2Relation { - def apply(reader: DataSourceV2Reader): DataSourceV2Relation = { + def apply(reader: DataSourceReader): DataSourceV2Relation = { new DataSourceV2Relation(reader.readSchema().toAttributes, reader) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala index 3f808fbb40932..ee085820b0775 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala @@ -36,7 +36,7 @@ import org.apache.spark.sql.types.StructType */ case class DataSourceV2ScanExec( fullOutput: Seq[AttributeReference], - @transient reader: DataSourceV2Reader) + @transient reader: DataSourceReader) extends LeafExecNode with DataSourceReaderHolder with ColumnarBatchScan { override def canEqual(other: Any): Boolean = other.isInstanceOf[DataSourceV2ScanExec] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala index cd6b3e99b6bcb..c544adbf32cdf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala @@ -35,7 +35,7 @@ import org.apache.spark.util.Utils /** * The logical plan for writing data into data source v2. */ -case class WriteToDataSourceV2(writer: DataSourceV2Writer, query: LogicalPlan) extends LogicalPlan { +case class WriteToDataSourceV2(writer: DataSourceWriter, query: LogicalPlan) extends LogicalPlan { override def children: Seq[LogicalPlan] = Seq(query) override def output: Seq[Attribute] = Nil } @@ -43,7 +43,7 @@ case class WriteToDataSourceV2(writer: DataSourceV2Writer, query: LogicalPlan) e /** * The physical plan for writing data into data source v2. 
*/ -case class WriteToDataSourceV2Exec(writer: DataSourceV2Writer, query: SparkPlan) extends SparkPlan { +case class WriteToDataSourceV2Exec(writer: DataSourceWriter, query: SparkPlan) extends SparkPlan { override def children: Seq[SparkPlan] = Seq(query) override def output: Seq[Attribute] = Nil diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index 975975243a3d1..93572f7a63132 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.datasources.v2.{StreamingDataSourceV2Relation, WriteToDataSourceV2} import org.apache.spark.sql.execution.streaming.sources.{InternalRowMicroBatchWriter, MicroBatchWriter} -import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.DataSourceOptions import org.apache.spark.sql.sources.v2.streaming.{MicroBatchReadSupport, StreamWriteSupport} import org.apache.spark.sql.sources.v2.streaming.reader.{MicroBatchReader, Offset => OffsetV2} import org.apache.spark.sql.sources.v2.writer.SupportsWriteInternalRow @@ -89,7 +89,7 @@ class MicroBatchExecution( val reader = source.createMicroBatchReader( Optional.empty(), // user specified schema metadataPath, - new DataSourceV2Options(options.asJava)) + new DataSourceOptions(options.asJava)) nextSourceId += 1 StreamingExecutionRelation(reader, output)(sparkSession) }) @@ -447,7 +447,7 @@ class MicroBatchExecution( s"$runId", newAttributePlan.schema, outputMode, - new DataSourceV2Options(extraOptions.asJava)) + new DataSourceOptions(extraOptions.asJava)) if (writer.isInstanceOf[SupportsWriteInternalRow]) { WriteToDataSourceV2( new InternalRowMicroBatchWriter(currentBatchId, writer), newAttributePlan) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala index 66eb0169ac1ec..5e3fee633f591 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala @@ -111,7 +111,7 @@ class RateSourceProvider extends StreamSourceProvider with DataSourceRegister override def createContinuousReader( schema: Optional[StructType], checkpointLocation: String, - options: DataSourceV2Options): ContinuousReader = { + options: DataSourceOptions): ContinuousReader = { new RateStreamContinuousReader(options) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala index d5ac0bd1df52b..3f5bb489d6528 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.streaming import org.apache.spark.sql._ import org.apache.spark.sql.execution.streaming.sources.ConsoleWriter import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister} -import 
org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} +import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2} import org.apache.spark.sql.sources.v2.streaming.StreamWriteSupport import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter import org.apache.spark.sql.streaming.OutputMode @@ -40,7 +40,7 @@ class ConsoleSinkProvider extends DataSourceV2 queryId: String, schema: StructType, mode: OutputMode, - options: DataSourceV2Options): StreamWriter = { + options: DataSourceOptions): StreamWriter = { new ConsoleWriter(schema, options) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index 60f880f9c73b8..9402d7c1dcefd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, StreamingDataSourceV2Relation, WriteToDataSourceV2} import org.apache.spark.sql.execution.streaming.{ContinuousExecutionRelation, StreamingRelationV2, _} -import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.DataSourceOptions import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, StreamWriteSupport} import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, PartitionOffset} import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime, Trigger} @@ -160,7 +160,7 @@ class ContinuousExecution( dataSource.createContinuousReader( java.util.Optional.empty[StructType](), metadataPath, - new DataSourceV2Options(extraReaderOptions.asJava)) + new DataSourceOptions(extraReaderOptions.asJava)) } uniqueSources = continuousSources.distinct @@ -198,7 +198,7 @@ class ContinuousExecution( s"$runId", triggerLogicalPlan.schema, outputMode, - new DataSourceV2Options(extraOptions.asJava)) + new DataSourceOptions(extraOptions.asJava)) val withSink = WriteToDataSourceV2(writer, triggerLogicalPlan) val reader = withSink.collect { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala index 61304480f4721..ff028ebc4236a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala @@ -23,19 +23,18 @@ import org.json4s.DefaultFormats import org.json4s.jackson.Serialization import org.apache.spark.sql.Row -import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.streaming.{RateSourceProvider, RateStreamOffset, ValueRunTimeMsPair} import org.apache.spark.sql.execution.streaming.sources.RateStreamSourceV2 -import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} +import org.apache.spark.sql.sources.v2.DataSourceOptions import org.apache.spark.sql.sources.v2.reader._ import 
org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousDataReader, ContinuousReader, Offset, PartitionOffset} -import org.apache.spark.sql.types.{LongType, StructField, StructType, TimestampType} +import org.apache.spark.sql.types.StructType case class RateStreamPartitionOffset( partition: Int, currentValue: Long, currentTimeMs: Long) extends PartitionOffset -class RateStreamContinuousReader(options: DataSourceV2Options) +class RateStreamContinuousReader(options: DataSourceOptions) extends ContinuousReader { implicit val defaultFormats: DefaultFormats = DefaultFormats diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala index 7c1700f1de48c..d46f4d7b86360 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala @@ -19,13 +19,13 @@ package org.apache.spark.sql.execution.streaming.sources import org.apache.spark.internal.Logging import org.apache.spark.sql.{Row, SparkSession} -import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.DataSourceOptions import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter import org.apache.spark.sql.sources.v2.writer.{DataWriterFactory, WriterCommitMessage} import org.apache.spark.sql.types.StructType /** Common methods used to create writes for the the console sink */ -class ConsoleWriter(schema: StructType, options: DataSourceV2Options) +class ConsoleWriter(schema: StructType, options: DataSourceOptions) extends StreamWriter with Logging { // Number of rows to display, by default 20 rows diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/MicroBatchWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/MicroBatchWriter.scala index d7f3ba8856982..d7ce9a7b84479 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/MicroBatchWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/MicroBatchWriter.scala @@ -20,14 +20,14 @@ package org.apache.spark.sql.execution.streaming.sources import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter -import org.apache.spark.sql.sources.v2.writer.{DataSourceV2Writer, DataWriterFactory, SupportsWriteInternalRow, WriterCommitMessage} +import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriterFactory, SupportsWriteInternalRow, WriterCommitMessage} /** - * A [[DataSourceV2Writer]] used to hook V2 stream writers into a microbatch plan. It implements + * A [[DataSourceWriter]] used to hook V2 stream writers into a microbatch plan. It implements * the non-streaming interface, forwarding the batch ID determined at construction to a wrapped * streaming writer. 
*/ -class MicroBatchWriter(batchId: Long, writer: StreamWriter) extends DataSourceV2Writer { +class MicroBatchWriter(batchId: Long, writer: StreamWriter) extends DataSourceWriter { override def commit(messages: Array[WriterCommitMessage]): Unit = { writer.commit(batchId, messages) } @@ -38,7 +38,7 @@ class MicroBatchWriter(batchId: Long, writer: StreamWriter) extends DataSourceV2 } class InternalRowMicroBatchWriter(batchId: Long, writer: StreamWriter) - extends DataSourceV2Writer with SupportsWriteInternalRow { + extends DataSourceWriter with SupportsWriteInternalRow { override def commit(messages: Array[WriterCommitMessage]): Unit = { writer.commit(batchId, messages) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala index 9282ba05bdb7b..248295e401a0d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala @@ -21,11 +21,11 @@ import scala.collection.mutable import org.apache.spark.internal.Logging import org.apache.spark.sql.Row -import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory, WriterCommitMessage} +import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriter, DataWriterFactory, WriterCommitMessage} /** * A simple [[DataWriterFactory]] whose tasks just pack rows into the commit message for delivery - * to a [[org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer]] on the driver. + * to a [[DataSourceWriter]] on the driver. * * Note that, because it sends all rows to the driver, this factory will generally be unsuitable * for production-quality sinks. It's intended for use in tests. 
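The streaming wrappers above reuse the same batch writer machinery, so a `StreamWriter` only needs epoch-aware commit and abort. A brief sketch, reusing the hypothetical `CountingWriterFactory`/`CountCommitMessage` from the earlier writer sketch:

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter
import org.apache.spark.sql.sources.v2.writer.{DataWriterFactory, WriterCommitMessage}

// Hypothetical streaming sink writer; assumes the CountingWriterFactory and
// CountCommitMessage classes sketched earlier in this section.
class CountingStreamWriter extends StreamWriter {
  override def createWriterFactory(): DataWriterFactory[Row] = new CountingWriterFactory

  // Commit/abort are scoped to an epoch id supplied by the execution engine.
  override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {
    val total = messages.collect { case CountCommitMessage(n) => n }.sum
    println(s"epoch $epochId committed $total records")
  }

  override def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = ()
}
```

In a micro-batch query, `MicroBatchWriter` would wrap such a `StreamWriter` and forward the batch id as the epoch, which is the plumbing the hunks above set up.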
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala index a25cc4f3b06f8..43949e6180aaa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.streaming.{RateStreamOffset, ValueRunTimeMsPair} import org.apache.spark.sql.sources.DataSourceRegister -import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} +import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2} import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.sources.v2.streaming.MicroBatchReadSupport import org.apache.spark.sql.sources.v2.streaming.reader.{MicroBatchReader, Offset} @@ -44,14 +44,14 @@ class RateSourceProviderV2 extends DataSourceV2 with MicroBatchReadSupport with override def createMicroBatchReader( schema: Optional[StructType], checkpointLocation: String, - options: DataSourceV2Options): MicroBatchReader = { + options: DataSourceOptions): MicroBatchReader = { new RateStreamMicroBatchReader(options) } override def shortName(): String = "ratev2" } -class RateStreamMicroBatchReader(options: DataSourceV2Options) +class RateStreamMicroBatchReader(options: DataSourceOptions) extends MicroBatchReader { implicit val defaultFormats: DefaultFormats = DefaultFormats diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala index ce55e44d932bd..58767261dc684 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes.{Append, Complete, Update} import org.apache.spark.sql.execution.streaming.Sink -import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} +import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2} import org.apache.spark.sql.sources.v2.streaming.StreamWriteSupport import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter import org.apache.spark.sql.sources.v2.writer._ @@ -45,7 +45,7 @@ class MemorySinkV2 extends DataSourceV2 with StreamWriteSupport with Logging { queryId: String, schema: StructType, mode: OutputMode, - options: DataSourceV2Options): StreamWriter = { + options: DataSourceOptions): StreamWriter = { new MemoryStreamWriter(this, mode) } @@ -114,7 +114,7 @@ class MemorySinkV2 extends DataSourceV2 with StreamWriteSupport with Logging { case class MemoryWriterCommitMessage(partition: Int, data: Seq[Row]) extends WriterCommitMessage {} class MemoryWriter(sink: MemorySinkV2, batchId: Long, outputMode: OutputMode) - extends DataSourceV2Writer with Logging { + extends DataSourceWriter with Logging { override def createWriterFactory: MemoryWriterFactory = MemoryWriterFactory(outputMode) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 9f5ca9f914284..f1b3f93c4e1fc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.{StreamingRelation, StreamingRelationV2} import org.apache.spark.sql.sources.StreamSourceProvider -import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.DataSourceOptions import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, MicroBatchReadSupport} import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -158,7 +158,7 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo } val ds = DataSource.lookupDataSource(source, sparkSession.sqlContext.conf).newInstance() - val options = new DataSourceV2Options(extraOptions.asJava) + val options = new DataSourceOptions(extraOptions.asJava) // We need to generate the V1 data source so we can pass it to the V2 relation as a shim. // We can't be sure at this point whether we'll actually want to use V2, since we don't know the // writer or whether the query is continuous. diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java index 4026ee44bfdb7..d421f7d19563f 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java @@ -24,15 +24,15 @@ import org.apache.spark.sql.catalyst.expressions.GenericRow; import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.sources.GreaterThan; +import org.apache.spark.sql.sources.v2.DataSourceOptions; import org.apache.spark.sql.sources.v2.DataSourceV2; -import org.apache.spark.sql.sources.v2.DataSourceV2Options; import org.apache.spark.sql.sources.v2.ReadSupport; import org.apache.spark.sql.sources.v2.reader.*; import org.apache.spark.sql.types.StructType; public class JavaAdvancedDataSourceV2 implements DataSourceV2, ReadSupport { - class Reader implements DataSourceV2Reader, SupportsPushDownRequiredColumns, + class Reader implements DataSourceReader, SupportsPushDownRequiredColumns, SupportsPushDownFilters { private StructType requiredSchema = new StructType().add("i", "int").add("j", "int"); @@ -131,7 +131,7 @@ public void close() throws IOException { @Override - public DataSourceV2Reader createReader(DataSourceV2Options options) { + public DataSourceReader createReader(DataSourceOptions options) { return new Reader(); } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java index 34e6c63801064..c55093768105b 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java @@ -21,8 +21,8 @@ import java.util.List; import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector; +import org.apache.spark.sql.sources.v2.DataSourceOptions; import org.apache.spark.sql.sources.v2.DataSourceV2; 
-import org.apache.spark.sql.sources.v2.DataSourceV2Options; import org.apache.spark.sql.sources.v2.ReadSupport; import org.apache.spark.sql.sources.v2.reader.*; import org.apache.spark.sql.types.DataTypes; @@ -33,7 +33,7 @@ public class JavaBatchDataSourceV2 implements DataSourceV2, ReadSupport { - class Reader implements DataSourceV2Reader, SupportsScanColumnarBatch { + class Reader implements DataSourceReader, SupportsScanColumnarBatch { private final StructType schema = new StructType().add("i", "int").add("j", "int"); @Override @@ -108,7 +108,7 @@ public void close() throws IOException { @Override - public DataSourceV2Reader createReader(DataSourceV2Options options) { + public DataSourceReader createReader(DataSourceOptions options) { return new Reader(); } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaPartitionAwareDataSource.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaPartitionAwareDataSource.java index d0c87503ab455..99cca0f6dd626 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaPartitionAwareDataSource.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaPartitionAwareDataSource.java @@ -23,15 +23,15 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.expressions.GenericRow; +import org.apache.spark.sql.sources.v2.DataSourceOptions; import org.apache.spark.sql.sources.v2.DataSourceV2; -import org.apache.spark.sql.sources.v2.DataSourceV2Options; import org.apache.spark.sql.sources.v2.ReadSupport; import org.apache.spark.sql.sources.v2.reader.*; import org.apache.spark.sql.types.StructType; public class JavaPartitionAwareDataSource implements DataSourceV2, ReadSupport { - class Reader implements DataSourceV2Reader, SupportsReportPartitioning { + class Reader implements DataSourceReader, SupportsReportPartitioning { private final StructType schema = new StructType().add("a", "int").add("b", "int"); @Override @@ -104,7 +104,7 @@ public DataReader createDataReader() { } @Override - public DataSourceV2Reader createReader(DataSourceV2Options options) { + public DataSourceReader createReader(DataSourceOptions options) { return new Reader(); } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSchemaRequiredDataSource.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSchemaRequiredDataSource.java index f997366af1a64..048d078dfaac4 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSchemaRequiredDataSource.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSchemaRequiredDataSource.java @@ -20,16 +20,16 @@ import java.util.List; import org.apache.spark.sql.Row; +import org.apache.spark.sql.sources.v2.DataSourceOptions; import org.apache.spark.sql.sources.v2.DataSourceV2; -import org.apache.spark.sql.sources.v2.DataSourceV2Options; import org.apache.spark.sql.sources.v2.ReadSupportWithSchema; -import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader; +import org.apache.spark.sql.sources.v2.reader.DataSourceReader; import org.apache.spark.sql.sources.v2.reader.DataReaderFactory; import org.apache.spark.sql.types.StructType; public class JavaSchemaRequiredDataSource implements DataSourceV2, ReadSupportWithSchema { - class Reader implements DataSourceV2Reader { + class Reader implements DataSourceReader { private final StructType schema; Reader(StructType schema) { @@ -48,7 +48,7 @@ public List> createDataReaderFactories() { } @Override - public 
DataSourceV2Reader createReader(StructType schema, DataSourceV2Options options) { + public DataSourceReader createReader(StructType schema, DataSourceOptions options) { return new Reader(schema); } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleDataSourceV2.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleDataSourceV2.java index 2beed431d301f..96f55b8a76811 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleDataSourceV2.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleDataSourceV2.java @@ -23,16 +23,16 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.expressions.GenericRow; import org.apache.spark.sql.sources.v2.DataSourceV2; -import org.apache.spark.sql.sources.v2.DataSourceV2Options; +import org.apache.spark.sql.sources.v2.DataSourceOptions; import org.apache.spark.sql.sources.v2.ReadSupport; import org.apache.spark.sql.sources.v2.reader.DataReader; import org.apache.spark.sql.sources.v2.reader.DataReaderFactory; -import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader; +import org.apache.spark.sql.sources.v2.reader.DataSourceReader; import org.apache.spark.sql.types.StructType; public class JavaSimpleDataSourceV2 implements DataSourceV2, ReadSupport { - class Reader implements DataSourceV2Reader { + class Reader implements DataSourceReader { private final StructType schema = new StructType().add("i", "int").add("j", "int"); @Override @@ -80,7 +80,7 @@ public void close() throws IOException { } @Override - public DataSourceV2Reader createReader(DataSourceV2Options options) { + public DataSourceReader createReader(DataSourceOptions options) { return new Reader(); } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaUnsafeRowDataSourceV2.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaUnsafeRowDataSourceV2.java index e8187524ea871..c3916e0b370b5 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaUnsafeRowDataSourceV2.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaUnsafeRowDataSourceV2.java @@ -21,15 +21,15 @@ import java.util.List; import org.apache.spark.sql.catalyst.expressions.UnsafeRow; +import org.apache.spark.sql.sources.v2.DataSourceOptions; import org.apache.spark.sql.sources.v2.DataSourceV2; -import org.apache.spark.sql.sources.v2.DataSourceV2Options; import org.apache.spark.sql.sources.v2.ReadSupport; import org.apache.spark.sql.sources.v2.reader.*; import org.apache.spark.sql.types.StructType; public class JavaUnsafeRowDataSourceV2 implements DataSourceV2, ReadSupport { - class Reader implements DataSourceV2Reader, SupportsScanUnsafeRow { + class Reader implements DataSourceReader, SupportsScanUnsafeRow { private final StructType schema = new StructType().add("i", "int").add("j", "int"); @Override @@ -83,7 +83,7 @@ public void close() throws IOException { } @Override - public DataSourceV2Reader createReader(DataSourceV2Options options) { + public DataSourceReader createReader(DataSourceOptions options) { return new Reader(); } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala index d2cfe7905f6fa..b060aeeef811d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.continuous._ import org.apache.spark.sql.execution.streaming.sources.{RateStreamBatchTask, RateStreamMicroBatchReader, RateStreamSourceV2} -import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.DataSourceOptions import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, MicroBatchReadSupport} import org.apache.spark.sql.streaming.StreamTest import org.apache.spark.util.ManualClock @@ -49,7 +49,7 @@ class RateSourceV2Suite extends StreamTest { test("microbatch in registry") { DataSource.lookupDataSource("ratev2", spark.sqlContext.conf).newInstance() match { case ds: MicroBatchReadSupport => - val reader = ds.createMicroBatchReader(Optional.empty(), "", DataSourceV2Options.empty()) + val reader = ds.createMicroBatchReader(Optional.empty(), "", DataSourceOptions.empty()) assert(reader.isInstanceOf[RateStreamMicroBatchReader]) case _ => throw new IllegalStateException("Could not find v2 read support for rate") @@ -76,14 +76,14 @@ class RateSourceV2Suite extends StreamTest { test("microbatch - numPartitions propagated") { val reader = new RateStreamMicroBatchReader( - new DataSourceV2Options(Map("numPartitions" -> "11", "rowsPerSecond" -> "33").asJava)) + new DataSourceOptions(Map("numPartitions" -> "11", "rowsPerSecond" -> "33").asJava)) reader.setOffsetRange(Optional.empty(), Optional.empty()) val tasks = reader.createDataReaderFactories() assert(tasks.size == 11) } test("microbatch - set offset") { - val reader = new RateStreamMicroBatchReader(DataSourceV2Options.empty()) + val reader = new RateStreamMicroBatchReader(DataSourceOptions.empty()) val startOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(0, 1000)))) val endOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(0, 2000)))) reader.setOffsetRange(Optional.of(startOffset), Optional.of(endOffset)) @@ -93,7 +93,7 @@ class RateSourceV2Suite extends StreamTest { test("microbatch - infer offsets") { val reader = new RateStreamMicroBatchReader( - new DataSourceV2Options(Map("numPartitions" -> "1", "rowsPerSecond" -> "100").asJava)) + new DataSourceOptions(Map("numPartitions" -> "1", "rowsPerSecond" -> "100").asJava)) reader.clock.waitTillTime(reader.clock.getTimeMillis() + 100) reader.setOffsetRange(Optional.empty(), Optional.empty()) reader.getStartOffset() match { @@ -114,7 +114,7 @@ class RateSourceV2Suite extends StreamTest { test("microbatch - predetermined batch size") { val reader = new RateStreamMicroBatchReader( - new DataSourceV2Options(Map("numPartitions" -> "1", "rowsPerSecond" -> "20").asJava)) + new DataSourceOptions(Map("numPartitions" -> "1", "rowsPerSecond" -> "20").asJava)) val startOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(0, 1000)))) val endOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(20, 2000)))) reader.setOffsetRange(Optional.of(startOffset), Optional.of(endOffset)) @@ -125,7 +125,7 @@ class RateSourceV2Suite extends StreamTest { test("microbatch - data read") { val reader = new RateStreamMicroBatchReader( - new DataSourceV2Options(Map("numPartitions" -> "11", "rowsPerSecond" -> "33").asJava)) + new DataSourceOptions(Map("numPartitions" -> "11", "rowsPerSecond" -> "33").asJava)) val startOffset = RateStreamSourceV2.createInitialOffset(11, reader.creationTimeMs) val endOffset = 
RateStreamOffset(startOffset.partitionToValueAndRunTimeMs.toSeq.map { case (part, ValueRunTimeMsPair(currentVal, currentReadTime)) => @@ -150,7 +150,7 @@ class RateSourceV2Suite extends StreamTest { test("continuous in registry") { DataSource.lookupDataSource("rate", spark.sqlContext.conf).newInstance() match { case ds: ContinuousReadSupport => - val reader = ds.createContinuousReader(Optional.empty(), "", DataSourceV2Options.empty()) + val reader = ds.createContinuousReader(Optional.empty(), "", DataSourceOptions.empty()) assert(reader.isInstanceOf[RateStreamContinuousReader]) case _ => throw new IllegalStateException("Could not find v2 read support for rate") @@ -159,7 +159,7 @@ class RateSourceV2Suite extends StreamTest { test("continuous data") { val reader = new RateStreamContinuousReader( - new DataSourceV2Options(Map("numPartitions" -> "2", "rowsPerSecond" -> "20").asJava)) + new DataSourceOptions(Map("numPartitions" -> "2", "rowsPerSecond" -> "20").asJava)) reader.setOffset(Optional.empty()) val tasks = reader.createDataReaderFactories() assert(tasks.size == 2) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2OptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceOptionsSuite.scala similarity index 80% rename from sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2OptionsSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceOptionsSuite.scala index 90d92864b26fa..31dfc55b23361 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2OptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceOptionsSuite.scala @@ -22,24 +22,24 @@ import scala.collection.JavaConverters._ import org.apache.spark.SparkFunSuite /** - * A simple test suite to verify `DataSourceV2Options`. + * A simple test suite to verify `DataSourceOptions`. 
*/ -class DataSourceV2OptionsSuite extends SparkFunSuite { +class DataSourceOptionsSuite extends SparkFunSuite { test("key is case-insensitive") { - val options = new DataSourceV2Options(Map("foo" -> "bar").asJava) + val options = new DataSourceOptions(Map("foo" -> "bar").asJava) assert(options.get("foo").get() == "bar") assert(options.get("FoO").get() == "bar") assert(!options.get("abc").isPresent) } test("value is case-sensitive") { - val options = new DataSourceV2Options(Map("foo" -> "bAr").asJava) + val options = new DataSourceOptions(Map("foo" -> "bAr").asJava) assert(options.get("foo").get == "bAr") } test("getInt") { - val options = new DataSourceV2Options(Map("numFOo" -> "1", "foo" -> "bar").asJava) + val options = new DataSourceOptions(Map("numFOo" -> "1", "foo" -> "bar").asJava) assert(options.getInt("numFOO", 10) == 1) assert(options.getInt("numFOO2", 10) == 10) @@ -49,7 +49,7 @@ class DataSourceV2OptionsSuite extends SparkFunSuite { } test("getBoolean") { - val options = new DataSourceV2Options( + val options = new DataSourceOptions( Map("isFoo" -> "true", "isFOO2" -> "false", "foo" -> "bar").asJava) assert(options.getBoolean("isFoo", false)) assert(!options.getBoolean("isFoo2", true)) @@ -59,7 +59,7 @@ class DataSourceV2OptionsSuite extends SparkFunSuite { } test("getLong") { - val options = new DataSourceV2Options(Map("numFoo" -> "9223372036854775807", + val options = new DataSourceOptions(Map("numFoo" -> "9223372036854775807", "foo" -> "bar").asJava) assert(options.getLong("numFOO", 0L) == 9223372036854775807L) assert(options.getLong("numFoo2", -1L) == -1L) @@ -70,7 +70,7 @@ class DataSourceV2OptionsSuite extends SparkFunSuite { } test("getDouble") { - val options = new DataSourceV2Options(Map("numFoo" -> "922337.1", + val options = new DataSourceOptions(Map("numFoo" -> "922337.1", "foo" -> "bar").asJava) assert(options.getDouble("numFOO", 0d) == 922337.1d) assert(options.getDouble("numFoo2", -1.02d) == -1.02d) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index 42c5d3bcea44b..ee50e8a92270b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala @@ -201,7 +201,7 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { class SimpleDataSourceV2 extends DataSourceV2 with ReadSupport { - class Reader extends DataSourceV2Reader { + class Reader extends DataSourceReader { override def readSchema(): StructType = new StructType().add("i", "int").add("j", "int") override def createDataReaderFactories(): JList[DataReaderFactory[Row]] = { @@ -209,7 +209,7 @@ class SimpleDataSourceV2 extends DataSourceV2 with ReadSupport { } } - override def createReader(options: DataSourceV2Options): DataSourceV2Reader = new Reader + override def createReader(options: DataSourceOptions): DataSourceReader = new Reader } class SimpleDataReaderFactory(start: Int, end: Int) @@ -233,7 +233,7 @@ class SimpleDataReaderFactory(start: Int, end: Int) class AdvancedDataSourceV2 extends DataSourceV2 with ReadSupport { - class Reader extends DataSourceV2Reader + class Reader extends DataSourceReader with SupportsPushDownRequiredColumns with SupportsPushDownFilters { var requiredSchema = new StructType().add("i", "int").add("j", "int") @@ -275,7 +275,7 @@ class AdvancedDataSourceV2 extends DataSourceV2 with ReadSupport { } } - override def 
createReader(options: DataSourceV2Options): DataSourceV2Reader = new Reader + override def createReader(options: DataSourceOptions): DataSourceReader = new Reader } class AdvancedDataReaderFactory(start: Int, end: Int, requiredSchema: StructType) @@ -306,7 +306,7 @@ class AdvancedDataReaderFactory(start: Int, end: Int, requiredSchema: StructType class UnsafeRowDataSourceV2 extends DataSourceV2 with ReadSupport { - class Reader extends DataSourceV2Reader with SupportsScanUnsafeRow { + class Reader extends DataSourceReader with SupportsScanUnsafeRow { override def readSchema(): StructType = new StructType().add("i", "int").add("j", "int") override def createUnsafeRowReaderFactories(): JList[DataReaderFactory[UnsafeRow]] = { @@ -315,7 +315,7 @@ class UnsafeRowDataSourceV2 extends DataSourceV2 with ReadSupport { } } - override def createReader(options: DataSourceV2Options): DataSourceV2Reader = new Reader + override def createReader(options: DataSourceOptions): DataSourceReader = new Reader } class UnsafeRowDataReaderFactory(start: Int, end: Int) @@ -343,18 +343,18 @@ class UnsafeRowDataReaderFactory(start: Int, end: Int) class SchemaRequiredDataSource extends DataSourceV2 with ReadSupportWithSchema { - class Reader(val readSchema: StructType) extends DataSourceV2Reader { + class Reader(val readSchema: StructType) extends DataSourceReader { override def createDataReaderFactories(): JList[DataReaderFactory[Row]] = java.util.Collections.emptyList() } - override def createReader(schema: StructType, options: DataSourceV2Options): DataSourceV2Reader = + override def createReader(schema: StructType, options: DataSourceOptions): DataSourceReader = new Reader(schema) } class BatchDataSourceV2 extends DataSourceV2 with ReadSupport { - class Reader extends DataSourceV2Reader with SupportsScanColumnarBatch { + class Reader extends DataSourceReader with SupportsScanColumnarBatch { override def readSchema(): StructType = new StructType().add("i", "int").add("j", "int") override def createBatchDataReaderFactories(): JList[DataReaderFactory[ColumnarBatch]] = { @@ -362,7 +362,7 @@ class BatchDataSourceV2 extends DataSourceV2 with ReadSupport { } } - override def createReader(options: DataSourceV2Options): DataSourceV2Reader = new Reader + override def createReader(options: DataSourceOptions): DataSourceReader = new Reader } class BatchDataReaderFactory(start: Int, end: Int) @@ -406,7 +406,7 @@ class BatchDataReaderFactory(start: Int, end: Int) class PartitionAwareDataSource extends DataSourceV2 with ReadSupport { - class Reader extends DataSourceV2Reader with SupportsReportPartitioning { + class Reader extends DataSourceReader with SupportsReportPartitioning { override def readSchema(): StructType = new StructType().add("a", "int").add("b", "int") override def createDataReaderFactories(): JList[DataReaderFactory[Row]] = { @@ -428,7 +428,7 @@ class PartitionAwareDataSource extends DataSourceV2 with ReadSupport { } } - override def createReader(options: DataSourceV2Options): DataSourceV2Reader = new Reader + override def createReader(options: DataSourceOptions): DataSourceReader = new Reader } class SpecificDataReaderFactory(i: Array[Int], j: Array[Int]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala index 3310d6dd199d6..a131b16953e3b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala @@ -28,7 +28,7 @@ import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, Path} import org.apache.spark.SparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.sources.v2.reader.{DataReader, DataReaderFactory, DataSourceV2Reader} +import org.apache.spark.sql.sources.v2.reader.{DataReader, DataReaderFactory, DataSourceReader} import org.apache.spark.sql.sources.v2.writer._ import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.util.SerializableConfiguration @@ -42,7 +42,7 @@ class SimpleWritableDataSource extends DataSourceV2 with ReadSupport with WriteS private val schema = new StructType().add("i", "long").add("j", "long") - class Reader(path: String, conf: Configuration) extends DataSourceV2Reader { + class Reader(path: String, conf: Configuration) extends DataSourceReader { override def readSchema(): StructType = schema override def createDataReaderFactories(): JList[DataReaderFactory[Row]] = { @@ -64,7 +64,7 @@ class SimpleWritableDataSource extends DataSourceV2 with ReadSupport with WriteS } } - class Writer(jobId: String, path: String, conf: Configuration) extends DataSourceV2Writer { + class Writer(jobId: String, path: String, conf: Configuration) extends DataSourceWriter { override def createWriterFactory(): DataWriterFactory[Row] = { new SimpleCSVDataWriterFactory(path, jobId, new SerializableConfiguration(conf)) } @@ -104,7 +104,7 @@ class SimpleWritableDataSource extends DataSourceV2 with ReadSupport with WriteS } } - override def createReader(options: DataSourceV2Options): DataSourceV2Reader = { + override def createReader(options: DataSourceOptions): DataSourceReader = { val path = new Path(options.get("path").get()) val conf = SparkContext.getActive.get.hadoopConfiguration new Reader(path.toUri.toString, conf) @@ -114,7 +114,7 @@ class SimpleWritableDataSource extends DataSourceV2 with ReadSupport with WriteS jobId: String, schema: StructType, mode: SaveMode, - options: DataSourceV2Options): Optional[DataSourceV2Writer] = { + options: DataSourceOptions): Optional[DataSourceWriter] = { assert(DataType.equalsStructurally(schema.asNullable, this.schema.asNullable)) assert(!SparkContext.getActive.get.conf.getBoolean("spark.speculation", false)) @@ -141,7 +141,7 @@ class SimpleWritableDataSource extends DataSourceV2 with ReadSupport with WriteS } private def createWriter( - jobId: String, path: Path, conf: Configuration, internal: Boolean): DataSourceV2Writer = { + jobId: String, path: Path, conf: Configuration, internal: Boolean): DataSourceWriter = { val pathStr = path.toUri.toString if (internal) { new InternalRowWriter(jobId, pathStr, conf) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala index dc8c857018457..3127d664d32dc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.execution.streaming.{RateStreamOffset, Sink, Streami import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider} 
-import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.DataSourceOptions import org.apache.spark.sql.sources.v2.reader.DataReaderFactory import org.apache.spark.sql.sources.v2.streaming._ import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, MicroBatchReader, Offset, PartitionOffset} @@ -54,14 +54,14 @@ trait FakeMicroBatchReadSupport extends MicroBatchReadSupport { override def createMicroBatchReader( schema: Optional[StructType], checkpointLocation: String, - options: DataSourceV2Options): MicroBatchReader = FakeReader() + options: DataSourceOptions): MicroBatchReader = FakeReader() } trait FakeContinuousReadSupport extends ContinuousReadSupport { override def createContinuousReader( schema: Optional[StructType], checkpointLocation: String, - options: DataSourceV2Options): ContinuousReader = FakeReader() + options: DataSourceOptions): ContinuousReader = FakeReader() } trait FakeStreamWriteSupport extends StreamWriteSupport { @@ -69,7 +69,7 @@ trait FakeStreamWriteSupport extends StreamWriteSupport { queryId: String, schema: StructType, mode: OutputMode, - options: DataSourceV2Options): StreamWriter = { + options: DataSourceOptions): StreamWriter = { throw new IllegalStateException("fake sink - cannot actually write") } } From 7d96dc1acf7d7049a6e6c35de726f800c8160422 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 30 Jan 2018 21:00:29 +0800 Subject: [PATCH 215/566] [SPARK-23222][SQL] Make DataFrameRangeSuite not flaky ## What changes were proposed in this pull request? It is reported that the test `Cancelling stage in a query with Range` in `DataFrameRangeSuite` fails in a few unrelated PRs; I have also seen it in my own PR. The test is not actually very flaky, but it does fail occasionally. Based on how the test works, my guess is that `range` sometimes finishes before the listener calls `cancelStage`. I increase the range from `1000000000L` to `100000000000L` and compute it in a single partition, and I also reduce the `interval` used when polling for the stage id. Hopefully this makes the test no longer flaky. ## How was this patch tested? The modified tests. Author: Liang-Chi Hsieh Closes #20431 from viirya/SPARK-23222.
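To make the described race concrete, here is a minimal, hypothetical sketch (not the suite's exact code) of the pattern the fix relies on: the query computes a very large `range` in a single partition so it cannot finish early, the task publishes its stage id, and a `SparkListener` polls for that id with a tight `eventually` interval before calling `cancelStage`. Names such as `CancelRangeSketch` and `stageToKill` are illustrative only.

```scala
import org.apache.spark.{SparkException, TaskContext}
import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.sum
import org.scalatest.concurrent.Eventually.{eventually, interval, timeout}
import org.scalatest.time.SpanSugar._

// Hypothetical sketch of the test pattern (assumes a local-mode SparkSession so
// the task can write the driver-side variable, as the real suite does).
object CancelRangeSketch {
  @volatile var stageToKill: Int = -1

  def run(spark: SparkSession): Unit = {
    val sc = spark.sparkContext
    val listener = new SparkListener {
      override def onJobStart(jobStart: SparkListenerJobStart): Unit = {
        // Poll at a short interval so the stage is cancelled before the job can finish.
        eventually(timeout(10.seconds), interval(1.millis)) {
          if (stageToKill < 0) throw new IllegalStateException("stage id not reported yet")
        }
        sc.cancelStage(stageToKill)
      }
    }
    sc.addSparkListener(listener)
    try {
      import spark.implicits._
      // A huge range computed in a single partition: it cannot finish before cancellation.
      spark.range(0, 100000000000L, 1, 1).map { x =>
        stageToKill = TaskContext.get().stageId()
        x
      }.toDF("id").agg(sum("id")).collect()
    } catch {
      // Expected: the job fails because its stage was cancelled.
      case _: SparkException => ()
    } finally {
      sc.removeSparkListener(listener)
    }
  }
}
```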
(cherry picked from commit 84bcf9dc88ffeae6fba4cfad9455ad75bed6e6f6) Signed-off-by: Wenchen Fan --- .../scala/org/apache/spark/sql/DataFrameRangeSuite.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala index 45afbd29d1907..57a930dfaf320 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala @@ -154,7 +154,7 @@ class DataFrameRangeSuite extends QueryTest with SharedSQLContext with Eventuall test("Cancelling stage in a query with Range.") { val listener = new SparkListener { override def onJobStart(jobStart: SparkListenerJobStart): Unit = { - eventually(timeout(10.seconds)) { + eventually(timeout(10.seconds), interval(1.millis)) { assert(DataFrameRangeSuite.stageToKill > 0) } sparkContext.cancelStage(DataFrameRangeSuite.stageToKill) @@ -166,7 +166,7 @@ class DataFrameRangeSuite extends QueryTest with SharedSQLContext with Eventuall withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> codegen.toString()) { DataFrameRangeSuite.stageToKill = -1 val ex = intercept[SparkException] { - spark.range(1000000000L).map { x => + spark.range(0, 100000000000L, 1, 1).map { x => DataFrameRangeSuite.stageToKill = TaskContext.get().stageId() x }.toDF("id").agg(sum("id")).collect() @@ -184,6 +184,7 @@ class DataFrameRangeSuite extends QueryTest with SharedSQLContext with Eventuall assert(sparkContext.statusTracker.getExecutorInfos.map(_.numRunningTasks()).sum == 0) } } + sparkContext.removeSparkListener(listener) } test("SPARK-20430 Initialize Range parameters in a driver side") { From 2e0c1e5f3e47e4e35c14732b93a29d1a25e15662 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 30 Jan 2018 11:33:30 -0800 Subject: [PATCH 216/566] [SPARK-23267][SQL] Increase spark.sql.codegen.hugeMethodLimit to 65535 ## What changes were proposed in this pull request? We still saw the performance regression introduced by `spark.sql.codegen.hugeMethodLimit` in our internal workloads. There are two major issues in the current solution. - The size of the compiled bytecode is not identical to the bytecode size of the method, so the detection is still not accurate. - The bytecode size of a single operator (e.g., `SerializeFromObject`) could still exceed the 8K limit, and we saw the performance regression in that scenario. Since we are close to the 2.3 release, we decided to increase the limit to 64K to avoid the perf regression. ## How was this patch tested? N/A Author: gatorsmile Closes #20434 from gatorsmile/revertConf.
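As a user-facing illustration (a hypothetical snippet, not part of the patch): the limit is an ordinary SQL conf, so it can be lowered again if desired. The value 8000 below is only assumed to correspond to HotSpot's huge-method threshold (`CodeGenerator.DEFAULT_JVM_HUGE_METHOD_LIMIT`); check the constant in your Spark version before relying on it.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[*]")
  .appName("hugeMethodLimit-demo")
  .getOrCreate()

// After this change the default is 65535, the largest bytecode size of a valid Java method.
println(spark.conf.get("spark.sql.codegen.hugeMethodLimit"))

// Assumed example: lower the limit back to HotSpot's huge-method threshold (8000).
// When a generated function exceeds the configured limit, whole-stage codegen is
// deactivated for that subtree of the query plan.
spark.conf.set("spark.sql.codegen.hugeMethodLimit", "8000")
```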
(cherry picked from commit 31c00ad8b090d7eddc4622e73dc4440cd32624de) Signed-off-by: gatorsmile --- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 11 ++++++----- .../spark/sql/execution/WholeStageCodegenSuite.scala | 4 ++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 54a35594f505e..7394a0d7cf983 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -660,12 +660,13 @@ object SQLConf { val WHOLESTAGE_HUGE_METHOD_LIMIT = buildConf("spark.sql.codegen.hugeMethodLimit") .internal() .doc("The maximum bytecode size of a single compiled Java function generated by whole-stage " + - "codegen. When the compiled function exceeds this threshold, " + - "the whole-stage codegen is deactivated for this subtree of the current query plan. " + - s"The default value is ${CodeGenerator.DEFAULT_JVM_HUGE_METHOD_LIMIT} and " + - "this is a limit in the OpenJDK JVM implementation.") + "codegen. When the compiled function exceeds this threshold, the whole-stage codegen is " + + "deactivated for this subtree of the current query plan. The default value is 65535, which " + + "is the largest bytecode size possible for a valid Java method. When running on HotSpot, " + + s"it may be preferable to set the value to ${CodeGenerator.DEFAULT_JVM_HUGE_METHOD_LIMIT} " + + "to match HotSpot's implementation.") .intConf - .createWithDefault(CodeGenerator.DEFAULT_JVM_HUGE_METHOD_LIMIT) + .createWithDefault(65535) val WHOLESTAGE_SPLIT_CONSUME_FUNC_BY_OPERATOR = buildConf("spark.sql.codegen.splitConsumeFuncByOperator") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index 28ad712feaae6..6e8d5a70d5a8f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -202,7 +202,7 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext { wholeStageCodeGenExec.get.asInstanceOf[WholeStageCodegenExec].doCodeGen()._2 } - test("SPARK-21871 check if we can get large code size when compiling too long functions") { + ignore("SPARK-21871 check if we can get large code size when compiling too long functions") { val codeWithShortFunctions = genGroupByCode(3) val (_, maxCodeSize1) = CodeGenerator.compile(codeWithShortFunctions) assert(maxCodeSize1 < SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.defaultValue.get) @@ -211,7 +211,7 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext { assert(maxCodeSize2 > SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.defaultValue.get) } - test("bytecode of batch file scan exceeds the limit of WHOLESTAGE_HUGE_METHOD_LIMIT") { + ignore("bytecode of batch file scan exceeds the limit of WHOLESTAGE_HUGE_METHOD_LIMIT") { import testImplicits._ withTempPath { dir => val path = dir.getCanonicalPath From f4802dc8866d0316a6b555b0ab58a56d56d8c6fe Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Tue, 30 Jan 2018 14:11:06 -0800 Subject: [PATCH 217/566] [SPARK-23275][SQL] hive/tests have been failing when run locally on the laptop (Mac) with OOM ## What changes were proposed in this pull request? 
The hive tests have been failing when run locally (on macOS) after a recent change on trunk. After the tests run for some time, they fail with an OOM error ("unable to create new native thread"). I noticed the thread count goes all the way up to 2000+, after which we start getting these OOM errors. Most of the threads seem to be related to the connection pool in the hive metastore (BoneCP-xxxxx-xxxx). This behaviour change started after we made the following change to HiveClientImpl.reset(): ```scala def reset(): Unit = withHiveState { try { // code } finally { runSqlHive("USE default") // ===> this is causing the issue } } ``` I am proposing to temporarily back out part of a fix made to address SPARK-23000 to resolve this issue while we work out the exact reason for this sudden increase in thread counts. ## How was this patch tested? Ran hive/test multiple times on different machines. Author: Dilip Biswal Closes #20441 from dilipbiswal/hive_tests. (cherry picked from commit 58fcb5a95ee0b91300138cd23f3ce2165fab597f) Signed-off-by: gatorsmile --- .../sql/hive/client/HiveClientImpl.scala | 26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 4b923f5235a90..2e6628a7eafda 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -823,23 +823,19 @@ private[hive] class HiveClientImpl( } def reset(): Unit = withHiveState { - try { - client.getAllTables("default").asScala.foreach { t => - logDebug(s"Deleting table $t") - val table = client.getTable("default", t) - client.getIndexes("default", t, 255).asScala.foreach { index => - shim.dropIndex(client, "default", t, index.getIndexName) - } - if (!table.isIndexTable) { - client.dropTable("default", t) - } + client.getAllTables("default").asScala.foreach { t => + logDebug(s"Deleting table $t") + val table = client.getTable("default", t) + client.getIndexes("default", t, 255).asScala.foreach { index => + shim.dropIndex(client, "default", t, index.getIndexName) } - client.getAllDatabases.asScala.filterNot(_ == "default").foreach { db => - logDebug(s"Dropping Database: $db") - client.dropDatabase(db, true, false, true) + if (!table.isIndexTable) { + client.dropTable("default", t) } - } finally { - runSqlHive("USE default") + } + client.getAllDatabases.asScala.filterNot(_ == "default").foreach { db => + logDebug(s"Dropping Database: $db") + client.dropDatabase(db, true, false, true) } } } From 7b9fe08658b529901a5f22bf81ffdd4410180809 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 30 Jan 2018 17:08:30 -0800 Subject: [PATCH 218/566] [SPARK-23261][PYSPARK][BACKPORT-2.3] Rename Pandas UDFs This PR is to backport https://github.com/apache/spark/pull/20428 to Spark 2.3 without adding the changes regarding `GROUPED AGG PANDAS UDF`. --- ## What changes were proposed in this pull request? Rename the public APIs and names of pandas UDFs: - `PANDAS SCALAR UDF` -> `SCALAR PANDAS UDF` - `PANDAS GROUP MAP UDF` -> `GROUPED MAP PANDAS UDF` ## How was this patch tested? The existing tests. Author: gatorsmile Closes #20439 from gatorsmile/backport2.3.
--- .../spark/api/python/PythonRunner.scala | 8 +- docs/sql-programming-guide.md | 8 +- examples/src/main/python/sql/arrow.py | 12 +-- python/pyspark/rdd.py | 4 +- python/pyspark/sql/functions.py | 24 +++--- python/pyspark/sql/group.py | 10 +-- python/pyspark/sql/tests.py | 80 +++++++++---------- python/pyspark/sql/udf.py | 18 ++--- python/pyspark/worker.py | 16 ++-- .../spark/sql/RelationalGroupedDataset.scala | 4 +- .../python/ArrowEvalPythonExec.scala | 2 +- .../execution/python/ExtractPythonUDFs.scala | 4 +- .../python/FlatMapGroupsInPandasExec.scala | 2 +- 13 files changed, 96 insertions(+), 96 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala index 1ec0e717fac29..719ce5b9e3698 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala @@ -37,14 +37,14 @@ private[spark] object PythonEvalType { val SQL_BATCHED_UDF = 100 - val SQL_PANDAS_SCALAR_UDF = 200 - val SQL_PANDAS_GROUP_MAP_UDF = 201 + val SQL_SCALAR_PANDAS_UDF = 200 + val SQL_GROUPED_MAP_PANDAS_UDF = 201 def toString(pythonEvalType: Int): String = pythonEvalType match { case NON_UDF => "NON_UDF" case SQL_BATCHED_UDF => "SQL_BATCHED_UDF" - case SQL_PANDAS_SCALAR_UDF => "SQL_PANDAS_SCALAR_UDF" - case SQL_PANDAS_GROUP_MAP_UDF => "SQL_PANDAS_GROUP_MAP_UDF" + case SQL_SCALAR_PANDAS_UDF => "SQL_SCALAR_PANDAS_UDF" + case SQL_GROUPED_MAP_PANDAS_UDF => "SQL_GROUPED_MAP_PANDAS_UDF" } } diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index d49c8d869cba6..a0e221b39cc34 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1684,7 +1684,7 @@ Spark will fall back to create the DataFrame without Arrow. Pandas UDFs are user defined functions that are executed by Spark using Arrow to transfer data and Pandas to work with the data. A Pandas UDF is defined using the keyword `pandas_udf` as a decorator or to wrap the function, no additional configuration is required. Currently, there are two types of -Pandas UDF: Scalar and Group Map. +Pandas UDF: Scalar and Grouped Map. ### Scalar @@ -1702,8 +1702,8 @@ The following example shows how to create a scalar Pandas UDF that computes the
-### Group Map -Group map Pandas UDFs are used with `groupBy().apply()` which implements the "split-apply-combine" pattern. +### Grouped Map +Grouped map Pandas UDFs are used with `groupBy().apply()` which implements the "split-apply-combine" pattern. Split-apply-combine consists of three steps: * Split the data into groups by using `DataFrame.groupBy`. * Apply a function on each group. The input and output of the function are both `pandas.DataFrame`. The @@ -1723,7 +1723,7 @@ The following example shows how to use `groupby().apply()` to subtract the mean
-{% include_example group_map_pandas_udf python/sql/arrow.py %} +{% include_example grouped_map_pandas_udf python/sql/arrow.py %}
diff --git a/examples/src/main/python/sql/arrow.py b/examples/src/main/python/sql/arrow.py index 6c0028b3f1c1f..4c5aefb6ff4a6 100644 --- a/examples/src/main/python/sql/arrow.py +++ b/examples/src/main/python/sql/arrow.py @@ -86,15 +86,15 @@ def multiply_func(a, b): # $example off:scalar_pandas_udf$ -def group_map_pandas_udf_example(spark): - # $example on:group_map_pandas_udf$ +def grouped_map_pandas_udf_example(spark): + # $example on:grouped_map_pandas_udf$ from pyspark.sql.functions import pandas_udf, PandasUDFType df = spark.createDataFrame( [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v")) - @pandas_udf("id long, v double", PandasUDFType.GROUP_MAP) + @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP) def substract_mean(pdf): # pdf is a pandas.DataFrame v = pdf.v @@ -110,7 +110,7 @@ def substract_mean(pdf): # | 2|-1.0| # | 2| 4.0| # +---+----+ - # $example off:group_map_pandas_udf$ + # $example off:grouped_map_pandas_udf$ if __name__ == "__main__": @@ -123,7 +123,7 @@ def substract_mean(pdf): dataframe_with_arrow_example(spark) print("Running pandas_udf scalar example") scalar_pandas_udf_example(spark) - print("Running pandas_udf group map example") - group_map_pandas_udf_example(spark) + print("Running pandas_udf grouped map example") + grouped_map_pandas_udf_example(spark) spark.stop() diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 340bc3a6b7470..9a6010e87679c 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -68,8 +68,8 @@ class PythonEvalType(object): SQL_BATCHED_UDF = 100 - SQL_PANDAS_SCALAR_UDF = 200 - SQL_PANDAS_GROUP_MAP_UDF = 201 + SQL_SCALAR_PANDAS_UDF = 200 + SQL_GROUPED_MAP_PANDAS_UDF = 201 def portable_hash(x): diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 961b3267b44cf..b637707ac3cf5 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1737,8 +1737,8 @@ def translate(srcCol, matching, replace): def create_map(*cols): """Creates a new map column. - :param cols: list of column names (string) or list of :class:`Column` expressions that grouped - as key-value pairs, e.g. (key1, value1, key2, value2, ...). + :param cols: list of column names (string) or list of :class:`Column` expressions that are + grouped as key-value pairs, e.g. (key1, value1, key2, value2, ...). >>> df.select(create_map('name', 'age').alias("map")).collect() [Row(map={u'Alice': 2}), Row(map={u'Bob': 5})] @@ -2085,9 +2085,9 @@ def map_values(col): class PandasUDFType(object): """Pandas UDF Types. See :meth:`pyspark.sql.functions.pandas_udf`. """ - SCALAR = PythonEvalType.SQL_PANDAS_SCALAR_UDF + SCALAR = PythonEvalType.SQL_SCALAR_PANDAS_UDF - GROUP_MAP = PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF + GROUPED_MAP = PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF @since(1.3) @@ -2191,20 +2191,20 @@ def pandas_udf(f=None, returnType=None, functionType=None): Therefore, this can be used, for example, to ensure the length of each returned `pandas.Series`, and can not be used as the column length. - 2. GROUP_MAP + 2. GROUPED_MAP - A group map UDF defines transformation: A `pandas.DataFrame` -> A `pandas.DataFrame` + A grouped map UDF defines transformation: A `pandas.DataFrame` -> A `pandas.DataFrame` The returnType should be a :class:`StructType` describing the schema of the returned `pandas.DataFrame`. The length of the returned `pandas.DataFrame` can be arbitrary. - Group map UDFs are used with :meth:`pyspark.sql.GroupedData.apply`. 
+ Grouped map UDFs are used with :meth:`pyspark.sql.GroupedData.apply`. >>> from pyspark.sql.functions import pandas_udf, PandasUDFType >>> df = spark.createDataFrame( ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ... ("id", "v")) # doctest: +SKIP - >>> @pandas_udf("id long, v double", PandasUDFType.GROUP_MAP) # doctest: +SKIP + >>> @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP) # doctest: +SKIP ... def normalize(pdf): ... v = pdf.v ... return pdf.assign(v=(v - v.mean()) / v.std()) @@ -2254,20 +2254,20 @@ def pandas_udf(f=None, returnType=None, functionType=None): eval_type = returnType else: # @pandas_udf(dataType) or @pandas_udf(returnType=dataType) - eval_type = PythonEvalType.SQL_PANDAS_SCALAR_UDF + eval_type = PythonEvalType.SQL_SCALAR_PANDAS_UDF else: return_type = returnType if functionType is not None: eval_type = functionType else: - eval_type = PythonEvalType.SQL_PANDAS_SCALAR_UDF + eval_type = PythonEvalType.SQL_SCALAR_PANDAS_UDF if return_type is None: raise ValueError("Invalid returnType: returnType can not be None") - if eval_type not in [PythonEvalType.SQL_PANDAS_SCALAR_UDF, - PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF]: + if eval_type not in [PythonEvalType.SQL_SCALAR_PANDAS_UDF, + PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF]: raise ValueError("Invalid functionType: " "functionType must be one the values from PandasUDFType") diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py index 22061b83eb78c..330faf2addcbe 100644 --- a/python/pyspark/sql/group.py +++ b/python/pyspark/sql/group.py @@ -212,14 +212,14 @@ def apply(self, udf): This function does not support partial aggregation, and requires shuffling all the data in the :class:`DataFrame`. - :param udf: a group map user-defined function returned by - :meth:`pyspark.sql.functions.pandas_udf`. + :param udf: a grouped map user-defined function returned by + :func:`pyspark.sql.functions.pandas_udf`. >>> from pyspark.sql.functions import pandas_udf, PandasUDFType >>> df = spark.createDataFrame( ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ... ("id", "v")) - >>> @pandas_udf("id long, v double", PandasUDFType.GROUP_MAP) # doctest: +SKIP + >>> @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP) # doctest: +SKIP ... def normalize(pdf): ... v = pdf.v ... 
return pdf.assign(v=(v - v.mean()) / v.std()) @@ -239,9 +239,9 @@ def apply(self, udf): """ # Columns are special because hasattr always return True if isinstance(udf, Column) or not hasattr(udf, 'func') \ - or udf.evalType != PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF: + or udf.evalType != PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: raise ValueError("Invalid udf: the udf argument must be a pandas_udf of type " - "GROUP_MAP.") + "GROUPED_MAP.") df = self._df udf_column = udf(*[df[col] for col in df.columns]) jdf = self._jgd.flatMapGroupsInPandas(udf_column._jc.expr()) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index a8e337d8cd69f..0d5bc137abf47 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3621,34 +3621,34 @@ def test_pandas_udf_basic(self): udf = pandas_udf(lambda x: x, DoubleType()) self.assertEqual(udf.returnType, DoubleType()) - self.assertEqual(udf.evalType, PythonEvalType.SQL_PANDAS_SCALAR_UDF) + self.assertEqual(udf.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) udf = pandas_udf(lambda x: x, DoubleType(), PandasUDFType.SCALAR) self.assertEqual(udf.returnType, DoubleType()) - self.assertEqual(udf.evalType, PythonEvalType.SQL_PANDAS_SCALAR_UDF) + self.assertEqual(udf.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) udf = pandas_udf(lambda x: x, 'double', PandasUDFType.SCALAR) self.assertEqual(udf.returnType, DoubleType()) - self.assertEqual(udf.evalType, PythonEvalType.SQL_PANDAS_SCALAR_UDF) + self.assertEqual(udf.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) udf = pandas_udf(lambda x: x, StructType([StructField("v", DoubleType())]), - PandasUDFType.GROUP_MAP) + PandasUDFType.GROUPED_MAP) self.assertEqual(udf.returnType, StructType([StructField("v", DoubleType())])) - self.assertEqual(udf.evalType, PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF) + self.assertEqual(udf.evalType, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) - udf = pandas_udf(lambda x: x, 'v double', PandasUDFType.GROUP_MAP) + udf = pandas_udf(lambda x: x, 'v double', PandasUDFType.GROUPED_MAP) self.assertEqual(udf.returnType, StructType([StructField("v", DoubleType())])) - self.assertEqual(udf.evalType, PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF) + self.assertEqual(udf.evalType, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) udf = pandas_udf(lambda x: x, 'v double', - functionType=PandasUDFType.GROUP_MAP) + functionType=PandasUDFType.GROUPED_MAP) self.assertEqual(udf.returnType, StructType([StructField("v", DoubleType())])) - self.assertEqual(udf.evalType, PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF) + self.assertEqual(udf.evalType, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) udf = pandas_udf(lambda x: x, returnType='v double', - functionType=PandasUDFType.GROUP_MAP) + functionType=PandasUDFType.GROUPED_MAP) self.assertEqual(udf.returnType, StructType([StructField("v", DoubleType())])) - self.assertEqual(udf.evalType, PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF) + self.assertEqual(udf.evalType, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) def test_pandas_udf_decorator(self): from pyspark.rdd import PythonEvalType @@ -3659,45 +3659,45 @@ def test_pandas_udf_decorator(self): def foo(x): return x self.assertEqual(foo.returnType, DoubleType()) - self.assertEqual(foo.evalType, PythonEvalType.SQL_PANDAS_SCALAR_UDF) + self.assertEqual(foo.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) @pandas_udf(returnType=DoubleType()) def foo(x): return x self.assertEqual(foo.returnType, DoubleType()) - self.assertEqual(foo.evalType, PythonEvalType.SQL_PANDAS_SCALAR_UDF) + self.assertEqual(foo.evalType, 
PythonEvalType.SQL_SCALAR_PANDAS_UDF) schema = StructType([StructField("v", DoubleType())]) - @pandas_udf(schema, PandasUDFType.GROUP_MAP) + @pandas_udf(schema, PandasUDFType.GROUPED_MAP) def foo(x): return x self.assertEqual(foo.returnType, schema) - self.assertEqual(foo.evalType, PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF) + self.assertEqual(foo.evalType, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) - @pandas_udf('v double', PandasUDFType.GROUP_MAP) + @pandas_udf('v double', PandasUDFType.GROUPED_MAP) def foo(x): return x self.assertEqual(foo.returnType, schema) - self.assertEqual(foo.evalType, PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF) + self.assertEqual(foo.evalType, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) - @pandas_udf(schema, functionType=PandasUDFType.GROUP_MAP) + @pandas_udf(schema, functionType=PandasUDFType.GROUPED_MAP) def foo(x): return x self.assertEqual(foo.returnType, schema) - self.assertEqual(foo.evalType, PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF) + self.assertEqual(foo.evalType, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) @pandas_udf(returnType='v double', functionType=PandasUDFType.SCALAR) def foo(x): return x self.assertEqual(foo.returnType, schema) - self.assertEqual(foo.evalType, PythonEvalType.SQL_PANDAS_SCALAR_UDF) + self.assertEqual(foo.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) - @pandas_udf(returnType=schema, functionType=PandasUDFType.GROUP_MAP) + @pandas_udf(returnType=schema, functionType=PandasUDFType.GROUPED_MAP) def foo(x): return x self.assertEqual(foo.returnType, schema) - self.assertEqual(foo.evalType, PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF) + self.assertEqual(foo.evalType, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) def test_udf_wrong_arg(self): from pyspark.sql.functions import pandas_udf, PandasUDFType @@ -3724,15 +3724,15 @@ def zero_with_type(): return 1 with self.assertRaisesRegexp(TypeError, 'Invalid returnType'): - @pandas_udf(returnType=PandasUDFType.GROUP_MAP) + @pandas_udf(returnType=PandasUDFType.GROUPED_MAP) def foo(df): return df with self.assertRaisesRegexp(ValueError, 'Invalid returnType'): - @pandas_udf(returnType='double', functionType=PandasUDFType.GROUP_MAP) + @pandas_udf(returnType='double', functionType=PandasUDFType.GROUPED_MAP) def foo(df): return df with self.assertRaisesRegexp(ValueError, 'Invalid function'): - @pandas_udf(returnType='k int, v double', functionType=PandasUDFType.GROUP_MAP) + @pandas_udf(returnType='k int, v double', functionType=PandasUDFType.GROUPED_MAP) def foo(k, v): return k @@ -3804,11 +3804,11 @@ def test_register_nondeterministic_vectorized_udf_basic(self): random_pandas_udf = pandas_udf( lambda x: random.randint(6, 6) + x, IntegerType()).asNondeterministic() self.assertEqual(random_pandas_udf.deterministic, False) - self.assertEqual(random_pandas_udf.evalType, PythonEvalType.SQL_PANDAS_SCALAR_UDF) + self.assertEqual(random_pandas_udf.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) nondeterministic_pandas_udf = self.spark.catalog.registerFunction( "randomPandasUDF", random_pandas_udf) self.assertEqual(nondeterministic_pandas_udf.deterministic, False) - self.assertEqual(nondeterministic_pandas_udf.evalType, PythonEvalType.SQL_PANDAS_SCALAR_UDF) + self.assertEqual(nondeterministic_pandas_udf.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) [row] = self.spark.sql("SELECT randomPandasUDF(1)").collect() self.assertEqual(row[0], 7) @@ -4206,7 +4206,7 @@ def test_register_vectorized_udf_basic(self): col('id').cast('int').alias('b')) original_add = pandas_udf(lambda x, y: x + y, IntegerType()) 
self.assertEqual(original_add.deterministic, True) - self.assertEqual(original_add.evalType, PythonEvalType.SQL_PANDAS_SCALAR_UDF) + self.assertEqual(original_add.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) new_add = self.spark.catalog.registerFunction("add1", original_add) res1 = df.select(new_add(col('a'), col('b'))) res2 = self.spark.sql( @@ -4243,20 +4243,20 @@ def test_simple(self): StructField('v', IntegerType()), StructField('v1', DoubleType()), StructField('v2', LongType())]), - PandasUDFType.GROUP_MAP + PandasUDFType.GROUPED_MAP ) result = df.groupby('id').apply(foo_udf).sort('id').toPandas() expected = df.toPandas().groupby('id').apply(foo_udf.func).reset_index(drop=True) self.assertFramesEqual(expected, result) - def test_register_group_map_udf(self): + def test_register_grouped_map_udf(self): from pyspark.sql.functions import pandas_udf, PandasUDFType - foo_udf = pandas_udf(lambda x: x, "id long", PandasUDFType.GROUP_MAP) + foo_udf = pandas_udf(lambda x: x, "id long", PandasUDFType.GROUPED_MAP) with QuietTest(self.sc): with self.assertRaisesRegexp(ValueError, 'f must be either SQL_BATCHED_UDF or ' - 'SQL_PANDAS_SCALAR_UDF'): + 'SQL_SCALAR_PANDAS_UDF'): self.spark.catalog.registerFunction("foo_udf", foo_udf) def test_decorator(self): @@ -4265,7 +4265,7 @@ def test_decorator(self): @pandas_udf( 'id long, v int, v1 double, v2 long', - PandasUDFType.GROUP_MAP + PandasUDFType.GROUPED_MAP ) def foo(pdf): return pdf.assign(v1=pdf.v * pdf.id * 1.0, v2=pdf.v + pdf.id) @@ -4281,7 +4281,7 @@ def test_coerce(self): foo = pandas_udf( lambda pdf: pdf, 'id long, v double', - PandasUDFType.GROUP_MAP + PandasUDFType.GROUPED_MAP ) result = df.groupby('id').apply(foo).sort('id').toPandas() @@ -4295,7 +4295,7 @@ def test_complex_groupby(self): @pandas_udf( 'id long, v int, norm double', - PandasUDFType.GROUP_MAP + PandasUDFType.GROUPED_MAP ) def normalize(pdf): v = pdf.v @@ -4314,7 +4314,7 @@ def test_empty_groupby(self): @pandas_udf( 'id long, v int, norm double', - PandasUDFType.GROUP_MAP + PandasUDFType.GROUPED_MAP ) def normalize(pdf): v = pdf.v @@ -4334,7 +4334,7 @@ def test_datatype_string(self): foo_udf = pandas_udf( lambda pdf: pdf.assign(v1=pdf.v * pdf.id * 1.0, v2=pdf.v + pdf.id), 'id long, v int, v1 double, v2 long', - PandasUDFType.GROUP_MAP + PandasUDFType.GROUPED_MAP ) result = df.groupby('id').apply(foo_udf).sort('id').toPandas() @@ -4348,7 +4348,7 @@ def test_wrong_return_type(self): foo = pandas_udf( lambda pdf: pdf, 'id long, v map', - PandasUDFType.GROUP_MAP + PandasUDFType.GROUPED_MAP ) with QuietTest(self.sc): @@ -4374,7 +4374,7 @@ def test_wrong_args(self): with self.assertRaisesRegexp(ValueError, 'Invalid udf'): df.groupby('id').apply( pandas_udf(lambda x, y: x, StructType([StructField("d", DoubleType())]))) - with self.assertRaisesRegexp(ValueError, 'Invalid udf.*GROUP_MAP'): + with self.assertRaisesRegexp(ValueError, 'Invalid udf.*GROUPED_MAP'): df.groupby('id').apply( pandas_udf(lambda x, y: x, StructType([StructField("d", DoubleType())]), PandasUDFType.SCALAR)) @@ -4385,7 +4385,7 @@ def test_unsupported_types(self): [StructField("id", LongType(), True), StructField("map", MapType(StringType(), IntegerType()), True)]) df = self.spark.createDataFrame([(1, None,)], schema=schema) - f = pandas_udf(lambda x: x, df.schema, PandasUDFType.GROUP_MAP) + f = pandas_udf(lambda x: x, df.schema, PandasUDFType.GROUPED_MAP) with QuietTest(self.sc): with self.assertRaisesRegexp(Exception, 'Unsupported data type'): df.groupby('id').apply(f).collect() diff --git 
a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 07952558a036e..82a28c859525c 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -36,24 +36,24 @@ def _wrap_function(sc, func, returnType): def _create_udf(f, returnType, evalType): - if evalType == PythonEvalType.SQL_PANDAS_SCALAR_UDF or \ - evalType == PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF: + if evalType in (PythonEvalType.SQL_SCALAR_PANDAS_UDF, + PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF): import inspect from pyspark.sql.utils import require_minimum_pyarrow_version require_minimum_pyarrow_version() argspec = inspect.getargspec(f) - if evalType == PythonEvalType.SQL_PANDAS_SCALAR_UDF and len(argspec.args) == 0 and \ + if evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF and len(argspec.args) == 0 and \ argspec.varargs is None: raise ValueError( "Invalid function: 0-arg pandas_udfs are not supported. " "Instead, create a 1-arg pandas_udf and ignore the arg in your function." ) - if evalType == PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF and len(argspec.args) != 1: + if evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF and len(argspec.args) != 1: raise ValueError( - "Invalid function: pandas_udfs with function type GROUP_MAP " + "Invalid function: pandas_udfs with function type GROUPED_MAP " "must take a single arg that is a pandas DataFrame." ) @@ -109,10 +109,10 @@ def returnType(self): else: self._returnType_placeholder = _parse_datatype_string(self._returnType) - if self.evalType == PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF \ + if self.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF \ and not isinstance(self._returnType_placeholder, StructType): raise ValueError("Invalid returnType: returnType must be a StructType for " - "pandas_udf with function type GROUP_MAP") + "pandas_udf with function type GROUPED_MAP") return self._returnType_placeholder @@ -285,9 +285,9 @@ def register(self, name, f, returnType=None): "Invalid returnType: data type can not be specified when f is" "a user-defined function, but got %s." 
% returnType) if f.evalType not in [PythonEvalType.SQL_BATCHED_UDF, - PythonEvalType.SQL_PANDAS_SCALAR_UDF]: + PythonEvalType.SQL_SCALAR_PANDAS_UDF]: raise ValueError( - "Invalid f: f must be either SQL_BATCHED_UDF or SQL_PANDAS_SCALAR_UDF") + "Invalid f: f must be either SQL_BATCHED_UDF or SQL_SCALAR_PANDAS_UDF") register_udf = UserDefinedFunction(f.func, returnType=f.returnType, name=name, evalType=f.evalType, deterministic=f.deterministic) diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index e6737ae1c1285..8c41c3d1d6183 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -74,7 +74,7 @@ def wrap_udf(f, return_type): return lambda *a: f(*a) -def wrap_pandas_scalar_udf(f, return_type): +def wrap_scalar_pandas_udf(f, return_type): arrow_return_type = to_arrow_type(return_type) def verify_result_length(*a): @@ -90,7 +90,7 @@ def verify_result_length(*a): return lambda *a: (verify_result_length(*a), arrow_return_type) -def wrap_pandas_group_map_udf(f, return_type): +def wrap_grouped_map_pandas_udf(f, return_type): def wrapped(*series): import pandas as pd @@ -122,10 +122,10 @@ def read_single_udf(pickleSer, infile, eval_type): row_func = chain(row_func, f) # the last returnType will be the return type of UDF - if eval_type == PythonEvalType.SQL_PANDAS_SCALAR_UDF: - return arg_offsets, wrap_pandas_scalar_udf(row_func, return_type) - elif eval_type == PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF: - return arg_offsets, wrap_pandas_group_map_udf(row_func, return_type) + if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF: + return arg_offsets, wrap_scalar_pandas_udf(row_func, return_type) + elif eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: + return arg_offsets, wrap_grouped_map_pandas_udf(row_func, return_type) else: return arg_offsets, wrap_udf(row_func, return_type) @@ -148,8 +148,8 @@ def read_udfs(pickleSer, infile, eval_type): func = lambda _, it: map(mapper, it) - if eval_type == PythonEvalType.SQL_PANDAS_SCALAR_UDF \ - or eval_type == PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF: + if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF, + PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF): timezone = utf8_deserializer.loads(infile) ser = ArrowStreamPandasSerializer(timezone) else: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index a009c00b0abc5..4e0785a19de34 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -450,8 +450,8 @@ class RelationalGroupedDataset protected[sql]( * workers. 
*/ private[sql] def flatMapGroupsInPandas(expr: PythonUDF): DataFrame = { - require(expr.evalType == PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF, - "Must pass a group map udf") + require(expr.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, + "Must pass a grouped map udf") require(expr.dataType.isInstanceOf[StructType], "The returnType of the udf must be a StructType") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala index 47b146f076b62..c4de214679ae4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala @@ -81,7 +81,7 @@ case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], chi val columnarBatchIter = new ArrowPythonRunner( funcs, bufferSize, reuseWorker, - PythonEvalType.SQL_PANDAS_SCALAR_UDF, argOffsets, schema, + PythonEvalType.SQL_SCALAR_PANDAS_UDF, argOffsets, schema, sessionLocalTimeZone, pandasRespectSessionTimeZone) .compute(batchIter, context.partitionId(), context) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala index e6616d013cba6..78521526476b9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala @@ -152,7 +152,7 @@ object ExtractPythonUDFs extends Rule[SparkPlan] with PredicateHelper { if (validUdfs.nonEmpty) { require(validUdfs.forall(udf => udf.evalType == PythonEvalType.SQL_BATCHED_UDF || - udf.evalType == PythonEvalType.SQL_PANDAS_SCALAR_UDF + udf.evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF ), "Can only extract scalar vectorized udf or sql batch udf") val resultAttrs = udfs.zipWithIndex.map { case (u, i) => @@ -160,7 +160,7 @@ object ExtractPythonUDFs extends Rule[SparkPlan] with PredicateHelper { } val evaluation = validUdfs.partition( - _.evalType == PythonEvalType.SQL_PANDAS_SCALAR_UDF + _.evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF ) match { case (vectorizedUdfs, plainUdfs) if plainUdfs.isEmpty => ArrowEvalPythonExec(vectorizedUdfs, child.output ++ resultAttrs, child) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala index 59db66bd7adf1..c798fe5a92c54 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala @@ -96,7 +96,7 @@ case class FlatMapGroupsInPandasExec( val columnarBatchIter = new ArrowPythonRunner( chainedFunc, bufferSize, reuseWorker, - PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF, argOffsets, schema, + PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, argOffsets, schema, sessionLocalTimeZone, pandasRespectSessionTimeZone) .compute(grouped, context.partitionId(), context) From 6ed0d57f86e76f37c4ca1c6d721fc235dcec520e Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 30 Jan 2018 17:14:17 -0800 Subject: [PATCH 219/566] [SPARK-23276][SQL][TEST] Enable UDT tests in (Hive)OrcHadoopFsRelationSuite ## What changes were proposed in this pull request? 
Like Parquet, ORC test suites should enable UDT tests. ## How was this patch tested? Pass the Jenkins with newly enabled test cases. Author: Dongjoon Hyun Closes #20440 from dongjoon-hyun/SPARK-23276. (cherry picked from commit 77866167330a665e174ae08a2f8902ef9dc3438b) Signed-off-by: gatorsmile --- .../apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala index f87162f94c01a..ee421c150b644 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala @@ -33,11 +33,10 @@ class OrcHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName - // ORC does not play well with NullType and UDT. + // ORC does not play well with NullType. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false case _: CalendarIntervalType => false - case _: UserDefinedType[_] => false case _ => true } From b8778321bbb443f2d51ab7e2b1aff3d1e4236e35 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 30 Jan 2018 20:05:57 -0800 Subject: [PATCH 220/566] [SPARK-23274][SQL] Fix ReplaceExceptWithFilter when the right's Filter contains the references that are not in the left output ## What changes were proposed in this pull request? This PR is to fix the `ReplaceExceptWithFilter` rule when the right's Filter contains the references that are not in the left output. Before this PR, we got the error like ``` java.util.NoSuchElementException: key not found: a at scala.collection.MapLike$class.default(MapLike.scala:228) at scala.collection.AbstractMap.default(Map.scala:59) at scala.collection.MapLike$class.apply(MapLike.scala:141) at scala.collection.AbstractMap.apply(Map.scala:59) ``` After this PR, `ReplaceExceptWithFilter ` will not take an effect in this case. ## How was this patch tested? Added tests Author: gatorsmile Closes #20444 from gatorsmile/fixReplaceExceptWithFilter. 
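To make the failure mode above concrete, here is a minimal, self-contained sketch (not part of the patch; the object name, the local master and the sample data are illustrative, though the data matches the DataFrameSuite test added by this patch). It builds the query shape that used to break: the right-hand side of the EXCEPT is filtered on a column that is projected away before the EXCEPT.

```scala
// Sketch only: reproduces the query shape that used to fail in ReplaceExceptWithFilter.
import org.apache.spark.sql.SparkSession

object ExceptWithFilterSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("SPARK-23274-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq((1, 2, 4), (1, 3, 5), (2, 2, 3), (2, 4, 5)).toDF("a", "b", "c")
    val df1 = df.filter($"a" === 1)  // filter on 'a', which is projected away below
    val df2 = df.filter($"a" === 2)

    // The right side's filter references 'a', but the left output only contains 'b'.
    // Before the fix, remapping that condition onto the left output threw
    // "java.util.NoSuchElementException: key not found: a"; with the fix the rule is
    // skipped and the two queries return Row(3) and Row(2) respectively.
    df1.select("b").except(df2.select("b")).show()
    df1.select("b").except(df2.select("c")).show()

    spark.stop()
  }
}
```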
(cherry picked from commit ca04c3ff2387bf0a4308a4b010154e6761827278) Signed-off-by: gatorsmile --- .../optimizer/ReplaceExceptWithFilter.scala | 17 +++++++++++++---- .../optimizer/ReplaceOperatorSuite.scala | 15 +++++++++++++++ .../org/apache/spark/sql/DataFrameSuite.scala | 8 ++++++++ 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceExceptWithFilter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceExceptWithFilter.scala index 89bfcee078fba..45edf266bbce4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceExceptWithFilter.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceExceptWithFilter.scala @@ -46,18 +46,27 @@ object ReplaceExceptWithFilter extends Rule[LogicalPlan] { } plan.transform { - case Except(left, right) if isEligible(left, right) => - Distinct(Filter(Not(transformCondition(left, skipProject(right))), left)) + case e @ Except(left, right) if isEligible(left, right) => + val newCondition = transformCondition(left, skipProject(right)) + newCondition.map { c => + Distinct(Filter(Not(c), left)) + }.getOrElse { + e + } } } - private def transformCondition(left: LogicalPlan, right: LogicalPlan): Expression = { + private def transformCondition(left: LogicalPlan, right: LogicalPlan): Option[Expression] = { val filterCondition = InferFiltersFromConstraints(combineFilters(right)).asInstanceOf[Filter].condition val attributeNameMap: Map[String, Attribute] = left.output.map(x => (x.name, x)).toMap - filterCondition.transform { case a : AttributeReference => attributeNameMap(a.name) } + if (filterCondition.references.forall(r => attributeNameMap.contains(r.name))) { + Some(filterCondition.transform { case a: AttributeReference => attributeNameMap(a.name) }) + } else { + None + } } // TODO: This can be further extended in the future. 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala index e9701ffd2c54b..52dc2e9fb076c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala @@ -168,6 +168,21 @@ class ReplaceOperatorSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("replace Except with Filter when only right filter can be applied to the left") { + val table = LocalRelation(Seq('a.int, 'b.int)) + val left = table.where('b < 1).select('a).as("left") + val right = table.where('b < 3).select('a).as("right") + + val query = Except(left, right) + val optimized = Optimize.execute(query.analyze) + + val correctAnswer = + Aggregate(left.output, right.output, + Join(left, right, LeftAnti, Option($"left.a" <=> $"right.a"))).analyze + + comparePlans(optimized, correctAnswer) + } + test("replace Distinct with Aggregate") { val input = LocalRelation('a.int, 'b.int) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 33707080c1301..8b66f77b2f923 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -589,6 +589,14 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { Nil) } + test("SPARK-23274: except between two projects without references used in filter") { + val df = Seq((1, 2, 4), (1, 3, 5), (2, 2, 3), (2, 4, 5)).toDF("a", "b", "c") + val df1 = df.filter($"a" === 1) + val df2 = df.filter($"a" === 2) + checkAnswer(df1.select("b").except(df2.select("b")), Row(3) :: Nil) + checkAnswer(df1.select("b").except(df2.select("c")), Row(2) :: Nil) + } + test("except distinct - SQL compliance") { val df_left = Seq(1, 2, 2, 3, 3, 4).toDF("id") val df_right = Seq(1, 3).toDF("id") From ab5a5105502c545bed951538f0ce9409cfbde154 Mon Sep 17 00:00:00 2001 From: jerryshao Date: Wed, 31 Jan 2018 13:59:21 +0800 Subject: [PATCH 221/566] [SPARK-23279][SS] Avoid triggering distributed job for Console sink ## What changes were proposed in this pull request? Console sink will redistribute collected local data and trigger a distributed job in each batch, this is not necessary, so here change to local job. ## How was this patch tested? Existing UT and manual verification. Author: jerryshao Closes #20447 from jerryshao/console-minor. 
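As an illustration of why the one-line change below avoids a distributed job, here is a hedged sketch (the object name, schema, sample rows and local master are made up for the example; only the two `createDataFrame` calls come from the patch). The console sink already holds the rows of a batch on the driver, so the `java.util.List` overload, which plans a local relation, is sufficient.

```scala
// Sketch only: contrasts the old and new ways ConsoleWriter builds its DataFrame.
import scala.collection.JavaConverters._

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object ConsoleSinkSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("SPARK-23279-sketch").getOrCreate()

    val schema = StructType(Seq(StructField("id", IntegerType), StructField("name", StringType)))
    val rows: Array[Row] = Array(Row(1, "a"), Row(2, "b"))  // stands in for the collected batch

    // Old: wraps already-collected rows in an RDD, so show() schedules a distributed job.
    spark.createDataFrame(spark.sparkContext.parallelize(rows), schema).show()

    // New: the java.util.List overload plans a local relation, so show() does not need
    // a distributed job.
    spark.createDataFrame(rows.toList.asJava, schema).show()

    spark.stop()
  }
}
```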
(cherry picked from commit 8c6a9c90a36a938372f28ee8be72178192fbc313)
Signed-off-by: jerryshao
---
 .../spark/sql/execution/streaming/sources/ConsoleWriter.scala | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala
index d46f4d7b86360..c57bdc4a28905 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.execution.streaming.sources
 
+import scala.collection.JavaConverters._
+
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{Row, SparkSession}
 import org.apache.spark.sql.sources.v2.DataSourceOptions
@@ -61,7 +63,7 @@ class ConsoleWriter(schema: StructType, options: DataSourceOptions)
     println("-------------------------------------------")
     // scalastyle:off println
     spark
-      .createDataFrame(spark.sparkContext.parallelize(rows), schema)
+      .createDataFrame(rows.toList.asJava, schema)
       .show(numRowsToShow, isTruncated)
   }
 
From 7ec8ad7ba5b45658b55dd278b4c7ca2e35acfdd3 Mon Sep 17 00:00:00 2001
From: Wenchen Fan
Date: Wed, 31 Jan 2018 15:13:15 +0800
Subject: [PATCH 222/566] [SPARK-23272][SQL] add calendar interval type support to ColumnVector

## What changes were proposed in this pull request?

`ColumnVector` is aimed to support all the data types, but `CalendarIntervalType` is missing. Actually we do support interval type for inner fields, e.g. `ColumnarRow`, `ColumnarArray` both support interval type. It's weird if we don't support interval type at the top level.

This PR adds the interval type support.

This PR also makes `ColumnVector.getChild` protected. It only had to be public because `MutableColumnarRow.getInterval` used it; now the interval implementation lives in `ColumnVector.getInterval`.

## How was this patch tested?

a new test.

Author: Wenchen Fan

Closes #20438 from cloud-fan/interval.
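For readers of the API change, a small standalone usage sketch (not part of the patch; it mirrors the new ColumnarBatchSuite test and assumes `getChild` remains public on `OnHeapColumnVector`, since the patch only tightens it on the abstract `ColumnVector`). An interval vector is backed by two child vectors: months as int and microseconds as long.

```scala
// Sketch only: exercising the new ColumnVector.getInterval on a writable vector.
import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector
import org.apache.spark.sql.types.CalendarIntervalType
import org.apache.spark.unsafe.types.CalendarInterval

object IntervalVectorSketch {
  def main(args: Array[String]): Unit = {
    val vector = new OnHeapColumnVector(4, CalendarIntervalType)

    // Child 0 holds the months, child 1 holds the microseconds.
    vector.getChild(0).putInt(0, 1)
    vector.getChild(1).putLong(0, 100L)
    vector.putNull(1)

    assert(vector.getInterval(0) == new CalendarInterval(1, 100L))
    assert(vector.getInterval(1) == null)  // null slots come back as null intervals

    vector.close()
  }
}
```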
(cherry picked from commit 695f7146bca342a0ee192d8c7f5ec48d4d8577a8) Signed-off-by: Wenchen Fan --- .../vectorized/MutableColumnarRow.java | 4 +- .../sql/vectorized/ArrowColumnVector.java | 2 +- .../spark/sql/vectorized/ColumnVector.java | 26 ++++++++++- .../spark/sql/vectorized/ColumnarArray.java | 4 +- .../spark/sql/vectorized/ColumnarRow.java | 4 +- .../vectorized/ColumnarBatchSuite.scala | 45 +++++++++++++++++-- 6 files changed, 70 insertions(+), 15 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java index 2bab095d4d951..66668f3753604 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java @@ -146,9 +146,7 @@ public byte[] getBinary(int ordinal) { @Override public CalendarInterval getInterval(int ordinal) { if (columns[ordinal].isNullAt(rowId)) return null; - final int months = columns[ordinal].getChild(0).getInt(rowId); - final long microseconds = columns[ordinal].getChild(1).getLong(rowId); - return new CalendarInterval(months, microseconds); + return columns[ordinal].getInterval(rowId); } @Override diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java index 9803c3dec6de2..a75d76bd0f82e 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java @@ -28,7 +28,7 @@ import org.apache.spark.unsafe.types.UTF8String; /** - * A column vector backed by Apache Arrow. Currently time interval type and map type are not + * A column vector backed by Apache Arrow. Currently calendar interval type and map type are not * supported. */ @InterfaceStability.Evolving diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java index 4b955ceddd0f2..111f5d9b358d4 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java @@ -20,6 +20,7 @@ import org.apache.spark.sql.catalyst.util.MapData; import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.Decimal; +import org.apache.spark.unsafe.types.CalendarInterval; import org.apache.spark.unsafe.types.UTF8String; /** @@ -195,6 +196,7 @@ public double[] getDoubles(int rowId, int count) { * struct field. */ public final ColumnarRow getStruct(int rowId) { + if (isNullAt(rowId)) return null; return new ColumnarRow(this, rowId); } @@ -236,9 +238,29 @@ public MapData getMap(int ordinal) { public abstract byte[] getBinary(int rowId); /** - * Returns the ordinal's child column vector. + * Returns the calendar interval type value for rowId. + * + * In Spark, calendar interval type value is basically an integer value representing the number of + * months in this interval, and a long value representing the number of microseconds in this + * interval. An interval type vector is the same as a struct type vector with 2 fields: `months` + * and `microseconds`. 
+ * + * To support interval type, implementations must implement {@link #getChild(int)} and define 2 + * child vectors: the first child vector is an int type vector, containing all the month values of + * all the interval values in this vector. The second child vector is a long type vector, + * containing all the microsecond values of all the interval values in this vector. + */ + public final CalendarInterval getInterval(int rowId) { + if (isNullAt(rowId)) return null; + final int months = getChild(0).getInt(rowId); + final long microseconds = getChild(1).getLong(rowId); + return new CalendarInterval(months, microseconds); + } + + /** + * @return child [[ColumnVector]] at the given ordinal. */ - public abstract ColumnVector getChild(int ordinal); + protected abstract ColumnVector getChild(int ordinal); /** * Data type for this column. diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java index 0d2c3ec8648d3..72c07ee7cad3f 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java @@ -135,9 +135,7 @@ public byte[] getBinary(int ordinal) { @Override public CalendarInterval getInterval(int ordinal) { - int month = data.getChild(0).getInt(offset + ordinal); - long microseconds = data.getChild(1).getLong(offset + ordinal); - return new CalendarInterval(month, microseconds); + return data.getInterval(offset + ordinal); } @Override diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java index 25db7e09d20d0..6ca749d7c6e85 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java @@ -139,9 +139,7 @@ public byte[] getBinary(int ordinal) { @Override public CalendarInterval getInterval(int ordinal) { if (data.getChild(ordinal).isNullAt(rowId)) return null; - final int months = data.getChild(ordinal).getChild(0).getInt(rowId); - final long microseconds = data.getChild(ordinal).getChild(1).getLong(rowId); - return new CalendarInterval(months, microseconds); + return data.getChild(ordinal).getInterval(rowId); } @Override diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 1873c24ab063c..925c101fe1fee 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -620,6 +620,39 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(column.arrayData().elementsAppended == 0) } + testVector("CalendarInterval APIs", 4, CalendarIntervalType) { + column => + val reference = mutable.ArrayBuffer.empty[CalendarInterval] + + val months = column.getChild(0) + val microseconds = column.getChild(1) + assert(months.dataType() == IntegerType) + assert(microseconds.dataType() == LongType) + + months.putInt(0, 1) + microseconds.putLong(0, 100) + reference += new CalendarInterval(1, 100) + + months.putInt(1, 0) + microseconds.putLong(1, 2000) + reference += new CalendarInterval(0, 2000) + + column.putNull(2) + reference += null + + months.putInt(3, 20) + microseconds.putLong(3, 0) + reference += new 
CalendarInterval(20, 0) + + reference.zipWithIndex.foreach { case (v, i) => + val errMsg = "VectorType=" + column.getClass.getSimpleName + assert(v == column.getInterval(i), errMsg) + if (v == null) assert(column.isNullAt(i), errMsg) + } + + column.close() + } + testVector("Int Array", 10, new ArrayType(IntegerType, true)) { column => @@ -739,14 +772,20 @@ class ColumnarBatchSuite extends SparkFunSuite { c1.putInt(0, 123) c2.putDouble(0, 3.45) - c1.putInt(1, 456) - c2.putDouble(1, 5.67) + + column.putNull(1) + + c1.putInt(2, 456) + c2.putDouble(2, 5.67) val s = column.getStruct(0) assert(s.getInt(0) == 123) assert(s.getDouble(1) == 3.45) - val s2 = column.getStruct(1) + assert(column.isNullAt(1)) + assert(column.getStruct(1) == null) + + val s2 = column.getStruct(2) assert(s2.getInt(0) == 456) assert(s2.getDouble(1) == 5.67) } From c83246c9a3fe7557e7e1d226edf18a7e96730d18 Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Wed, 31 Jan 2018 10:37:37 +0200 Subject: [PATCH 223/566] [SPARK-23112][DOC] Update ML migration guide with breaking and behavior changes. Add breaking changes, as well as update behavior changes, to `2.3` ML migration guide. ## How was this patch tested? Doc only Author: Nick Pentreath Closes #20421 from MLnick/SPARK-23112-ml-guide. (cherry picked from commit 161a3f2ae324271a601500e3d2900db9359ee2ef) Signed-off-by: Nick Pentreath --- docs/ml-guide.md | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/docs/ml-guide.md b/docs/ml-guide.md index b957445579ffd..702bcf748fc74 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -108,7 +108,13 @@ and the migration guide below will explain all changes between releases. ### Breaking changes -There are no breaking changes. +* The class and trait hierarchy for logistic regression model summaries was changed to be cleaner +and better accommodate the addition of the multi-class summary. This is a breaking change for user +code that casts a `LogisticRegressionTrainingSummary` to a +` BinaryLogisticRegressionTrainingSummary`. Users should instead use the `model.binarySummary` +method. See [SPARK-17139](https://issues.apache.org/jira/browse/SPARK-17139) for more detail +(_note_ this is an `Experimental` API). This _does not_ affect the Python `summary` method, which +will still work correctly for both multinomial and binary cases. ### Deprecations and changes of behavior @@ -123,8 +129,19 @@ new [`OneHotEncoderEstimator`](ml-features.html#onehotencoderestimator) **Changes of behavior** * [SPARK-21027](https://issues.apache.org/jira/browse/SPARK-21027): - We are now setting the default parallelism used in `OneVsRest` to be 1 (i.e. serial). In 2.2 and + The default parallelism used in `OneVsRest` is now set to 1 (i.e. serial). In `2.2` and earlier versions, the level of parallelism was set to the default threadpool size in Scala. +* [SPARK-22156](https://issues.apache.org/jira/browse/SPARK-22156): + The learning rate update for `Word2Vec` was incorrect when `numIterations` was set greater than + `1`. This will cause training results to be different between `2.3` and earlier versions. +* [SPARK-21681](https://issues.apache.org/jira/browse/SPARK-21681): + Fixed an edge case bug in multinomial logistic regression that resulted in incorrect coefficients + when some features had zero variance. +* [SPARK-16957](https://issues.apache.org/jira/browse/SPARK-16957): + Tree algorithms now use mid-points for split values. This may change results from model training. 
+* [SPARK-14657](https://issues.apache.org/jira/browse/SPARK-14657): + Fixed an issue where the features generated by `RFormula` without an intercept were inconsistent + with the output in R. This may change results from model training in this scenario. ## Previous Spark versions From 33f17b28b3448a6f0389d2ee93bb5d49a02f288c Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 1 Feb 2018 00:24:42 +0800 Subject: [PATCH 224/566] revert [SPARK-22785][SQL] remove ColumnVector.anyNullsSet ## What changes were proposed in this pull request? In https://github.com/apache/spark/pull/19980 , we thought `anyNullsSet` can be simply implemented by `numNulls() > 0`. This is logically true, but may have performance problems. `OrcColumnVector` is an example. It doesn't have the `numNulls` property, only has a `noNulls` property. We will lose a lot of performance if we use `numNulls() > 0` to check null. This PR simply revert #19980, with a renaming to call it `hasNull`. Better name suggestions are welcome, e.g. `nullable`? ## How was this patch tested? existing test Author: Wenchen Fan Closes #20452 from cloud-fan/null. (cherry picked from commit 48dd6a4c79e33a8f2dba8349b58aa07e4796a925) Signed-off-by: Wenchen Fan --- .../execution/datasources/orc/OrcColumnVector.java | 5 +++++ .../execution/vectorized/OffHeapColumnVector.java | 2 +- .../sql/execution/vectorized/OnHeapColumnVector.java | 2 +- .../execution/vectorized/WritableColumnVector.java | 7 ++++++- .../spark/sql/vectorized/ArrowColumnVector.java | 5 +++++ .../apache/spark/sql/vectorized/ColumnVector.java | 5 +++++ .../vectorized/ArrowColumnVectorSuite.scala | 12 ++++++++++++ .../execution/vectorized/ColumnarBatchSuite.scala | 9 +++++++++ 8 files changed, 44 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java index 5078bc7922ee2..78203e3145c62 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java @@ -77,6 +77,11 @@ public void close() { } + @Override + public boolean hasNull() { + return !baseData.noNulls; + } + @Override public int numNulls() { if (baseData.isRepeating) { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index 1c45b846790b6..fa52e4a354786 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -123,7 +123,7 @@ public void putNulls(int rowId, int count) { @Override public void putNotNulls(int rowId, int count) { - if (numNulls == 0) return; + if (!hasNull()) return; long offset = nulls + rowId; for (int i = 0; i < count; ++i, ++offset) { Platform.putByte(null, offset, (byte) 0); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java index 1d538fe4181b7..cccef78aebdc8 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java @@ -119,7 +119,7 @@ public void 
putNulls(int rowId, int count) { @Override public void putNotNulls(int rowId, int count) { - if (numNulls == 0) return; + if (!hasNull()) return; for (int i = 0; i < count; ++i) { nulls[rowId + i] = (byte)0; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index a8ec8ef2aadf8..8ebc1adf59c8b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -59,8 +59,8 @@ public void reset() { elementsAppended = 0; if (numNulls > 0) { putNotNulls(0, capacity); + numNulls = 0; } - numNulls = 0; } @Override @@ -102,6 +102,11 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) { throw new RuntimeException(message, cause); } + @Override + public boolean hasNull() { + return numNulls > 0; + } + @Override public int numNulls() { return numNulls; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java index a75d76bd0f82e..5ff6474c161f3 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java @@ -37,6 +37,11 @@ public final class ArrowColumnVector extends ColumnVector { private final ArrowVectorAccessor accessor; private ArrowColumnVector[] childColumns; + @Override + public boolean hasNull() { + return accessor.getNullCount() > 0; + } + @Override public int numNulls() { return accessor.getNullCount(); diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java index 111f5d9b358d4..d588956208047 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java @@ -65,6 +65,11 @@ public abstract class ColumnVector implements AutoCloseable { @Override public abstract void close(); + /** + * Returns true if this column vector contains any null values. + */ + public abstract boolean hasNull(); + /** * Returns the number of nulls in this column vector. 
*/ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala index e794f50781ff2..b55489cb2678a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala @@ -42,6 +42,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === BooleanType) + assert(columnVector.hasNull) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -69,6 +70,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === ByteType) + assert(columnVector.hasNull) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -96,6 +98,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === ShortType) + assert(columnVector.hasNull) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -123,6 +126,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === IntegerType) + assert(columnVector.hasNull) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -150,6 +154,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === LongType) + assert(columnVector.hasNull) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -177,6 +182,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === FloatType) + assert(columnVector.hasNull) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -204,6 +210,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === DoubleType) + assert(columnVector.hasNull) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -232,6 +239,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === StringType) + assert(columnVector.hasNull) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -258,6 +266,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === BinaryType) + assert(columnVector.hasNull) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -300,6 +309,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === ArrayType(IntegerType)) + assert(columnVector.hasNull) assert(columnVector.numNulls === 1) val array0 = columnVector.getArray(0) @@ -344,6 +354,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === schema) + assert(!columnVector.hasNull) assert(columnVector.numNulls === 0) val row0 = columnVector.getStruct(0) @@ -396,6 +407,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === schema) + assert(columnVector.hasNull) 
assert(columnVector.numNulls === 1) val row0 = columnVector.getStruct(0) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 925c101fe1fee..168bc5e3e480b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -66,22 +66,27 @@ class ColumnarBatchSuite extends SparkFunSuite { column => val reference = mutable.ArrayBuffer.empty[Boolean] var idx = 0 + assert(!column.hasNull) assert(column.numNulls() == 0) column.appendNotNull() reference += false + assert(!column.hasNull) assert(column.numNulls() == 0) column.appendNotNulls(3) (1 to 3).foreach(_ => reference += false) + assert(!column.hasNull) assert(column.numNulls() == 0) column.appendNull() reference += true + assert(column.hasNull) assert(column.numNulls() == 1) column.appendNulls(3) (1 to 3).foreach(_ => reference += true) + assert(column.hasNull) assert(column.numNulls() == 4) idx = column.elementsAppended @@ -89,11 +94,13 @@ class ColumnarBatchSuite extends SparkFunSuite { column.putNotNull(idx) reference += false idx += 1 + assert(column.hasNull) assert(column.numNulls() == 4) column.putNull(idx) reference += true idx += 1 + assert(column.hasNull) assert(column.numNulls() == 5) column.putNulls(idx, 3) @@ -101,6 +108,7 @@ class ColumnarBatchSuite extends SparkFunSuite { reference += true reference += true idx += 3 + assert(column.hasNull) assert(column.numNulls() == 8) column.putNotNulls(idx, 4) @@ -109,6 +117,7 @@ class ColumnarBatchSuite extends SparkFunSuite { reference += false reference += false idx += 4 + assert(column.hasNull) assert(column.numNulls() == 8) reference.zipWithIndex.foreach { v => From f5f21e8c4261c0dfe8e3e788a30b38b188a18f67 Mon Sep 17 00:00:00 2001 From: Glen Takahashi Date: Thu, 1 Feb 2018 01:14:01 +0800 Subject: [PATCH 225/566] [SPARK-23249][SQL] Improved block merging logic for partitions ## What changes were proposed in this pull request? Change DataSourceScanExec so that when grouping blocks together into partitions, also checks the end of the sorted list of splits to more efficiently fill out partitions. ## How was this patch tested? Updated old test to reflect the new logic, which causes the # of partitions to drop from 4 -> 3 Also, a current test exists to test large non-splittable files at https://github.com/glentakahashi/spark/blob/c575977a5952bf50b605be8079c9be1e30f3bd36/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala#L346 ## Rationale The current bin-packing method of next-fit descending for blocks into partitions is sub-optimal in a lot of cases and will result in extra partitions, un-even distribution of block-counts across partitions, and un-even distribution of partition sizes. As an example, 128 files ranging from 1MB, 2MB,...127MB,128MB. will result in 82 partitions with the current algorithm, but only 64 using this algorithm. Also in this example, the max # of blocks per partition in NFD is 13, while in this algorithm is is 2. More generally, running a simulation of 1000 runs using 128MB blocksize, between 1-1000 normally distributed file sizes between 1-500Mb, you can see an improvement of approx 5% reduction of partition counts, and a large reduction in standard deviation of blocks per partition. 
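To make the comparison above concrete, here is a small self-contained sketch (not Spark code; the object and method names are illustrative, and the per-file open cost is set to zero for simplicity, so the exact counts may differ from the figures quoted above) that runs both packing strategies over plain file sizes.

```scala
// Sketch only: compares "Next Fit Decreasing" with the front/back packing scheme.
object BlockMergingSketch {

  // Next Fit Decreasing: walk the size-sorted list and close the current partition
  // whenever the next file would overflow it.
  def nextFitDecreasing(sizes: Seq[Long], maxSplitBytes: Long, openCost: Long): Int = {
    var partitions = 0
    var current = 0L
    var hasFiles = false
    sizes.sortBy(-_).foreach { size =>
      if (hasFiles && current + size > maxSplitBytes) {
        partitions += 1
        current = 0L
        hasFiles = false
      }
      current += size + openCost
      hasFiles = true
    }
    if (hasFiles) partitions += 1
    partitions
  }

  // New scheme: place the next-largest file, then top the partition up with files from
  // both ends of the size-sorted list while they still fit.
  def frontBackPacking(sizes: Seq[Long], maxSplitBytes: Long, openCost: Long): Int = {
    val sorted = sizes.sortBy(-_).toArray
    var partitions = 0
    var front = 0
    var back = sorted.length - 1
    while (front <= back) {
      var current = sorted(front) + openCost
      front += 1
      while (front <= back && current + sorted(front) <= maxSplitBytes) {
        current += sorted(front) + openCost
        front += 1
      }
      while (back > front && current + sorted(back) <= maxSplitBytes) {
        current += sorted(back) + openCost
        back -= 1
      }
      partitions += 1
    }
    partitions
  }

  def main(args: Array[String]): Unit = {
    val mb = 1024L * 1024L
    val sizes = (1 to 128).map(_ * mb)  // the 1MB..128MB example from the description
    println(s"NFD partitions:        ${nextFitDecreasing(sizes, 128 * mb, 0L)}")
    println(s"Front/back partitions: ${frontBackPacking(sizes, 128 * mb, 0L)}")
  }
}
```

Plugging in other size distributions (or a non-zero open cost) shows the same pattern: the front/back scheme needs noticeably fewer partitions on skewed inputs.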
This algorithm also runs in O(n) time as NFD does, and in every case is strictly better results than NFD. Overall, the more even distribution of blocks across partitions and therefore reduced partition counts should result in a small but significant performance increase across the board Author: Glen Takahashi Closes #20372 from glentakahashi/feature/improved-block-merging. (cherry picked from commit 8c21170decfb9ca4d3233e1ea13bd1b6e3199ed9) Signed-off-by: Wenchen Fan --- .../sql/execution/DataSourceScanExec.scala | 29 ++++++++++++++----- .../datasources/FileSourceStrategySuite.scala | 15 ++++------ 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index aa66ee7e948ea..f7732e2098c29 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -445,16 +445,29 @@ case class FileSourceScanExec( currentSize = 0 } - // Assign files to partitions using "Next Fit Decreasing" - splitFiles.foreach { file => - if (currentSize + file.length > maxSplitBytes) { - closePartition() + def addFile(file: PartitionedFile): Unit = { + currentFiles += file + currentSize += file.length + openCostInBytes + } + + var frontIndex = 0 + var backIndex = splitFiles.length - 1 + + while (frontIndex <= backIndex) { + addFile(splitFiles(frontIndex)) + frontIndex += 1 + while (frontIndex <= backIndex && + currentSize + splitFiles(frontIndex).length <= maxSplitBytes) { + addFile(splitFiles(frontIndex)) + frontIndex += 1 + } + while (backIndex > frontIndex && + currentSize + splitFiles(backIndex).length <= maxSplitBytes) { + addFile(splitFiles(backIndex)) + backIndex -= 1 } - // Add the given file to the current partition. 
- currentSize += file.length + openCostInBytes - currentFiles += file + closePartition() } - closePartition() new FileScanRDD(fsRelation.sparkSession, readFile, partitions) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala index c1d61b843d899..bfccc9335b361 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala @@ -141,16 +141,17 @@ class FileSourceStrategySuite extends QueryTest with SharedSQLContext with Predi withSQLConf(SQLConf.FILES_MAX_PARTITION_BYTES.key -> "4", SQLConf.FILES_OPEN_COST_IN_BYTES.key -> "1") { checkScan(table.select('c1)) { partitions => - // Files should be laid out [(file1), (file2, file3), (file4, file5), (file6)] - assert(partitions.size == 4, "when checking partitions") - assert(partitions(0).files.size == 1, "when checking partition 1") + // Files should be laid out [(file1, file6), (file2, file3), (file4, file5)] + assert(partitions.size == 3, "when checking partitions") + assert(partitions(0).files.size == 2, "when checking partition 1") assert(partitions(1).files.size == 2, "when checking partition 2") assert(partitions(2).files.size == 2, "when checking partition 3") - assert(partitions(3).files.size == 1, "when checking partition 4") - // First partition reads (file1) + // First partition reads (file1, file6) assert(partitions(0).files(0).start == 0) assert(partitions(0).files(0).length == 2) + assert(partitions(0).files(1).start == 0) + assert(partitions(0).files(1).length == 1) // Second partition reads (file2, file3) assert(partitions(1).files(0).start == 0) @@ -163,10 +164,6 @@ class FileSourceStrategySuite extends QueryTest with SharedSQLContext with Predi assert(partitions(2).files(0).length == 1) assert(partitions(2).files(1).start == 0) assert(partitions(2).files(1).length == 1) - - // Final partition reads (file6) - assert(partitions(3).files(0).start == 0) - assert(partitions(3).files(0).length == 1) } checkPartitionSchema(StructType(Nil)) From 8ee3a71c9c1b8ed51c5916635d008fdd49cf891a Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Wed, 31 Jan 2018 13:52:47 -0800 Subject: [PATCH 226/566] [SPARK-23281][SQL] Query produces results in incorrect order when a composite order by clause refers to both original columns and aliases ## What changes were proposed in this pull request? Here is the test snippet. 
``` SQL scala> Seq[(Integer, Integer)]( | (1, 1), | (1, 3), | (2, 3), | (3, 3), | (4, null), | (5, null) | ).toDF("key", "value").createOrReplaceTempView("src") scala> sql( | """ | |SELECT MAX(value) as value, key as col2 | |FROM src | |GROUP BY key | |ORDER BY value desc, key | """.stripMargin).show +-----+----+ |value|col2| +-----+----+ | 3| 3| | 3| 2| | 3| 1| | null| 5| | null| 4| +-----+----+ ```SQL Here is the explain output : ```SQL == Parsed Logical Plan == 'Sort ['value DESC NULLS LAST, 'key ASC NULLS FIRST], true +- 'Aggregate ['key], ['MAX('value) AS value#9, 'key AS col2#10] +- 'UnresolvedRelation `src` == Analyzed Logical Plan == value: int, col2: int Project [value#9, col2#10] +- Sort [value#9 DESC NULLS LAST, col2#10 DESC NULLS LAST], true +- Aggregate [key#5], [max(value#6) AS value#9, key#5 AS col2#10] +- SubqueryAlias src +- Project [_1#2 AS key#5, _2#3 AS value#6] +- LocalRelation [_1#2, _2#3] ``` SQL The sort direction is being wrongly changed from ASC to DSC while resolving ```Sort``` in resolveAggregateFunctions. The above testcase models TPCDS-Q71 and thus we have the same issue in Q71 as well. ## How was this patch tested? A few tests are added in SQLQuerySuite. Author: Dilip Biswal Closes #20453 from dilipbiswal/local_spark. --- .../sql/catalyst/analysis/Analyzer.scala | 2 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 41 ++++++++++++++++++- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 91cb0365a0856..251099f750cf6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1493,7 +1493,7 @@ class Analyzer( // to push down this ordering expression and can reference the original aggregate // expression instead. 
val needsPushDown = ArrayBuffer.empty[NamedExpression] - val evaluatedOrderings = resolvedAliasedOrdering.zip(sortOrder).map { + val evaluatedOrderings = resolvedAliasedOrdering.zip(unresolvedSortOrders).map { case (evaluated, order) => val index = originalAggExprs.indexWhere { case Alias(child, _) => child semanticEquals evaluated.child diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index a79ab47f0197e..97d03b2cd496e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql import java.io.File -import java.math.MathContext import java.net.{MalformedURLException, URL} import java.sql.Timestamp import java.util.concurrent.atomic.AtomicBoolean @@ -1616,6 +1615,46 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } + test("SPARK-23281: verify the correctness of sort direction on composite order by clause") { + withTempView("src") { + Seq[(Integer, Integer)]( + (1, 1), + (1, 3), + (2, 3), + (3, 3), + (4, null), + (5, null) + ).toDF("key", "value").createOrReplaceTempView("src") + + checkAnswer(sql( + """ + |SELECT MAX(value) as value, key as col2 + |FROM src + |GROUP BY key + |ORDER BY value desc, key + """.stripMargin), + Seq(Row(3, 1), Row(3, 2), Row(3, 3), Row(null, 4), Row(null, 5))) + + checkAnswer(sql( + """ + |SELECT MAX(value) as value, key as col2 + |FROM src + |GROUP BY key + |ORDER BY value desc, key desc + """.stripMargin), + Seq(Row(3, 3), Row(3, 2), Row(3, 1), Row(null, 5), Row(null, 4))) + + checkAnswer(sql( + """ + |SELECT MAX(value) as value, key as col2 + |FROM src + |GROUP BY key + |ORDER BY value asc, key desc + """.stripMargin), + Seq(Row(null, 5), Row(null, 4), Row(3, 3), Row(3, 2), Row(3, 1))) + } + } + test("run sql directly on files") { val df = spark.range(100).toDF() withTempPath(f => { From 7ccfc753086c3859abe358c87f2e7b7a30422d5e Mon Sep 17 00:00:00 2001 From: Henry Robinson Date: Thu, 1 Feb 2018 11:15:17 +0900 Subject: [PATCH 227/566] [SPARK-23157][SQL][FOLLOW-UP] DataFrame -> SparkDataFrame in R comment Author: Henry Robinson Closes #20443 from henryr/SPARK-23157. (cherry picked from commit f470df2fcf14e6234c577dc1bdfac27d49b441f5) Signed-off-by: hyukjinkwon --- R/pkg/R/DataFrame.R | 4 ++-- python/pyspark/sql/dataframe.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 547b5ea48a555..41c3c3a89fa72 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2090,8 +2090,8 @@ setMethod("selectExpr", #' #' @param x a SparkDataFrame. #' @param colName a column name. -#' @param col a Column expression (which must refer only to this DataFrame), or an atomic vector in -#' the length of 1 as literal value. +#' @param col a Column expression (which must refer only to this SparkDataFrame), or an atomic +#' vector in the length of 1 as literal value. #' @return A SparkDataFrame with the new column added or the existing column replaced. 
#' @family SparkDataFrame functions #' @aliases withColumn,SparkDataFrame,character-method diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 055b2c4a0ffec..1496cba91b90e 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1829,7 +1829,7 @@ def withColumn(self, colName, col): Returns a new :class:`DataFrame` by adding a column or replacing the existing column that has the same name. - The column expression must be an expression over this dataframe; attempting to add + The column expression must be an expression over this DataFrame; attempting to add a column from some other dataframe will raise an error. :param colName: string, name of the new column. From 0d0f5793686b98305f98a3d9e494bbfcee9cff13 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 1 Feb 2018 11:56:06 +0800 Subject: [PATCH 228/566] [SPARK-23280][SQL] add map type support to ColumnVector ## What changes were proposed in this pull request? Fill the last missing piece of `ColumnVector`: the map type support. The idea is similar to the array type support. A map is basically 2 arrays: keys and values. We ask the implementations to provide a key array, a value array, and an offset and length to specify the range of this map in the key/value array. In `WritableColumnVector`, we put the key array in first child vector, and value array in second child vector, and offsets and lengths in the current vector, which is very similar to how array type is implemented here. ## How was this patch tested? a new test Author: Wenchen Fan Closes #20450 from cloud-fan/map. (cherry picked from commit 52e00f70663a87b5837235bdf72a3e6f84e11411) Signed-off-by: Wenchen Fan --- .../datasources/orc/OrcColumnVector.java | 6 ++ .../vectorized/ColumnVectorUtils.java | 15 ++++ .../vectorized/OffHeapColumnVector.java | 4 +- .../vectorized/OnHeapColumnVector.java | 4 +- .../vectorized/WritableColumnVector.java | 13 ++++ .../sql/vectorized/ArrowColumnVector.java | 5 ++ .../spark/sql/vectorized/ColumnVector.java | 14 +++- .../spark/sql/vectorized/ColumnarArray.java | 4 +- .../spark/sql/vectorized/ColumnarMap.java | 53 ++++++++++++++ .../spark/sql/vectorized/ColumnarRow.java | 5 +- .../vectorized/ColumnarBatchSuite.scala | 70 ++++++++++++++----- 11 files changed, 166 insertions(+), 27 deletions(-) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarMap.java diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java index 78203e3145c62..c8add4c9f486c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java @@ -25,6 +25,7 @@ import org.apache.spark.sql.types.Decimal; import org.apache.spark.sql.types.TimestampType; import org.apache.spark.sql.vectorized.ColumnarArray; +import org.apache.spark.sql.vectorized.ColumnarMap; import org.apache.spark.unsafe.types.UTF8String; /** @@ -177,6 +178,11 @@ public ColumnarArray getArray(int rowId) { throw new UnsupportedOperationException(); } + @Override + public ColumnarMap getMap(int rowId) { + throw new UnsupportedOperationException(); + } + @Override public org.apache.spark.sql.vectorized.ColumnVector getChild(int ordinal) { throw new UnsupportedOperationException(); diff --git 
a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java index a2853bbadc92b..829f3ce750fe6 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java @@ -20,8 +20,10 @@ import java.math.BigInteger; import java.nio.charset.StandardCharsets; import java.sql.Date; +import java.util.HashMap; import java.util.Iterator; import java.util.List; +import java.util.Map; import org.apache.spark.memory.MemoryMode; import org.apache.spark.sql.Row; @@ -30,6 +32,7 @@ import org.apache.spark.sql.types.*; import org.apache.spark.sql.vectorized.ColumnarArray; import org.apache.spark.sql.vectorized.ColumnarBatch; +import org.apache.spark.sql.vectorized.ColumnarMap; import org.apache.spark.unsafe.types.CalendarInterval; import org.apache.spark.unsafe.types.UTF8String; @@ -109,6 +112,18 @@ public static int[] toJavaIntArray(ColumnarArray array) { return array.toIntArray(); } + public static Map toJavaIntMap(ColumnarMap map) { + int[] keys = toJavaIntArray(map.keyArray()); + int[] values = toJavaIntArray(map.valueArray()); + assert keys.length == values.length; + + Map result = new HashMap<>(); + for (int i = 0; i < keys.length; i++) { + result.put(keys[i], values[i]); + } + return result; + } + private static void appendValue(WritableColumnVector dst, DataType t, Object o) { if (o == null) { if (t instanceof CalendarIntervalType) { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index fa52e4a354786..754c26579ff08 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -60,7 +60,7 @@ public static OffHeapColumnVector[] allocateColumns(int capacity, StructField[] private long nulls; private long data; - // Set iff the type is array. + // Only set if type is Array or Map. private long lengthData; private long offsetData; @@ -530,7 +530,7 @@ public int putByteArray(int rowId, byte[] value, int offset, int length) { @Override protected void reserveInternal(int newCapacity) { int oldCapacity = (nulls == 0L) ? 0 : capacity; - if (isArray()) { + if (isArray() || type instanceof MapType) { this.lengthData = Platform.reallocateMemory(lengthData, oldCapacity * 4, newCapacity * 4); this.offsetData = diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java index cccef78aebdc8..23dcc104e67c4 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java @@ -69,7 +69,7 @@ public static OnHeapColumnVector[] allocateColumns(int capacity, StructField[] f private float[] floatData; private double[] doubleData; - // Only set if type is Array. + // Only set if type is Array or Map. private int[] arrayLengths; private int[] arrayOffsets; @@ -503,7 +503,7 @@ public int putByteArray(int rowId, byte[] value, int offset, int length) { // Spilt this function out since it is the slow path. 
@Override protected void reserveInternal(int newCapacity) { - if (isArray()) { + if (isArray() || type instanceof MapType) { int[] newLengths = new int[newCapacity]; int[] newOffsets = new int[newCapacity]; if (this.arrayLengths != null) { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index 8ebc1adf59c8b..c2e595455549c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -25,6 +25,7 @@ import org.apache.spark.sql.types.*; import org.apache.spark.sql.vectorized.ColumnVector; import org.apache.spark.sql.vectorized.ColumnarArray; +import org.apache.spark.sql.vectorized.ColumnarMap; import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.types.UTF8String; @@ -612,6 +613,13 @@ public final ColumnarArray getArray(int rowId) { return new ColumnarArray(arrayData(), getArrayOffset(rowId), getArrayLength(rowId)); } + // `WritableColumnVector` puts the key array in the first child column vector, value array in the + // second child column vector, and puts the offsets and lengths in the current column vector. + @Override + public final ColumnarMap getMap(int rowId) { + return new ColumnarMap(getChild(0), getChild(1), getArrayOffset(rowId), getArrayLength(rowId)); + } + public WritableColumnVector arrayData() { return childColumns[0]; } @@ -705,6 +713,11 @@ protected WritableColumnVector(int capacity, DataType type) { for (int i = 0; i < childColumns.length; ++i) { this.childColumns[i] = reserveNewColumn(capacity, st.fields()[i].dataType()); } + } else if (type instanceof MapType) { + MapType mapType = (MapType) type; + this.childColumns = new WritableColumnVector[2]; + this.childColumns[0] = reserveNewColumn(capacity, mapType.keyType()); + this.childColumns[1] = reserveNewColumn(capacity, mapType.valueType()); } else if (type instanceof CalendarIntervalType) { // Two columns. Months as int. Microseconds as Long. this.childColumns = new WritableColumnVector[2]; diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java index 5ff6474c161f3..f3ece538c3b80 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java @@ -119,6 +119,11 @@ public ColumnarArray getArray(int rowId) { return accessor.getArray(rowId); } + @Override + public ColumnarMap getMap(int rowId) { + throw new UnsupportedOperationException(); + } + @Override public ArrowColumnVector getChild(int ordinal) { return childColumns[ordinal]; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java index d588956208047..05271ec1f46ab 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java @@ -220,10 +220,18 @@ public final ColumnarRow getStruct(int rowId) { /** * Returns the map type value for rowId. + * + * In Spark, map type value is basically a key data array and a value data array. 
A key from the + * key array with a index and a value from the value array with the same index contribute to + * an entry of this map type value. + * + * To support map type, implementations must construct an {@link ColumnarMap} and return it in + * this method. {@link ColumnarMap} requires a {@link ColumnVector} that stores the data of all + * the keys of all the maps in this vector, and another {@link ColumnVector} that stores the data + * of all the values of all the maps in this vector, and a pair of offset and length which + * specify the range of the key/value array that belongs to the map type value at rowId. */ - public MapData getMap(int ordinal) { - throw new UnsupportedOperationException(); - } + public abstract ColumnarMap getMap(int ordinal); /** * Returns the decimal type value for rowId. diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java index 72c07ee7cad3f..7c7a1c806a2b7 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java @@ -149,8 +149,8 @@ public ColumnarArray getArray(int ordinal) { } @Override - public MapData getMap(int ordinal) { - throw new UnsupportedOperationException(); + public ColumnarMap getMap(int ordinal) { + return data.getMap(offset + ordinal); } @Override diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarMap.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarMap.java new file mode 100644 index 0000000000000..35648e386c4f1 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarMap.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.vectorized; + +import org.apache.spark.sql.catalyst.util.MapData; + +/** + * Map abstraction in {@link ColumnVector}. 
+ */ +public final class ColumnarMap extends MapData { + private final ColumnarArray keys; + private final ColumnarArray values; + private final int length; + + public ColumnarMap(ColumnVector keys, ColumnVector values, int offset, int length) { + this.length = length; + this.keys = new ColumnarArray(keys, offset, length); + this.values = new ColumnarArray(values, offset, length); + } + + @Override + public int numElements() { return length; } + + @Override + public ColumnarArray keyArray() { + return keys; + } + + @Override + public ColumnarArray valueArray() { + return values; + } + + @Override + public ColumnarMap copy() { + throw new UnsupportedOperationException(); + } +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java index 6ca749d7c6e85..0c9e92ed11fbd 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java @@ -155,8 +155,9 @@ public ColumnarArray getArray(int ordinal) { } @Override - public MapData getMap(int ordinal) { - throw new UnsupportedOperationException(); + public ColumnarMap getMap(int ordinal) { + if (data.getChild(ordinal).isNullAt(rowId)) return null; + return data.getChild(ordinal).getMap(rowId); } @Override diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 168bc5e3e480b..8fe2985836f2e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -673,35 +673,37 @@ class ColumnarBatchSuite extends SparkFunSuite { i += 1 } - // Populate it with arrays [0], [1, 2], [], [3, 4, 5] + // Populate it with arrays [0], [1, 2], null, [], [3, 4, 5] column.putArray(0, 0, 1) column.putArray(1, 1, 2) - column.putArray(2, 2, 0) - column.putArray(3, 3, 3) + column.putNull(2) + column.putArray(3, 3, 0) + column.putArray(4, 3, 3) + + assert(column.getArray(0).numElements == 1) + assert(column.getArray(1).numElements == 2) + assert(column.isNullAt(2)) + assert(column.getArray(3).numElements == 0) + assert(column.getArray(4).numElements == 3) val a1 = ColumnVectorUtils.toJavaIntArray(column.getArray(0)) val a2 = ColumnVectorUtils.toJavaIntArray(column.getArray(1)) - val a3 = ColumnVectorUtils.toJavaIntArray(column.getArray(2)) - val a4 = ColumnVectorUtils.toJavaIntArray(column.getArray(3)) + val a3 = ColumnVectorUtils.toJavaIntArray(column.getArray(3)) + val a4 = ColumnVectorUtils.toJavaIntArray(column.getArray(4)) assert(a1 === Array(0)) assert(a2 === Array(1, 2)) assert(a3 === Array.empty[Int]) assert(a4 === Array(3, 4, 5)) - // Verify the ArrayData APIs - assert(column.getArray(0).numElements() == 1) + // Verify the ArrayData get APIs assert(column.getArray(0).getInt(0) == 0) - assert(column.getArray(1).numElements() == 2) assert(column.getArray(1).getInt(0) == 1) assert(column.getArray(1).getInt(1) == 2) - assert(column.getArray(2).numElements() == 0) - - assert(column.getArray(3).numElements() == 3) - assert(column.getArray(3).getInt(0) == 3) - assert(column.getArray(3).getInt(1) == 4) - assert(column.getArray(3).getInt(2) == 5) + assert(column.getArray(4).getInt(0) == 3) + assert(column.getArray(4).getInt(1) == 4) + assert(column.getArray(4).getInt(2) == 5) // Add a longer array 
which requires resizing column.reset() @@ -711,8 +713,7 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(data.capacity == array.length * 2) data.putInts(0, array.length, array, 0) column.putArray(0, 0, array.length) - assert(ColumnVectorUtils.toJavaIntArray(column.getArray(0)) - === array) + assert(ColumnVectorUtils.toJavaIntArray(column.getArray(0)) === array) } test("toArray for primitive types") { @@ -770,6 +771,43 @@ class ColumnarBatchSuite extends SparkFunSuite { } } + test("Int Map") { + (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => + val column = allocate(10, new MapType(IntegerType, IntegerType, false), memMode) + (0 to 1).foreach { colIndex => + val data = column.getChild(colIndex) + (0 to 5).foreach {i => + data.putInt(i, i * (colIndex + 1)) + } + } + + // Populate it with maps [0->0], [1->2, 2->4], null, [], [3->6, 4->8, 5->10] + column.putArray(0, 0, 1) + column.putArray(1, 1, 2) + column.putNull(2) + column.putArray(3, 3, 0) + column.putArray(4, 3, 3) + + assert(column.getMap(0).numElements == 1) + assert(column.getMap(1).numElements == 2) + assert(column.isNullAt(2)) + assert(column.getMap(3).numElements == 0) + assert(column.getMap(4).numElements == 3) + + val a1 = ColumnVectorUtils.toJavaIntMap(column.getMap(0)) + val a2 = ColumnVectorUtils.toJavaIntMap(column.getMap(1)) + val a4 = ColumnVectorUtils.toJavaIntMap(column.getMap(3)) + val a5 = ColumnVectorUtils.toJavaIntMap(column.getMap(4)) + + assert(a1.asScala == Map(0 -> 0)) + assert(a2.asScala == Map(1 -> 2, 2 -> 4)) + assert(a4.asScala == Map()) + assert(a5.asScala == Map(3 -> 6, 4 -> 8, 5 -> 10)) + + column.close() + } + } + testVector( "Struct Column", 10, From 59e89a2990e4f66839c91f48f41157dac6e670ad Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Wed, 31 Jan 2018 20:33:51 -0800 Subject: [PATCH 229/566] [SPARK-23268][SQL] Reorganize packages in data source V2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? 1. create a new package for partitioning/distribution related classes. As Spark will add new concrete implementations of `Distribution` in new releases, it is good to have a new package for partitioning/distribution related classes. 2. move streaming related class to package `org.apache.spark.sql.sources.v2.reader/writer.streaming`, instead of `org.apache.spark.sql.sources.v2.streaming.reader/writer`. So that the there won't be package reader/writer inside package streaming, which is quite confusing. Before change: ``` v2 ├── reader ├── streaming │   ├── reader │   └── writer └── writer ``` After change: ``` v2 ├── reader │   └── streaming └── writer └── streaming ``` ## How was this patch tested? Unit test. Author: Wang Gengliang Closes #20435 from gengliangwang/new_pkg. 
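For third-party data sources, the visible effect of this reorganization is limited to import paths: the mix-in interfaces keep their names but move under `reader`, `reader.streaming`, `writer`, and `writer.streaming`. As a minimal sketch (not part of this patch; the `V2Capabilities` helper is illustrative only), client code that inspects a source under the new layout looks like this, mirroring how the streaming execution paths match on these traits:

```scala
import org.apache.spark.sql.sources.v2.DataSourceV2
import org.apache.spark.sql.sources.v2.reader.{ContinuousReadSupport, MicroBatchReadSupport}
import org.apache.spark.sql.sources.v2.writer.StreamWriteSupport

// Illustrative helper only: reports which streaming capability a v2 source advertises,
// using the relocated mix-in interfaces. A real source may implement several of these.
object V2Capabilities {
  def describe(ds: DataSourceV2): String = ds match {
    case _: ContinuousReadSupport => "continuous read"
    case _: MicroBatchReadSupport => "micro-batch read"
    case _: StreamWriteSupport    => "streaming write"
    case _                        => "none of the streaming interfaces"
  }
}
```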
(cherry picked from commit 56ae32657e9e5d1e30b62afe77d9e14eb07cf4fb) Signed-off-by: gatorsmile --- .../spark/sql/kafka010/KafkaContinuousReader.scala | 2 +- .../apache/spark/sql/kafka010/KafkaSourceOffset.scala | 2 +- .../spark/sql/kafka010/KafkaSourceProvider.scala | 5 +++-- .../apache/spark/sql/kafka010/KafkaStreamWriter.scala | 2 +- .../{streaming => reader}/ContinuousReadSupport.java | 4 ++-- .../{streaming => reader}/MicroBatchReadSupport.java | 4 ++-- .../sources/v2/reader/SupportsReportPartitioning.java | 1 + .../{ => partitioning}/ClusteredDistribution.java | 3 ++- .../v2/reader/{ => partitioning}/Distribution.java | 3 ++- .../v2/reader/{ => partitioning}/Partitioning.java | 4 +++- .../streaming}/ContinuousDataReader.java | 2 +- .../reader => reader/streaming}/ContinuousReader.java | 2 +- .../reader => reader/streaming}/MicroBatchReader.java | 2 +- .../{streaming/reader => reader/streaming}/Offset.java | 2 +- .../reader => reader/streaming}/PartitionOffset.java | 2 +- .../spark/sql/sources/v2/writer/DataSourceWriter.java | 2 +- .../v2/{streaming => writer}/StreamWriteSupport.java | 5 ++--- .../writer => writer/streaming}/StreamWriter.java | 2 +- .../datasources/v2/DataSourcePartitioning.scala | 2 +- .../datasources/v2/DataSourceV2ScanExec.scala | 2 +- .../execution/datasources/v2/WriteToDataSourceV2.scala | 2 +- .../sql/execution/streaming/MicroBatchExecution.scala | 6 +++--- .../sql/execution/streaming/RateSourceProvider.scala | 5 ++--- .../sql/execution/streaming/RateStreamOffset.scala | 2 +- .../sql/execution/streaming/StreamingRelation.scala | 2 +- .../apache/spark/sql/execution/streaming/console.scala | 4 ++-- .../continuous/ContinuousDataSourceRDDIter.scala | 10 +++------- .../streaming/continuous/ContinuousExecution.scala | 5 +++-- .../continuous/ContinuousRateStreamSource.scala | 2 +- .../streaming/continuous/EpochCoordinator.scala | 4 ++-- .../execution/streaming/sources/ConsoleWriter.scala | 2 +- .../execution/streaming/sources/MicroBatchWriter.scala | 2 +- .../streaming/sources/RateStreamSourceV2.scala | 3 +-- .../sql/execution/streaming/sources/memoryV2.scala | 3 +-- .../apache/spark/sql/streaming/DataStreamReader.scala | 2 +- .../apache/spark/sql/streaming/DataStreamWriter.scala | 2 +- .../spark/sql/streaming/StreamingQueryManager.scala | 2 +- .../sql/sources/v2/JavaPartitionAwareDataSource.java | 3 +++ .../sql/execution/streaming/RateSourceV2Suite.scala | 2 +- .../spark/sql/sources/v2/DataSourceV2Suite.scala | 1 + .../streaming/sources/StreamingDataSourceV2Suite.scala | 8 ++++---- 41 files changed, 64 insertions(+), 61 deletions(-) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{streaming => reader}/ContinuousReadSupport.java (94%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{streaming => reader}/MicroBatchReadSupport.java (95%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/{ => partitioning}/ClusteredDistribution.java (92%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/{ => partitioning}/Distribution.java (93%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/{ => partitioning}/Partitioning.java (90%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{streaming/reader => reader/streaming}/ContinuousDataReader.java (96%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{streaming/reader => reader/streaming}/ContinuousReader.java (98%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{streaming/reader => 
reader/streaming}/MicroBatchReader.java (98%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{streaming/reader => reader/streaming}/Offset.java (97%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{streaming/reader => reader/streaming}/PartitionOffset.java (95%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{streaming => writer}/StreamWriteSupport.java (93%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{streaming/writer => writer/streaming}/StreamWriter.java (98%) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala index 8c733426b256f..41c443bc12120 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRo import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.kafka010.KafkaSource.{INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE, INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE} import org.apache.spark.sql.sources.v2.reader._ -import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousDataReader, ContinuousReader, Offset, PartitionOffset} +import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousDataReader, ContinuousReader, Offset, PartitionOffset} import org.apache.spark.sql.types.StructType import org.apache.spark.unsafe.types.UTF8String diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala index c82154cfbad7f..8d41c0da2b133 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.kafka010 import org.apache.kafka.common.TopicPartition import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset} -import org.apache.spark.sql.sources.v2.streaming.reader.{Offset => OffsetV2, PartitionOffset} +import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2, PartitionOffset} /** * An [[Offset]] for the [[KafkaSource]]. 
This one tracks all partitions of subscribed topics and diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala index 85e96b6783327..694ca76e24964 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala @@ -31,8 +31,9 @@ import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SparkSessio import org.apache.spark.sql.execution.streaming.{Sink, Source} import org.apache.spark.sql.sources._ import org.apache.spark.sql.sources.v2.DataSourceOptions -import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, StreamWriteSupport} -import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter +import org.apache.spark.sql.sources.v2.reader.ContinuousReadSupport +import org.apache.spark.sql.sources.v2.writer.StreamWriteSupport +import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamWriter.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamWriter.scala index a24efdefa4464..9307bfc001c03 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamWriter.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamWriter.scala @@ -22,8 +22,8 @@ import scala.collection.JavaConverters._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.kafka010.KafkaWriter.validateQuery -import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter import org.apache.spark.sql.sources.v2.writer._ +import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.types.StructType /** diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousReadSupport.java similarity index 94% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousReadSupport.java index f79424e036a52..0c1d5d1a9577a 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousReadSupport.java @@ -15,14 +15,14 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources.v2.streaming; +package org.apache.spark.sql.sources.v2.reader; import java.util.Optional; import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.sources.v2.DataSourceV2; import org.apache.spark.sql.sources.v2.DataSourceOptions; -import org.apache.spark.sql.sources.v2.streaming.reader.ContinuousReader; +import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader; import org.apache.spark.sql.types.StructType; /** diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/MicroBatchReadSupport.java similarity index 95% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchReadSupport.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/MicroBatchReadSupport.java index 22660e42ad850..5e8f0c0dafdcf 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchReadSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/MicroBatchReadSupport.java @@ -15,14 +15,14 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2.streaming; +package org.apache.spark.sql.sources.v2.reader; import java.util.Optional; import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.sources.v2.DataSourceOptions; import org.apache.spark.sql.sources.v2.DataSourceV2; -import org.apache.spark.sql.sources.v2.streaming.reader.MicroBatchReader; +import org.apache.spark.sql.sources.v2.reader.streaming.MicroBatchReader; import org.apache.spark.sql.types.StructType; /** diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java index a2383a9d7d680..5405a916951b8 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java @@ -18,6 +18,7 @@ package org.apache.spark.sql.sources.v2.reader; import org.apache.spark.annotation.InterfaceStability; +import org.apache.spark.sql.sources.v2.reader.partitioning.Partitioning; /** * A mix in interface for {@link DataSourceReader}. Data source readers can implement this diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ClusteredDistribution.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/partitioning/ClusteredDistribution.java similarity index 92% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ClusteredDistribution.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/partitioning/ClusteredDistribution.java index 27905e325df87..2d0ee50212b56 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ClusteredDistribution.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/partitioning/ClusteredDistribution.java @@ -15,9 +15,10 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2.reader; +package org.apache.spark.sql.sources.v2.reader.partitioning; import org.apache.spark.annotation.InterfaceStability; +import org.apache.spark.sql.sources.v2.reader.DataReader; /** * A concrete implementation of {@link Distribution}. 
Represents a distribution where records that diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Distribution.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/partitioning/Distribution.java similarity index 93% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Distribution.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/partitioning/Distribution.java index b37562167d9ef..f6b111fdf220d 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Distribution.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/partitioning/Distribution.java @@ -15,9 +15,10 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2.reader; +package org.apache.spark.sql.sources.v2.reader.partitioning; import org.apache.spark.annotation.InterfaceStability; +import org.apache.spark.sql.sources.v2.reader.DataReader; /** * An interface to represent data distribution requirement, which specifies how the records should diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Partitioning.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/partitioning/Partitioning.java similarity index 90% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Partitioning.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/partitioning/Partitioning.java index 5e334d13a1215..309d9e5de0a0f 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Partitioning.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/partitioning/Partitioning.java @@ -15,9 +15,11 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2.reader; +package org.apache.spark.sql.sources.v2.reader.partitioning; import org.apache.spark.annotation.InterfaceStability; +import org.apache.spark.sql.sources.v2.reader.DataReaderFactory; +import org.apache.spark.sql.sources.v2.reader.SupportsReportPartitioning; /** * An interface to represent the output data partitioning for a data source, which is returned by diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousDataReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/ContinuousDataReader.java similarity index 96% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousDataReader.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/ContinuousDataReader.java index 3f13a4dbf5793..47d26440841fd 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousDataReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/ContinuousDataReader.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources.v2.streaming.reader; +package org.apache.spark.sql.sources.v2.reader.streaming; import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.sources.v2.reader.DataReader; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/ContinuousReader.java similarity index 98% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/ContinuousReader.java index 6e5177ee83a62..d1d1e7ffd1dd4 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/ContinuousReader.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2.streaming.reader; +package org.apache.spark.sql.sources.v2.reader.streaming; import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.execution.streaming.BaseStreamingSource; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/MicroBatchReader.java similarity index 98% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/MicroBatchReader.java index fcec446d892f5..67ebde30d61a9 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/MicroBatchReader.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2.streaming.reader; +package org.apache.spark.sql.sources.v2.reader.streaming; import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.sources.v2.reader.DataSourceReader; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/Offset.java similarity index 97% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/Offset.java index abba3e7188b13..e41c0351edc82 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/Offset.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources.v2.streaming.reader; +package org.apache.spark.sql.sources.v2.reader.streaming; import org.apache.spark.annotation.InterfaceStability; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/PartitionOffset.java similarity index 95% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/PartitionOffset.java index 4688b85f49f5f..383e73db6762b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/PartitionOffset.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2.streaming.reader; +package org.apache.spark.sql.sources.v2.reader.streaming; import java.io.Serializable; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceWriter.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceWriter.java index d89d27d0e5b1b..7096aec0d22c2 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceWriter.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceWriter.java @@ -28,7 +28,7 @@ /** * A data source writer that is returned by * {@link WriteSupport#createWriter(String, StructType, SaveMode, DataSourceOptions)}/ - * {@link org.apache.spark.sql.sources.v2.streaming.StreamWriteSupport#createStreamWriter( + * {@link StreamWriteSupport#createStreamWriter( * String, StructType, OutputMode, DataSourceOptions)}. * It can mix in various writing optimization interfaces to speed up the data saving. The actual * writing logic is delegated to {@link DataWriter}. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/StreamWriteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/StreamWriteSupport.java similarity index 93% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/StreamWriteSupport.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/StreamWriteSupport.java index 7c5f304425093..1c0e2e12f8d51 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/StreamWriteSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/StreamWriteSupport.java @@ -15,14 +15,13 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources.v2.streaming; +package org.apache.spark.sql.sources.v2.writer; import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.execution.streaming.BaseStreamingSink; import org.apache.spark.sql.sources.v2.DataSourceOptions; import org.apache.spark.sql.sources.v2.DataSourceV2; -import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter; -import org.apache.spark.sql.sources.v2.writer.DataSourceWriter; +import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter; import org.apache.spark.sql.streaming.OutputMode; import org.apache.spark.sql.types.StructType; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/writer/StreamWriter.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/streaming/StreamWriter.java similarity index 98% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/writer/StreamWriter.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/streaming/StreamWriter.java index 915ee6c4fb390..4913341bd505d 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/writer/StreamWriter.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/streaming/StreamWriter.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2.streaming.writer; +package org.apache.spark.sql.sources.v2.writer.streaming; import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.sources.v2.writer.DataSourceWriter; diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourcePartitioning.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourcePartitioning.scala index 943d0100aca56..017a6737161a6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourcePartitioning.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourcePartitioning.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression} import org.apache.spark.sql.catalyst.plans.physical -import org.apache.spark.sql.sources.v2.reader.{ClusteredDistribution, Partitioning} +import org.apache.spark.sql.sources.v2.reader.partitioning.{ClusteredDistribution, Partitioning} /** * An adapter from public data source partitioning to catalyst internal `Partitioning`. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala index ee085820b0775..df469af2c262a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.plans.physical import org.apache.spark.sql.execution.{ColumnarBatchScan, LeafExecNode, WholeStageCodegenExec} import org.apache.spark.sql.execution.streaming.continuous._ import org.apache.spark.sql.sources.v2.reader._ -import org.apache.spark.sql.sources.v2.streaming.reader.ContinuousReader +import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader import org.apache.spark.sql.types.StructType /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala index c544adbf32cdf..6592bd72fa338 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala @@ -27,8 +27,8 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.continuous.{CommitPartitionEpoch, ContinuousExecution, EpochCoordinatorRef, SetWriterPartitions} -import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter import org.apache.spark.sql.sources.v2.writer._ +import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index 93572f7a63132..d9aa8573ba930 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -30,9 +30,9 @@ import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.datasources.v2.{StreamingDataSourceV2Relation, WriteToDataSourceV2} import org.apache.spark.sql.execution.streaming.sources.{InternalRowMicroBatchWriter, MicroBatchWriter} import org.apache.spark.sql.sources.v2.DataSourceOptions -import org.apache.spark.sql.sources.v2.streaming.{MicroBatchReadSupport, StreamWriteSupport} -import org.apache.spark.sql.sources.v2.streaming.reader.{MicroBatchReader, Offset => OffsetV2} -import org.apache.spark.sql.sources.v2.writer.SupportsWriteInternalRow +import org.apache.spark.sql.sources.v2.reader.MicroBatchReadSupport +import org.apache.spark.sql.sources.v2.reader.streaming.{MicroBatchReader, Offset => OffsetV2} +import org.apache.spark.sql.sources.v2.writer.{StreamWriteSupport, SupportsWriteInternalRow} import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime, Trigger} import org.apache.spark.util.{Clock, Utils} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala index 5e3fee633f591..ce5e63f5bde85 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala @@ -30,11 +30,10 @@ import org.apache.spark.sql.{AnalysisException, DataFrame, SQLContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} import org.apache.spark.sql.execution.streaming.continuous.RateStreamContinuousReader -import org.apache.spark.sql.execution.streaming.sources.RateStreamMicroBatchReader import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} import org.apache.spark.sql.sources.v2._ -import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, MicroBatchReadSupport} -import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, MicroBatchReader} +import org.apache.spark.sql.sources.v2.reader.ContinuousReadSupport +import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader import org.apache.spark.sql.types._ import org.apache.spark.util.{ManualClock, SystemClock} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala index 261d69bbd9843..02fed50485b94 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala @@ -23,7 +23,7 @@ import org.json4s.jackson.Serialization import org.apache.spark.sql.sources.v2 case class RateStreamOffset(partitionToValueAndRunTimeMs: Map[Int, ValueRunTimeMsPair]) - extends v2.streaming.reader.Offset { + extends v2.reader.streaming.Offset { implicit val defaultFormats: DefaultFormats = DefaultFormats override val json = Serialization.write(partitionToValueAndRunTimeMs) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala index a0ee683a895d8..845c8d2c14e43 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.plans.logical.Statistics import org.apache.spark.sql.execution.LeafExecNode import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.sources.v2.DataSourceV2 -import org.apache.spark.sql.sources.v2.streaming.ContinuousReadSupport +import org.apache.spark.sql.sources.v2.reader.ContinuousReadSupport object StreamingRelation { def apply(dataSource: DataSource): StreamingRelation = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala index 3f5bb489d6528..db600866067bc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala @@ -21,8 +21,8 @@ import org.apache.spark.sql._ import org.apache.spark.sql.execution.streaming.sources.ConsoleWriter import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, 
DataSourceRegister} import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2} -import org.apache.spark.sql.sources.v2.streaming.StreamWriteSupport -import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter +import org.apache.spark.sql.sources.v2.writer.StreamWriteSupport +import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala index 8a7a38b22caca..cf02c0dda25d7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala @@ -18,23 +18,19 @@ package org.apache.spark.sql.execution.streaming.continuous import java.util.concurrent.{ArrayBlockingQueue, BlockingQueue, TimeUnit} -import java.util.concurrent.atomic.{AtomicBoolean, AtomicLong} +import java.util.concurrent.atomic.AtomicBoolean import scala.collection.JavaConverters._ import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD -import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.execution.datasources.v2.{DataSourceRDDPartition, RowToUnsafeDataReader} -import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.execution.streaming.continuous._ import org.apache.spark.sql.sources.v2.reader._ -import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousDataReader, PartitionOffset} -import org.apache.spark.sql.streaming.ProcessingTime -import org.apache.spark.util.{SystemClock, ThreadUtils} +import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousDataReader, PartitionOffset} +import org.apache.spark.util.ThreadUtils class ContinuousDataSourceRDD( sc: SparkContext, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index 9402d7c1dcefd..08c81419a9d34 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -32,8 +32,9 @@ import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, StreamingDataSourceV2Relation, WriteToDataSourceV2} import org.apache.spark.sql.execution.streaming.{ContinuousExecutionRelation, StreamingRelationV2, _} import org.apache.spark.sql.sources.v2.DataSourceOptions -import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, StreamWriteSupport} -import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, PartitionOffset} +import org.apache.spark.sql.sources.v2.reader.ContinuousReadSupport +import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousReader, PartitionOffset} +import org.apache.spark.sql.sources.v2.writer.StreamWriteSupport import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime, Trigger} import org.apache.spark.sql.types.StructType 
import org.apache.spark.util.{Clock, Utils} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala index ff028ebc4236a..0eaaa4889ba9e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.execution.streaming.{RateSourceProvider, RateStreamO import org.apache.spark.sql.execution.streaming.sources.RateStreamSourceV2 import org.apache.spark.sql.sources.v2.DataSourceOptions import org.apache.spark.sql.sources.v2.reader._ -import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousDataReader, ContinuousReader, Offset, PartitionOffset} +import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousDataReader, ContinuousReader, Offset, PartitionOffset} import org.apache.spark.sql.types.StructType case class RateStreamPartitionOffset( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala index 84d262116cb46..cc6808065c0cd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala @@ -23,9 +23,9 @@ import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, PartitionOffset} -import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter +import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousReader, PartitionOffset} import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage +import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.util.RpcUtils private[continuous] sealed trait EpochCoordinatorMessage extends Serializable diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala index c57bdc4a28905..d276403190b3c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala @@ -22,8 +22,8 @@ import scala.collection.JavaConverters._ import org.apache.spark.internal.Logging import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.sources.v2.DataSourceOptions -import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter import org.apache.spark.sql.sources.v2.writer.{DataWriterFactory, WriterCommitMessage} +import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.types.StructType /** Common methods used to create writes for the the console sink */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/MicroBatchWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/MicroBatchWriter.scala index 
d7ce9a7b84479..56f7ff25cbed0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/MicroBatchWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/MicroBatchWriter.scala @@ -19,8 +19,8 @@ package org.apache.spark.sql.execution.streaming.sources import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriterFactory, SupportsWriteInternalRow, WriterCommitMessage} +import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter /** * A [[DataSourceWriter]] used to hook V2 stream writers into a microbatch plan. It implements diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala index 43949e6180aaa..1315885da8a6f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala @@ -31,8 +31,7 @@ import org.apache.spark.sql.execution.streaming.{RateStreamOffset, ValueRunTimeM import org.apache.spark.sql.sources.DataSourceRegister import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2} import org.apache.spark.sql.sources.v2.reader._ -import org.apache.spark.sql.sources.v2.streaming.MicroBatchReadSupport -import org.apache.spark.sql.sources.v2.streaming.reader.{MicroBatchReader, Offset} +import org.apache.spark.sql.sources.v2.reader.streaming.{MicroBatchReader, Offset} import org.apache.spark.sql.types.{LongType, StructField, StructType, TimestampType} import org.apache.spark.util.{ManualClock, SystemClock} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala index 58767261dc684..3411edbc53412 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala @@ -30,9 +30,8 @@ import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes.{Append, Complete, Update} import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2} -import org.apache.spark.sql.sources.v2.streaming.StreamWriteSupport -import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter import org.apache.spark.sql.sources.v2.writer._ +import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index f1b3f93c4e1fc..116ac3da07b75 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.{StreamingRelation, StreamingRelationV2} import 
org.apache.spark.sql.sources.StreamSourceProvider import org.apache.spark.sql.sources.v2.DataSourceOptions -import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, MicroBatchReadSupport} +import org.apache.spark.sql.sources.v2.reader.{ContinuousReadSupport, MicroBatchReadSupport} import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index 3b5b30d77945c..9aac360fd4bbc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger import org.apache.spark.sql.execution.streaming.sources.{MemoryPlanV2, MemorySinkV2} -import org.apache.spark.sql.sources.v2.streaming.StreamWriteSupport +import org.apache.spark.sql.sources.v2.writer.StreamWriteSupport /** * Interface used to write a streaming `Dataset` to external storage systems (e.g. file systems, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index fdd709cdb1f38..ddb1edc433d5a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, ContinuousTrigger} import org.apache.spark.sql.execution.streaming.state.StateStoreCoordinatorRef import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.streaming.StreamWriteSupport +import org.apache.spark.sql.sources.v2.writer.StreamWriteSupport import org.apache.spark.util.{Clock, SystemClock, Utils} /** diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaPartitionAwareDataSource.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaPartitionAwareDataSource.java index 99cca0f6dd626..32fad59b97ff6 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaPartitionAwareDataSource.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaPartitionAwareDataSource.java @@ -27,6 +27,9 @@ import org.apache.spark.sql.sources.v2.DataSourceV2; import org.apache.spark.sql.sources.v2.ReadSupport; import org.apache.spark.sql.sources.v2.reader.*; +import org.apache.spark.sql.sources.v2.reader.partitioning.ClusteredDistribution; +import org.apache.spark.sql.sources.v2.reader.partitioning.Distribution; +import org.apache.spark.sql.sources.v2.reader.partitioning.Partitioning; import org.apache.spark.sql.types.StructType; public class JavaPartitionAwareDataSource implements DataSourceV2, ReadSupport { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala index b060aeeef811d..3158995ec62f1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.continuous._ import org.apache.spark.sql.execution.streaming.sources.{RateStreamBatchTask, RateStreamMicroBatchReader, RateStreamSourceV2} import org.apache.spark.sql.sources.v2.DataSourceOptions -import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, MicroBatchReadSupport} +import org.apache.spark.sql.sources.v2.reader.{ContinuousReadSupport, MicroBatchReadSupport} import org.apache.spark.sql.streaming.StreamTest import org.apache.spark.util.ManualClock diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index ee50e8a92270b..2f49b07018aaf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector import org.apache.spark.sql.sources.{Filter, GreaterThan} import org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.sources.v2.reader.partitioning.{ClusteredDistribution, Distribution, Partitioning} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{IntegerType, StructType} import org.apache.spark.sql.vectorized.ColumnarBatch diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala index 3127d664d32dc..cb873ab688e96 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala @@ -26,10 +26,10 @@ import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider} import org.apache.spark.sql.sources.v2.DataSourceOptions -import org.apache.spark.sql.sources.v2.reader.DataReaderFactory -import org.apache.spark.sql.sources.v2.streaming._ -import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, MicroBatchReader, Offset, PartitionOffset} -import org.apache.spark.sql.sources.v2.streaming.writer.StreamWriter +import org.apache.spark.sql.sources.v2.reader.{ContinuousReadSupport, DataReaderFactory, MicroBatchReadSupport} +import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousReader, MicroBatchReader, Offset, PartitionOffset} +import org.apache.spark.sql.sources.v2.writer.StreamWriteSupport +import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.{OutputMode, StreamTest, Trigger} import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils From 871fd48dc381d48e67f7efcc45cc534d36e4ee6e Mon Sep 17 00:00:00 2001 From: Atallah Hezbor Date: Wed, 31 Jan 2018 20:45:55 -0800 Subject: [PATCH 230/566] [SPARK-21396][SQL] Fixes MatchError when UDTs are passed through Hive Thriftserver Signed-off-by: Atallah Hezbor ## What changes were proposed in this pull request? 
This PR proposes modifying the match statement that gets the columns of a row in HiveThriftServer. There was previously no case for `UserDefinedType`, so querying a table that contained them would throw a match error. The changes catch that case and return the string representation. ## How was this patch tested? While I would have liked to add a unit test, I couldn't easily incorporate UDTs into the ``HiveThriftServer2Suites`` pipeline. With some guidance I would be happy to push a commit with tests. Instead I did a manual test by loading a `DataFrame` with Point UDT in a spark shell with a HiveThriftServer. Then in beeline, connecting to the server and querying that table. Here is the result before the change ``` 0: jdbc:hive2://localhost:10000> select * from chicago; Error: scala.MatchError: org.apache.spark.sql.PointUDT2d980dc3 (of class org.apache.spark.sql.PointUDT) (state=,code=0) ``` And after the change: ``` 0: jdbc:hive2://localhost:10000> select * from chicago; +---------------------------------------+--------------+------------------------+---------------------+--+ | __fid__ | case_number | dtg | geom | +---------------------------------------+--------------+------------------------+---------------------+--+ | 109602f9-54f8-414b-8c6f-42b1a337643e | 2 | 2016-01-01 19:00:00.0 | POINT (-77 38) | | 709602f9-fcff-4429-8027-55649b6fd7ed | 1 | 2015-12-31 19:00:00.0 | POINT (-76.5 38.5) | | 009602f9-fcb5-45b1-a867-eb8ba10cab40 | 3 | 2016-01-02 19:00:00.0 | POINT (-78 39) | +---------------------------------------+--------------+------------------------+---------------------+--+ ``` Author: Atallah Hezbor Closes #20385 from atallahhezbor/udts_over_hive. (cherry picked from commit b2e7677f4d3d8f47f5f148680af39d38f2b558f0) Signed-off-by: gatorsmile --- .../thriftserver/SparkExecuteStatementOperation.scala | 2 +- .../main/scala/org/apache/spark/sql/hive/HiveUtils.scala | 1 + .../scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala | 8 +++++++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala index 664bc20601eaa..3cfc81b8a9579 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala @@ -102,7 +102,7 @@ private[hive] class SparkExecuteStatementOperation( to += from.getAs[Timestamp](ordinal) case BinaryType => to += from.getAs[Array[Byte]](ordinal) - case _: ArrayType | _: StructType | _: MapType => + case _: ArrayType | _: StructType | _: MapType | _: UserDefinedType[_] => val hiveString = HiveUtils.toHiveString((from.get(ordinal), dataTypes(ordinal))) to += hiveString } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala index c7717d70c996f..d9627eb9790eb 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala @@ -460,6 +460,7 @@ private[spark] object HiveUtils extends Logging { case (decimal: java.math.BigDecimal, DecimalType()) => // Hive strips trailing zeros so use its toString HiveDecimal.create(decimal).toString + case (other, _ : UserDefinedType[_]) => other.toString case 
(other, tpe) if primitiveTypes contains tpe => other.toString } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala index 8697d47e89e89..f2b75e4b23f02 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.QueryTest import org.apache.spark.sql.hive.test.TestHiveSingleton -import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SQLTestUtils} import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader} class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { @@ -62,4 +62,10 @@ class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton Thread.currentThread().setContextClassLoader(contextClassLoader) } } + + test("toHiveString correctly handles UDTs") { + val point = new ExamplePoint(50.0, 50.0) + val tpe = new ExamplePointUDT() + assert(HiveUtils.toHiveString((point, tpe)) === "(50.0, 50.0)") + } } From 205bce974b86bef9d9d507e1b89549cb01c7b535 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Thu, 1 Feb 2018 11:25:01 +0200 Subject: [PATCH 231/566] [SPARK-23107][ML] ML 2.3 QA: New Scala APIs, docs. ## What changes were proposed in this pull request? Audit new APIs and docs in 2.3.0. ## How was this patch tested? No test. Author: Yanbo Liang Closes #20459 from yanboliang/SPARK-23107. (cherry picked from commit e15da5b14c8d845028365a609c0c66731d024ee7) Signed-off-by: Nick Pentreath --- mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala | 2 +- .../scala/org/apache/spark/ml/regression/LinearRegression.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 1155ea5fdd85b..22e7b8bbf1ff5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -74,7 +74,7 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol with * @group param */ @Since("2.3.0") - final override val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", + override val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "How to handle invalid data (unseen or NULL values) in features and label column of string " + "type. 
Options are 'skip' (filter out rows with invalid data), error (throw an error), " + "or 'keep' (put invalid data in a special additional bucket, at index numLabels).", diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index a5873d03b4161..6d3fe7a6c748c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -645,7 +645,7 @@ class LinearRegressionModel private[ml] ( extends RegressionModel[Vector, LinearRegressionModel] with LinearRegressionParams with MLWritable { - def this(uid: String, coefficients: Vector, intercept: Double) = + private[ml] def this(uid: String, coefficients: Vector, intercept: Double) = this(uid, coefficients, intercept, 1.0) private var trainingSummary: Option[LinearRegressionTrainingSummary] = None From 6b6bc9c4ebeb4c1ebfea3f6ddff0d2f502011e0c Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Thu, 1 Feb 2018 21:25:02 +0900 Subject: [PATCH 232/566] [SPARK-23280][SQL][FOLLOWUP] Fix Java style check issues. ## What changes were proposed in this pull request? This is a follow-up of #20450 which broke lint-java checks. This pr fixes the lint-java issues. ``` [ERROR] src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java:[20,8] (imports) UnusedImports: Unused import - org.apache.spark.sql.catalyst.util.MapData. [ERROR] src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java:[21,8] (imports) UnusedImports: Unused import - org.apache.spark.sql.catalyst.util.MapData. [ERROR] src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java:[22,8] (imports) UnusedImports: Unused import - org.apache.spark.sql.catalyst.util.MapData. ``` ## How was this patch tested? Checked manually in my local environment. Author: Takuya UESHIN Closes #20468 from ueshin/issues/SPARK-23280/fup1. 
(cherry picked from commit 8bb70b068ea782e799e45238fcb093a6acb0fc9f) Signed-off-by: Takuya UESHIN --- .../main/java/org/apache/spark/sql/vectorized/ColumnVector.java | 1 - .../main/java/org/apache/spark/sql/vectorized/ColumnarArray.java | 1 - .../main/java/org/apache/spark/sql/vectorized/ColumnarRow.java | 1 - 3 files changed, 3 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java index 05271ec1f46ab..530d4d23d4eaf 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java @@ -17,7 +17,6 @@ package org.apache.spark.sql.vectorized; import org.apache.spark.annotation.InterfaceStability; -import org.apache.spark.sql.catalyst.util.MapData; import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.Decimal; import org.apache.spark.unsafe.types.CalendarInterval; diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java index 7c7a1c806a2b7..72a192d089b9f 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java @@ -18,7 +18,6 @@ import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.catalyst.util.ArrayData; -import org.apache.spark.sql.catalyst.util.MapData; import org.apache.spark.sql.types.*; import org.apache.spark.unsafe.types.CalendarInterval; import org.apache.spark.unsafe.types.UTF8String; diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java index 0c9e92ed11fbd..b400f7f93c1fe 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java @@ -19,7 +19,6 @@ import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.sql.catalyst.util.MapData; import org.apache.spark.sql.types.*; import org.apache.spark.unsafe.types.CalendarInterval; import org.apache.spark.unsafe.types.UTF8String; From 3aa780ef34492ab1746bbcde8a75bfa8c3d929e1 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Thu, 1 Feb 2018 21:28:53 +0900 Subject: [PATCH 233/566] [SPARK-23280][SQL][FOLLOWUP] Enable `MutableColumnarRow.getMap()`. ## What changes were proposed in this pull request? This is a followup pr of #20450. We should've enabled `MutableColumnarRow.getMap()` as well. ## How was this patch tested? Existing tests. Author: Takuya UESHIN Closes #20471 from ueshin/issues/SPARK-23280/fup2. 
(cherry picked from commit 89e8d556b93d1bf1b28fe153fd284f154045b0ee) Signed-off-by: Takuya UESHIN --- .../spark/sql/execution/vectorized/MutableColumnarRow.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java index 66668f3753604..307c19032dee5 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java @@ -21,10 +21,10 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.sql.catalyst.util.MapData; import org.apache.spark.sql.types.*; import org.apache.spark.sql.vectorized.ColumnarArray; import org.apache.spark.sql.vectorized.ColumnarBatch; +import org.apache.spark.sql.vectorized.ColumnarMap; import org.apache.spark.sql.vectorized.ColumnarRow; import org.apache.spark.sql.vectorized.ColumnVector; import org.apache.spark.unsafe.types.CalendarInterval; @@ -162,8 +162,9 @@ public ColumnarArray getArray(int ordinal) { } @Override - public MapData getMap(int ordinal) { - throw new UnsupportedOperationException(); + public ColumnarMap getMap(int ordinal) { + if (columns[ordinal].isNullAt(rowId)) return null; + return columns[ordinal].getMap(rowId); } @Override From 2549beae20fe8761242f6fb9cda35ff06a652897 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Thu, 1 Feb 2018 21:00:47 +0800 Subject: [PATCH 234/566] [SPARK-23289][CORE] OneForOneBlockFetcher.DownloadCallback.onData should write the buffer fully ## What changes were proposed in this pull request? `channel.write(buf)` may not write the whole buffer since the underlying channel is a FileChannel, we should retry until the whole buffer is written. ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #20461 from zsxwing/SPARK-23289. 
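The fix relies on the java.nio contract that a `WritableByteChannel` (here a `FileChannel`) may write fewer bytes than remain in the buffer. A minimal standalone sketch of the retry pattern, assuming only the JDK (object and file names are illustrative):

```
import java.io.File
import java.nio.ByteBuffer
import java.nio.channels.FileChannel
import java.nio.charset.StandardCharsets
import java.nio.file.StandardOpenOption

object WriteFully {
  // Keep calling write() until the buffer is drained; a single call may write only part of it.
  def writeFully(channel: FileChannel, buf: ByteBuffer): Unit = {
    while (buf.hasRemaining) {
      channel.write(buf)
    }
  }

  def main(args: Array[String]): Unit = {
    val file = File.createTempFile("write-fully", ".bin")
    val channel = FileChannel.open(file.toPath, StandardOpenOption.WRITE)
    try {
      writeFully(channel, ByteBuffer.wrap("some bytes".getBytes(StandardCharsets.UTF_8)))
    } finally {
      channel.close()
      file.delete()
    }
  }
}
```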
(cherry picked from commit ec63e2d0743a4f75e1cce21d0fe2b54407a86a4a) Signed-off-by: Wenchen Fan --- .../apache/spark/network/shuffle/OneForOneBlockFetcher.java | 4 +++- core/src/test/scala/org/apache/spark/FileSuite.scala | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java index 9cac7d00cc6b6..0bc571874f07c 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java @@ -171,7 +171,9 @@ private class DownloadCallback implements StreamCallback { @Override public void onData(String streamId, ByteBuffer buf) throws IOException { - channel.write(buf); + while (buf.hasRemaining()) { + channel.write(buf); + } } @Override diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala index e9539dc73f6fa..55a9122cf9026 100644 --- a/core/src/test/scala/org/apache/spark/FileSuite.scala +++ b/core/src/test/scala/org/apache/spark/FileSuite.scala @@ -244,7 +244,10 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { for (i <- 0 until testOutputCopies) { // Shift values by i so that they're different in the output val alteredOutput = testOutput.map(b => (b + i).toByte) - channel.write(ByteBuffer.wrap(alteredOutput)) + val buffer = ByteBuffer.wrap(alteredOutput) + while (buffer.hasRemaining) { + channel.write(buffer) + } } channel.close() file.close() From 2db7e49dbb58c5b44e22a6a3aae4fa04cd89e5e8 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Thu, 1 Feb 2018 10:36:31 -0800 Subject: [PATCH 235/566] [SPARK-13983][SQL] Fix HiveThriftServer2 can not get "--hiveconf" and ''--hivevar" variables since 2.0 ## What changes were proposed in this pull request? `--hiveconf` and `--hivevar` variables no longer work since Spark 2.0. The `spark-sql` client has fixed by [SPARK-15730](https://issues.apache.org/jira/browse/SPARK-15730) and [SPARK-18086](https://issues.apache.org/jira/browse/SPARK-18086). but `beeline`/[`Spark SQL HiveThriftServer2`](https://github.com/apache/spark/blob/v2.1.1/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala) is still broken. This pull request fix it. This pull request works for both `JDBC client` and `beeline`. ## How was this patch tested? unit tests for `JDBC client` manual tests for `beeline`: ``` git checkout origin/pr/17886 dev/make-distribution.sh --mvn mvn --tgz -Phive -Phive-thriftserver -Phadoop-2.6 -DskipTests tar -zxf spark-2.3.0-SNAPSHOT-bin-2.6.5.tgz && cd spark-2.3.0-SNAPSHOT-bin-2.6.5 sbin/start-thriftserver.sh ``` ``` cat < test.sql select '\${a}', '\${b}'; EOF beeline -u jdbc:hive2://localhost:10000 --hiveconf a=avalue --hivevar b=bvalue -f test.sql ``` Author: Yuming Wang Closes #17886 from wangyum/SPARK-13983-dev. 
(cherry picked from commit f051f834036e63d5e480d86440ce39924f979e82) Signed-off-by: gatorsmile --- .../service/cli/session/HiveSessionImpl.java | 74 ++++++++++++++++++- .../server/SparkSQLOperationManager.scala | 12 +++ .../HiveThriftServer2Suites.scala | 23 +++++- 3 files changed, 105 insertions(+), 4 deletions(-) diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java index 108074cce3d6d..fc818bc69c761 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java @@ -44,7 +44,7 @@ import org.apache.hadoop.hive.ql.history.HiveHistory; import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.processors.SetProcessor; +import org.apache.hadoop.hive.ql.parse.VariableSubstitution; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.shims.ShimLoader; import org.apache.hive.common.util.HiveVersionInfo; @@ -71,6 +71,12 @@ import org.apache.hive.service.cli.thrift.TProtocolVersion; import org.apache.hive.service.server.ThreadWithGarbageCleanup; +import static org.apache.hadoop.hive.conf.SystemVariables.ENV_PREFIX; +import static org.apache.hadoop.hive.conf.SystemVariables.HIVECONF_PREFIX; +import static org.apache.hadoop.hive.conf.SystemVariables.HIVEVAR_PREFIX; +import static org.apache.hadoop.hive.conf.SystemVariables.METACONF_PREFIX; +import static org.apache.hadoop.hive.conf.SystemVariables.SYSTEM_PREFIX; + /** * HiveSession * @@ -209,7 +215,7 @@ private void configureSession(Map sessionConfMap) throws HiveSQL String key = entry.getKey(); if (key.startsWith("set:")) { try { - SetProcessor.setVariable(key.substring(4), entry.getValue()); + setVariable(key.substring(4), entry.getValue()); } catch (Exception e) { throw new HiveSQLException(e); } @@ -221,6 +227,70 @@ private void configureSession(Map sessionConfMap) throws HiveSQL } } + // Copy from org.apache.hadoop.hive.ql.processors.SetProcessor, only change: + // setConf(varname, propName, varvalue, true) when varname.startsWith(HIVECONF_PREFIX) + public static int setVariable(String varname, String varvalue) throws Exception { + SessionState ss = SessionState.get(); + if (varvalue.contains("\n")){ + ss.err.println("Warning: Value had a \\n character in it."); + } + varname = varname.trim(); + if (varname.startsWith(ENV_PREFIX)){ + ss.err.println("env:* variables can not be set."); + return 1; + } else if (varname.startsWith(SYSTEM_PREFIX)){ + String propName = varname.substring(SYSTEM_PREFIX.length()); + System.getProperties().setProperty(propName, + new VariableSubstitution().substitute(ss.getConf(),varvalue)); + } else if (varname.startsWith(HIVECONF_PREFIX)){ + String propName = varname.substring(HIVECONF_PREFIX.length()); + setConf(varname, propName, varvalue, true); + } else if (varname.startsWith(HIVEVAR_PREFIX)) { + String propName = varname.substring(HIVEVAR_PREFIX.length()); + ss.getHiveVariables().put(propName, + new VariableSubstitution().substitute(ss.getConf(),varvalue)); + } else if (varname.startsWith(METACONF_PREFIX)) { + String propName = varname.substring(METACONF_PREFIX.length()); + Hive hive = Hive.get(ss.getConf()); + hive.setMetaConf(propName, new VariableSubstitution().substitute(ss.getConf(), varvalue)); + } else { + 
setConf(varname, varname, varvalue, true); + } + return 0; + } + + // returns non-null string for validation fail + private static void setConf(String varname, String key, String varvalue, boolean register) + throws IllegalArgumentException { + HiveConf conf = SessionState.get().getConf(); + String value = new VariableSubstitution().substitute(conf, varvalue); + if (conf.getBoolVar(HiveConf.ConfVars.HIVECONFVALIDATION)) { + HiveConf.ConfVars confVars = HiveConf.getConfVars(key); + if (confVars != null) { + if (!confVars.isType(value)) { + StringBuilder message = new StringBuilder(); + message.append("'SET ").append(varname).append('=').append(varvalue); + message.append("' FAILED because ").append(key).append(" expects "); + message.append(confVars.typeString()).append(" type value."); + throw new IllegalArgumentException(message.toString()); + } + String fail = confVars.validate(value); + if (fail != null) { + StringBuilder message = new StringBuilder(); + message.append("'SET ").append(varname).append('=').append(varvalue); + message.append("' FAILED in validation : ").append(fail).append('.'); + throw new IllegalArgumentException(message.toString()); + } + } else if (key.startsWith("hive.")) { + throw new IllegalArgumentException("hive configuration " + key + " does not exists."); + } + } + conf.verifyAndSet(key, value); + if (register) { + SessionState.get().getOverriddenConfigurations().put(key, value); + } + } + @Override public void setOperationLogSessionDir(File operationLogRootDir) { if (!operationLogRootDir.exists()) { diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala index a0e5012633f5e..bf7c01f60fb5c 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala @@ -28,6 +28,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.thriftserver.{ReflectionUtils, SparkExecuteStatementOperation} +import org.apache.spark.sql.internal.SQLConf /** * Executes queries using Spark SQL, and maintains a list of handles to active queries. 
@@ -50,6 +51,9 @@ private[thriftserver] class SparkSQLOperationManager() require(sqlContext != null, s"Session handle: ${parentSession.getSessionHandle} has not been" + s" initialized or had already closed.") val conf = sqlContext.sessionState.conf + val hiveSessionState = parentSession.getSessionState + setConfMap(conf, hiveSessionState.getOverriddenConfigurations) + setConfMap(conf, hiveSessionState.getHiveVariables) val runInBackground = async && conf.getConf(HiveUtils.HIVE_THRIFT_SERVER_ASYNC) val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay, runInBackground)(sqlContext, sessionToActivePool) @@ -58,4 +62,12 @@ private[thriftserver] class SparkSQLOperationManager() s"runInBackground=$runInBackground") operation } + + def setConfMap(conf: SQLConf, confMap: java.util.Map[String, String]): Unit = { + val iterator = confMap.entrySet().iterator() + while (iterator.hasNext) { + val kv = iterator.next() + conf.setConfString(kv.getKey, kv.getValue) + } + } } diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala index 7289da71a3365..496f8c82a6c61 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala @@ -135,6 +135,22 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { } } + test("Support beeline --hiveconf and --hivevar") { + withJdbcStatement() { statement => + executeTest(hiveConfList) + executeTest(hiveVarList) + def executeTest(hiveList: String): Unit = { + hiveList.split(";").foreach{ m => + val kv = m.split("=") + // select "${a}"; ---> avalue + val resultSet = statement.executeQuery("select \"${" + kv(0) + "}\"") + resultSet.next() + assert(resultSet.getString(1) === kv(1)) + } + } + } + } + test("JDBC query execution") { withJdbcStatement("test") { statement => val queries = Seq( @@ -740,10 +756,11 @@ abstract class HiveThriftJdbcTest extends HiveThriftServer2Test { s"""jdbc:hive2://localhost:$serverPort/ |default? |hive.server2.transport.mode=http; - |hive.server2.thrift.http.path=cliservice + |hive.server2.thrift.http.path=cliservice; + |${hiveConfList}#${hiveVarList} """.stripMargin.split("\n").mkString.trim } else { - s"jdbc:hive2://localhost:$serverPort/" + s"jdbc:hive2://localhost:$serverPort/?${hiveConfList}#${hiveVarList}" } def withMultipleConnectionJdbcStatement(tableNames: String*)(fs: (Statement => Unit)*) { @@ -779,6 +796,8 @@ abstract class HiveThriftServer2Test extends SparkFunSuite with BeforeAndAfterAl private var listeningPort: Int = _ protected def serverPort: Int = listeningPort + protected val hiveConfList = "a=avalue;b=bvalue" + protected val hiveVarList = "c=cvalue;d=dvalue" protected def user = System.getProperty("user.name") protected var warehousePath: File = _ From 07a8f4ddfc2edccde9b1d28b4436a596d2f7db63 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 1 Feb 2018 10:48:34 -0800 Subject: [PATCH 236/566] [SPARK-23293][SQL] fix data source v2 self join `DataSourceV2Relation` should extend `MultiInstanceRelation`, to take care of self-join. a new test Author: Wenchen Fan Closes #20466 from cloud-fan/dsv2-selfjoin. 
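For context, the shape of the self-join the new test exercises, written here against an ordinary local DataFrame rather than the test's v2 source (a minimal sketch, assuming only Spark itself): the analyzer can only resolve such a join when each instance of the relation carries fresh attribute IDs, which is what `MultiInstanceRelation.newInstance()` provides.

```
import org.apache.spark.sql.SparkSession

object SelfJoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("self-join-sketch").getOrCreate()
    import spark.implicits._

    // Join a relation with a projection of itself; both sides originate from the same scan.
    val df = Seq((0, 0), (1, -1), (2, -2)).toDF("i", "j")
    val df2 = df.select(($"i" + 1).as("k"), $"j")
    df.join(df2, "j").show()

    spark.stop()
  }
}
```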
(cherry picked from commit 73da3b6968630d9e2cafc742ccb6d4eb54957df4) Signed-off-by: gatorsmile --- .../execution/datasources/v2/DataSourceV2Relation.scala | 8 +++++++- .../apache/spark/sql/sources/v2/DataSourceV2Suite.scala | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala index 3d4c64981373d..eebfa29f91b99 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala @@ -17,13 +17,15 @@ package org.apache.spark.sql.execution.datasources.v2 +import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics} import org.apache.spark.sql.sources.v2.reader._ case class DataSourceV2Relation( fullOutput: Seq[AttributeReference], - reader: DataSourceReader) extends LeafNode with DataSourceReaderHolder { + reader: DataSourceReader) + extends LeafNode with MultiInstanceRelation with DataSourceReaderHolder { override def canEqual(other: Any): Boolean = other.isInstanceOf[DataSourceV2Relation] @@ -33,6 +35,10 @@ case class DataSourceV2Relation( case _ => Statistics(sizeInBytes = conf.defaultSizeInBytes) } + + override def newInstance(): DataSourceV2Relation = { + copy(fullOutput = fullOutput.map(_.newInstance())) + } } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index 2f49b07018aaf..f34928f013570 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala @@ -198,6 +198,12 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { } } } + + test("SPARK-23293: data source v2 self join") { + val df = spark.read.format(classOf[SimpleDataSourceV2].getName).load() + val df2 = df.select(($"i" + 1).as("k"), $"j") + checkAnswer(df.join(df2, "j"), (0 until 10).map(i => Row(-i, i, i + 1))) + } } class SimpleDataSourceV2 extends DataSourceV2 with ReadSupport { From ab23785c70229acd6c22218f722337cf0a9cc55b Mon Sep 17 00:00:00 2001 From: Gera Shegalov Date: Thu, 1 Feb 2018 15:26:59 -0800 Subject: [PATCH 237/566] [SPARK-23296][YARN] Include stacktrace in YARN-app diagnostic ## What changes were proposed in this pull request? Include stacktrace in the diagnostics message upon abnormal unregister from RM ## How was this patch tested? Tested with a failing job, and confirmed a stacktrace in the client output and YARN webUI. Author: Gera Shegalov Closes #20470 from gerashegalov/gera/stacktrace-diagnostics. 
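The change swaps `cause` (whose string form is only the class name and message) for Hadoop's `StringUtils.stringifyException`, which renders the full stack trace. A small sketch of the difference, assuming hadoop-common is on the classpath:

```
import org.apache.hadoop.util.StringUtils

object StringifyExceptionSketch {
  def main(args: Array[String]): Unit = {
    val cause = new IllegalStateException("user class threw exception")
    // String concatenation only captures Throwable.toString(): class name plus message.
    println("message only: " + cause)
    // stringifyException returns the whole stack trace as a String, so it survives in the
    // YARN diagnostics shown by the client and the web UI.
    println("full trace:\n" + StringUtils.stringifyException(cause))
  }
}
```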
(cherry picked from commit 032c11b83f0d276bf8085992229b8c598f02798a) Signed-off-by: Marcelo Vanzin --- .../scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 4d5e3bb043671..2f88feb0f1fdf 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -30,6 +30,7 @@ import scala.util.control.NonFatal import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.security.UserGroupInformation +import org.apache.hadoop.util.StringUtils import org.apache.hadoop.yarn.api._ import org.apache.hadoop.yarn.api.records._ import org.apache.hadoop.yarn.conf.YarnConfiguration @@ -718,7 +719,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends logError("User class threw exception: " + cause, cause) finish(FinalApplicationStatus.FAILED, ApplicationMaster.EXIT_EXCEPTION_USER_CLASS, - "User class threw exception: " + cause) + "User class threw exception: " + StringUtils.stringifyException(cause)) } sparkContextPromise.tryFailure(e.getCause()) } finally { From 7baae3aef34d16cb0a2d024d96027d8378a03927 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 2 Feb 2018 10:18:32 +0800 Subject: [PATCH 238/566] [SPARK-23284][SQL] Document the behavior of several ColumnVector's get APIs when accessing null slot ## What changes were proposed in this pull request? For some ColumnVector get APIs such as getDecimal, getBinary, getStruct, getArray, getInterval, getUTF8String, we should clearly document their behaviors when accessing null slot. They should return null in this case. Then we can remove null checks from the places using above APIs. For the APIs of primitive values like getInt, getInts, etc., this also documents their behaviors when accessing null slots. Their returning values are undefined and can be anything. ## How was this patch tested? Added tests into `ColumnarBatchSuite`. Author: Liang-Chi Hsieh Closes #20455 from viirya/SPARK-23272-followup. 
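A caller-side sketch of the contract being documented, using the writable on-heap vector that the test suite below also exercises (an internal class, used here purely for illustration): object-returning getters yield null for a null slot, while primitive getters are undefined there, so callers should check `isNullAt` first.

```
import java.nio.charset.StandardCharsets

import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector
import org.apache.spark.sql.types.{IntegerType, StringType}

object NullSlotSketch {
  def main(args: Array[String]): Unit = {
    val ints = new OnHeapColumnVector(2, IntegerType)
    ints.putInt(0, 42)
    ints.putNull(1)
    // Primitive getter: the value at a null slot is undefined, so guard with isNullAt.
    val second = if (ints.isNullAt(1)) None else Some(ints.getInt(1))
    println(second) // None

    val strings = new OnHeapColumnVector(2, StringType)
    strings.putByteArray(0, "hello".getBytes(StandardCharsets.UTF_8))
    strings.putNull(1)
    // Object getter: returns null for the null slot, per the documented behavior.
    println(strings.getUTF8String(1)) // null

    ints.close()
    strings.close()
  }
}
```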
(cherry picked from commit 90848d507457d30abb36e3ba07618dfc87c34cd6) Signed-off-by: Wenchen Fan --- .../datasources/orc/OrcColumnVector.java | 3 + .../vectorized/MutableColumnarRow.java | 7 -- .../vectorized/WritableColumnVector.java | 5 ++ .../sql/vectorized/ArrowColumnVector.java | 4 + .../spark/sql/vectorized/ColumnVector.java | 63 ++++++++++------ .../spark/sql/vectorized/ColumnarRow.java | 7 -- .../vectorized/ColumnarBatchSuite.scala | 74 ++++++++++++++++++- 7 files changed, 124 insertions(+), 39 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java index c8add4c9f486c..12f4d658b1868 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java @@ -154,12 +154,14 @@ public double getDouble(int rowId) { @Override public Decimal getDecimal(int rowId, int precision, int scale) { + if (isNullAt(rowId)) return null; BigDecimal data = decimalData.vector[getRowIndex(rowId)].getHiveDecimal().bigDecimalValue(); return Decimal.apply(data, precision, scale); } @Override public UTF8String getUTF8String(int rowId) { + if (isNullAt(rowId)) return null; int index = getRowIndex(rowId); BytesColumnVector col = bytesData; return UTF8String.fromBytes(col.vector[index], col.start[index], col.length[index]); @@ -167,6 +169,7 @@ public UTF8String getUTF8String(int rowId) { @Override public byte[] getBinary(int rowId) { + if (isNullAt(rowId)) return null; int index = getRowIndex(rowId); byte[] binary = new byte[bytesData.length[index]]; System.arraycopy(bytesData.vector[index], bytesData.start[index], binary, 0, binary.length); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java index 307c19032dee5..4e4242fe8d9b9 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java @@ -127,43 +127,36 @@ public boolean anyNull() { @Override public Decimal getDecimal(int ordinal, int precision, int scale) { - if (columns[ordinal].isNullAt(rowId)) return null; return columns[ordinal].getDecimal(rowId, precision, scale); } @Override public UTF8String getUTF8String(int ordinal) { - if (columns[ordinal].isNullAt(rowId)) return null; return columns[ordinal].getUTF8String(rowId); } @Override public byte[] getBinary(int ordinal) { - if (columns[ordinal].isNullAt(rowId)) return null; return columns[ordinal].getBinary(rowId); } @Override public CalendarInterval getInterval(int ordinal) { - if (columns[ordinal].isNullAt(rowId)) return null; return columns[ordinal].getInterval(rowId); } @Override public ColumnarRow getStruct(int ordinal, int numFields) { - if (columns[ordinal].isNullAt(rowId)) return null; return columns[ordinal].getStruct(rowId); } @Override public ColumnarArray getArray(int ordinal) { - if (columns[ordinal].isNullAt(rowId)) return null; return columns[ordinal].getArray(rowId); } @Override public ColumnarMap getMap(int ordinal) { - if (columns[ordinal].isNullAt(rowId)) return null; return columns[ordinal].getMap(rowId); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java 
b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index c2e595455549c..48eebcf2f0746 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -338,6 +338,7 @@ public final int putByteArray(int rowId, byte[] value) { @Override public Decimal getDecimal(int rowId, int precision, int scale) { + if (isNullAt(rowId)) return null; if (precision <= Decimal.MAX_INT_DIGITS()) { return Decimal.createUnsafe(getInt(rowId), precision, scale); } else if (precision <= Decimal.MAX_LONG_DIGITS()) { @@ -364,6 +365,7 @@ public void putDecimal(int rowId, Decimal value, int precision) { @Override public UTF8String getUTF8String(int rowId) { + if (isNullAt(rowId)) return null; if (dictionary == null) { return arrayData().getBytesAsUTF8String(getArrayOffset(rowId), getArrayLength(rowId)); } else { @@ -381,6 +383,7 @@ public UTF8String getUTF8String(int rowId) { @Override public byte[] getBinary(int rowId) { + if (isNullAt(rowId)) return null; if (dictionary == null) { return arrayData().getBytes(getArrayOffset(rowId), getArrayLength(rowId)); } else { @@ -610,6 +613,7 @@ public final int appendStruct(boolean isNull) { // array offsets and lengths in the current column vector. @Override public final ColumnarArray getArray(int rowId) { + if (isNullAt(rowId)) return null; return new ColumnarArray(arrayData(), getArrayOffset(rowId), getArrayLength(rowId)); } @@ -617,6 +621,7 @@ public final ColumnarArray getArray(int rowId) { // second child column vector, and puts the offsets and lengths in the current column vector. @Override public final ColumnarMap getMap(int rowId) { + if (isNullAt(rowId)) return null; return new ColumnarMap(getChild(0), getChild(1), getArrayOffset(rowId), getArrayLength(rowId)); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java index f3ece538c3b80..f8e37e995a17f 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java @@ -101,21 +101,25 @@ public double getDouble(int rowId) { @Override public Decimal getDecimal(int rowId, int precision, int scale) { + if (isNullAt(rowId)) return null; return accessor.getDecimal(rowId, precision, scale); } @Override public UTF8String getUTF8String(int rowId) { + if (isNullAt(rowId)) return null; return accessor.getUTF8String(rowId); } @Override public byte[] getBinary(int rowId) { + if (isNullAt(rowId)) return null; return accessor.getBinary(rowId); } @Override public ColumnarArray getArray(int rowId) { + if (isNullAt(rowId)) return null; return accessor.getArray(rowId); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java index 530d4d23d4eaf..ad99b450a4809 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java @@ -80,12 +80,14 @@ public abstract class ColumnVector implements AutoCloseable { public abstract boolean isNullAt(int rowId); /** - * Returns the boolean type value for rowId. + * Returns the boolean type value for rowId. The return value is undefined and can be anything, + * if the slot for rowId is null. 
*/ public abstract boolean getBoolean(int rowId); /** - * Gets boolean type values from [rowId, rowId + count) + * Gets boolean type values from [rowId, rowId + count). The return values for the null slots + * are undefined and can be anything. */ public boolean[] getBooleans(int rowId, int count) { boolean[] res = new boolean[count]; @@ -96,12 +98,14 @@ public boolean[] getBooleans(int rowId, int count) { } /** - * Returns the byte type value for rowId. + * Returns the byte type value for rowId. The return value is undefined and can be anything, + * if the slot for rowId is null. */ public abstract byte getByte(int rowId); /** - * Gets byte type values from [rowId, rowId + count) + * Gets byte type values from [rowId, rowId + count). The return values for the null slots + * are undefined and can be anything. */ public byte[] getBytes(int rowId, int count) { byte[] res = new byte[count]; @@ -112,12 +116,14 @@ public byte[] getBytes(int rowId, int count) { } /** - * Returns the short type value for rowId. + * Returns the short type value for rowId. The return value is undefined and can be anything, + * if the slot for rowId is null. */ public abstract short getShort(int rowId); /** - * Gets short type values from [rowId, rowId + count) + * Gets short type values from [rowId, rowId + count). The return values for the null slots + * are undefined and can be anything. */ public short[] getShorts(int rowId, int count) { short[] res = new short[count]; @@ -128,12 +134,14 @@ public short[] getShorts(int rowId, int count) { } /** - * Returns the int type value for rowId. + * Returns the int type value for rowId. The return value is undefined and can be anything, + * if the slot for rowId is null. */ public abstract int getInt(int rowId); /** - * Gets int type values from [rowId, rowId + count) + * Gets int type values from [rowId, rowId + count). The return values for the null slots + * are undefined and can be anything. */ public int[] getInts(int rowId, int count) { int[] res = new int[count]; @@ -144,12 +152,14 @@ public int[] getInts(int rowId, int count) { } /** - * Returns the long type value for rowId. + * Returns the long type value for rowId. The return value is undefined and can be anything, + * if the slot for rowId is null. */ public abstract long getLong(int rowId); /** - * Gets long type values from [rowId, rowId + count) + * Gets long type values from [rowId, rowId + count). The return values for the null slots + * are undefined and can be anything. */ public long[] getLongs(int rowId, int count) { long[] res = new long[count]; @@ -160,12 +170,14 @@ public long[] getLongs(int rowId, int count) { } /** - * Returns the float type value for rowId. + * Returns the float type value for rowId. The return value is undefined and can be anything, + * if the slot for rowId is null. */ public abstract float getFloat(int rowId); /** - * Gets float type values from [rowId, rowId + count) + * Gets float type values from [rowId, rowId + count). The return values for the null slots + * are undefined and can be anything. */ public float[] getFloats(int rowId, int count) { float[] res = new float[count]; @@ -176,12 +188,14 @@ public float[] getFloats(int rowId, int count) { } /** - * Returns the double type value for rowId. + * Returns the double type value for rowId. The return value is undefined and can be anything, + * if the slot for rowId is null. 
*/ public abstract double getDouble(int rowId); /** - * Gets double type values from [rowId, rowId + count) + * Gets double type values from [rowId, rowId + count). The return values for the null slots + * are undefined and can be anything. */ public double[] getDoubles(int rowId, int count) { double[] res = new double[count]; @@ -192,7 +206,7 @@ public double[] getDoubles(int rowId, int count) { } /** - * Returns the struct type value for rowId. + * Returns the struct type value for rowId. If the slot for rowId is null, it should return null. * * To support struct type, implementations must implement {@link #getChild(int)} and make this * vector a tree structure. The number of child vectors must be same as the number of fields of @@ -205,7 +219,7 @@ public final ColumnarRow getStruct(int rowId) { } /** - * Returns the array type value for rowId. + * Returns the array type value for rowId. If the slot for rowId is null, it should return null. * * To support array type, implementations must construct an {@link ColumnarArray} and return it in * this method. {@link ColumnarArray} requires a {@link ColumnVector} that stores the data of all @@ -218,13 +232,13 @@ public final ColumnarRow getStruct(int rowId) { public abstract ColumnarArray getArray(int rowId); /** - * Returns the map type value for rowId. + * Returns the map type value for rowId. If the slot for rowId is null, it should return null. * * In Spark, map type value is basically a key data array and a value data array. A key from the * key array with a index and a value from the value array with the same index contribute to * an entry of this map type value. * - * To support map type, implementations must construct an {@link ColumnarMap} and return it in + * To support map type, implementations must construct a {@link ColumnarMap} and return it in * this method. {@link ColumnarMap} requires a {@link ColumnVector} that stores the data of all * the keys of all the maps in this vector, and another {@link ColumnVector} that stores the data * of all the values of all the maps in this vector, and a pair of offset and length which @@ -233,24 +247,25 @@ public final ColumnarRow getStruct(int rowId) { public abstract ColumnarMap getMap(int ordinal); /** - * Returns the decimal type value for rowId. + * Returns the decimal type value for rowId. If the slot for rowId is null, it should return null. */ public abstract Decimal getDecimal(int rowId, int precision, int scale); /** - * Returns the string type value for rowId. Note that the returned UTF8String may point to the - * data of this column vector, please copy it if you want to keep it after this column vector is - * freed. + * Returns the string type value for rowId. If the slot for rowId is null, it should return null. + * Note that the returned UTF8String may point to the data of this column vector, please copy it + * if you want to keep it after this column vector is freed. */ public abstract UTF8String getUTF8String(int rowId); /** - * Returns the binary type value for rowId. + * Returns the binary type value for rowId. If the slot for rowId is null, it should return null. */ public abstract byte[] getBinary(int rowId); /** - * Returns the calendar interval type value for rowId. + * Returns the calendar interval type value for rowId. If the slot for rowId is null, it should + * return null. 
* * In Spark, calendar interval type value is basically an integer value representing the number of * months in this interval, and a long value representing the number of microseconds in this diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java index b400f7f93c1fe..f2f2279590023 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java @@ -119,43 +119,36 @@ public boolean anyNull() { @Override public Decimal getDecimal(int ordinal, int precision, int scale) { - if (data.getChild(ordinal).isNullAt(rowId)) return null; return data.getChild(ordinal).getDecimal(rowId, precision, scale); } @Override public UTF8String getUTF8String(int ordinal) { - if (data.getChild(ordinal).isNullAt(rowId)) return null; return data.getChild(ordinal).getUTF8String(rowId); } @Override public byte[] getBinary(int ordinal) { - if (data.getChild(ordinal).isNullAt(rowId)) return null; return data.getChild(ordinal).getBinary(rowId); } @Override public CalendarInterval getInterval(int ordinal) { - if (data.getChild(ordinal).isNullAt(rowId)) return null; return data.getChild(ordinal).getInterval(rowId); } @Override public ColumnarRow getStruct(int ordinal, int numFields) { - if (data.getChild(ordinal).isNullAt(rowId)) return null; return data.getChild(ordinal).getStruct(rowId); } @Override public ColumnarArray getArray(int ordinal) { - if (data.getChild(ordinal).isNullAt(rowId)) return null; return data.getChild(ordinal).getArray(rowId); } @Override public ColumnarMap getMap(int ordinal) { - if (data.getChild(ordinal).isNullAt(rowId)) return null; return data.getChild(ordinal).getMap(rowId); } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 8fe2985836f2e..772f687526008 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -572,7 +572,7 @@ class ColumnarBatchSuite extends SparkFunSuite { } } - testVector("String APIs", 6, StringType) { + testVector("String APIs", 7, StringType) { column => val reference = mutable.ArrayBuffer.empty[String] @@ -619,6 +619,10 @@ class ColumnarBatchSuite extends SparkFunSuite { idx += 1 assert(column.arrayData().elementsAppended == 17 + (s + s).length) + column.putNull(idx) + assert(column.getUTF8String(idx) == null) + idx += 1 + reference.zipWithIndex.foreach { v => val errMsg = "VectorType=" + column.getClass.getSimpleName assert(v._1.length == column.getArrayLength(v._2), errMsg) @@ -647,6 +651,7 @@ class ColumnarBatchSuite extends SparkFunSuite { reference += new CalendarInterval(0, 2000) column.putNull(2) + assert(column.getInterval(2) == null) reference += null months.putInt(3, 20) @@ -683,6 +688,7 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(column.getArray(0).numElements == 1) assert(column.getArray(1).numElements == 2) assert(column.isNullAt(2)) + assert(column.getArray(2) == null) assert(column.getArray(3).numElements == 0) assert(column.getArray(4).numElements == 3) @@ -785,6 +791,7 @@ class ColumnarBatchSuite extends SparkFunSuite { column.putArray(0, 0, 1) column.putArray(1, 1, 2) column.putNull(2) + assert(column.getMap(2) == null) column.putArray(3, 3, 0) 
column.putArray(4, 3, 3) @@ -821,6 +828,7 @@ class ColumnarBatchSuite extends SparkFunSuite { c2.putDouble(0, 3.45) column.putNull(1) + assert(column.getStruct(1) == null) c1.putInt(2, 456) c2.putDouble(2, 5.67) @@ -1261,4 +1269,68 @@ class ColumnarBatchSuite extends SparkFunSuite { batch.close() allocator.close() } + + testVector("Decimal API", 4, DecimalType.IntDecimal) { + column => + + val reference = mutable.ArrayBuffer.empty[Decimal] + + var idx = 0 + column.putDecimal(idx, new Decimal().set(10), 10) + reference += new Decimal().set(10) + idx += 1 + + column.putDecimal(idx, new Decimal().set(20), 10) + reference += new Decimal().set(20) + idx += 1 + + column.putNull(idx) + assert(column.getDecimal(idx, 10, 0) == null) + reference += null + idx += 1 + + column.putDecimal(idx, new Decimal().set(30), 10) + reference += new Decimal().set(30) + + reference.zipWithIndex.foreach { case (v, i) => + val errMsg = "VectorType=" + column.getClass.getSimpleName + assert(v == column.getDecimal(i, 10, 0), errMsg) + if (v == null) assert(column.isNullAt(i), errMsg) + } + + column.close() + } + + testVector("Binary APIs", 4, BinaryType) { + column => + + val reference = mutable.ArrayBuffer.empty[String] + var idx = 0 + column.putByteArray(idx, "Hello".getBytes(StandardCharsets.UTF_8)) + reference += "Hello" + idx += 1 + + column.putByteArray(idx, "World".getBytes(StandardCharsets.UTF_8)) + reference += "World" + idx += 1 + + column.putNull(idx) + reference += null + idx += 1 + + column.putByteArray(idx, "abc".getBytes(StandardCharsets.UTF_8)) + reference += "abc" + + reference.zipWithIndex.foreach { case (v, i) => + val errMsg = "VectorType=" + column.getClass.getSimpleName + if (v != null) { + assert(v == new String(column.getBinary(i)), errMsg) + } else { + assert(column.isNullAt(i), errMsg) + assert(column.getBinary(i) == null, errMsg) + } + } + + column.close() + } } From 2b07452cacb4c226c815a216c4cfea2a04227700 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 1 Feb 2018 20:44:46 -0800 Subject: [PATCH 239/566] [SPARK-23301][SQL] data source column pruning should work for arbitrary expressions This PR fixes a mistake in the `PushDownOperatorsToDataSource` rule, the column pruning logic is incorrect about `Project`. a new test case for column pruning with arbitrary expressions, and improve the existing tests to make sure the `PushDownOperatorsToDataSource` really works. Author: Wenchen Fan Closes #20476 from cloud-fan/push-down. 
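The requirement the fix restores, illustrated with a plain file source rather than the test's v2 reader (a minimal sketch, assuming only Spark itself): a projection built from arbitrary expressions still only needs the columns those expressions reference.

```
import java.nio.file.Files

import org.apache.spark.sql.SparkSession

object ColumnPruningSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("pruning-sketch").getOrCreate()
    import spark.implicits._

    val path = Files.createTempDirectory("pruning-sketch").resolve("t").toString
    Seq((0, 0), (1, -1), (2, -2)).toDF("i", "j").write.parquet(path)

    // `i + 1` is not a bare column, but its only reference is `i`, so the scan's
    // ReadSchema in the physical plan should mention just `i`.
    spark.read.parquet(path).select($"i" + 1).explain()

    spark.stop()
  }
}
```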
(cherry picked from commit 19c7c7ebdef6c1c7a02ebac9af6a24f521b52c37) Signed-off-by: gatorsmile --- .../v2/PushDownOperatorsToDataSource.scala | 53 ++++----- .../sources/v2/JavaAdvancedDataSourceV2.java | 29 ++++- .../sql/sources/v2/DataSourceV2Suite.scala | 111 ++++++++++++++++-- 3 files changed, 154 insertions(+), 39 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala index df034adf1e7d6..566a48394f02e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.datasources.v2 -import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeMap, Expression, NamedExpression, PredicateHelper} +import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeMap, AttributeSet, Expression, NamedExpression, PredicateHelper} import org.apache.spark.sql.catalyst.optimizer.RemoveRedundantProject import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project} import org.apache.spark.sql.catalyst.rules.Rule @@ -81,35 +81,34 @@ object PushDownOperatorsToDataSource extends Rule[LogicalPlan] with PredicateHel // TODO: add more push down rules. - // TODO: nested fields pruning - def pushDownRequiredColumns(plan: LogicalPlan, requiredByParent: Seq[Attribute]): Unit = { - plan match { - case Project(projectList, child) => - val required = projectList.filter(requiredByParent.contains).flatMap(_.references) - pushDownRequiredColumns(child, required) - - case Filter(condition, child) => - val required = requiredByParent ++ condition.references - pushDownRequiredColumns(child, required) - - case DataSourceV2Relation(fullOutput, reader) => reader match { - case r: SupportsPushDownRequiredColumns => - // Match original case of attributes. - val attrMap = AttributeMap(fullOutput.zip(fullOutput)) - val requiredColumns = requiredByParent.map(attrMap) - r.pruneColumns(requiredColumns.toStructType) - case _ => - } + pushDownRequiredColumns(filterPushed, filterPushed.outputSet) + // After column pruning, we may have redundant PROJECT nodes in the query plan, remove them. + RemoveRedundantProject(filterPushed) + } + + // TODO: nested fields pruning + private def pushDownRequiredColumns(plan: LogicalPlan, requiredByParent: AttributeSet): Unit = { + plan match { + case Project(projectList, child) => + val required = projectList.flatMap(_.references) + pushDownRequiredColumns(child, AttributeSet(required)) + + case Filter(condition, child) => + val required = requiredByParent ++ condition.references + pushDownRequiredColumns(child, required) - // TODO: there may be more operators can be used to calculate required columns, we can add - // more and more in the future. - case _ => plan.children.foreach(child => pushDownRequiredColumns(child, child.output)) + case relation: DataSourceV2Relation => relation.reader match { + case reader: SupportsPushDownRequiredColumns => + val requiredColumns = relation.output.filter(requiredByParent.contains) + reader.pruneColumns(requiredColumns.toStructType) + + case _ => } - } - pushDownRequiredColumns(filterPushed, filterPushed.output) - // After column pruning, we may have redundant PROJECT nodes in the query plan, remove them. 
- RemoveRedundantProject(filterPushed) + // TODO: there may be more operators that can be used to calculate the required columns. We + // can add more and more in the future. + case _ => plan.children.foreach(child => pushDownRequiredColumns(child, child.outputSet)) + } } /** diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java index d421f7d19563f..172e5d5eebcbe 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java @@ -32,11 +32,12 @@ public class JavaAdvancedDataSourceV2 implements DataSourceV2, ReadSupport { - class Reader implements DataSourceReader, SupportsPushDownRequiredColumns, + public class Reader implements DataSourceReader, SupportsPushDownRequiredColumns, SupportsPushDownFilters { - private StructType requiredSchema = new StructType().add("i", "int").add("j", "int"); - private Filter[] filters = new Filter[0]; + // Exposed for testing. + public StructType requiredSchema = new StructType().add("i", "int").add("j", "int"); + public Filter[] filters = new Filter[0]; @Override public StructType readSchema() { @@ -50,8 +51,26 @@ public void pruneColumns(StructType requiredSchema) { @Override public Filter[] pushFilters(Filter[] filters) { - this.filters = filters; - return new Filter[0]; + Filter[] supported = Arrays.stream(filters).filter(f -> { + if (f instanceof GreaterThan) { + GreaterThan gt = (GreaterThan) f; + return gt.attribute().equals("i") && gt.value() instanceof Integer; + } else { + return false; + } + }).toArray(Filter[]::new); + + Filter[] unsupported = Arrays.stream(filters).filter(f -> { + if (f instanceof GreaterThan) { + GreaterThan gt = (GreaterThan) f; + return !gt.attribute().equals("i") || !(gt.value() instanceof Integer); + } else { + return true; + } + }).toArray(Filter[]::new); + + this.filters = supported; + return unsupported; } @Override diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index f34928f013570..e0e034d734f27 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala @@ -22,10 +22,12 @@ import java.util.{ArrayList, List => JList} import test.org.apache.spark.sql.sources.v2._ import org.apache.spark.SparkException -import org.apache.spark.sql.{AnalysisException, QueryTest, Row} +import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row} import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanExec import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector +import org.apache.spark.sql.functions._ import org.apache.spark.sql.sources.{Filter, GreaterThan} import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.sources.v2.reader.partitioning.{ClusteredDistribution, Distribution, Partitioning} @@ -48,14 +50,72 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { } test("advanced implementation") { + def getReader(query: DataFrame): AdvancedDataSourceV2#Reader = { + query.queryExecution.executedPlan.collect { + case d: 
DataSourceV2ScanExec => d.reader.asInstanceOf[AdvancedDataSourceV2#Reader] + }.head + } + + def getJavaReader(query: DataFrame): JavaAdvancedDataSourceV2#Reader = { + query.queryExecution.executedPlan.collect { + case d: DataSourceV2ScanExec => d.reader.asInstanceOf[JavaAdvancedDataSourceV2#Reader] + }.head + } + Seq(classOf[AdvancedDataSourceV2], classOf[JavaAdvancedDataSourceV2]).foreach { cls => withClue(cls.getName) { val df = spark.read.format(cls.getName).load() checkAnswer(df, (0 until 10).map(i => Row(i, -i))) - checkAnswer(df.select('j), (0 until 10).map(i => Row(-i))) - checkAnswer(df.filter('i > 3), (4 until 10).map(i => Row(i, -i))) - checkAnswer(df.select('j).filter('i > 6), (7 until 10).map(i => Row(-i))) - checkAnswer(df.select('i).filter('i > 10), Nil) + + val q1 = df.select('j) + checkAnswer(q1, (0 until 10).map(i => Row(-i))) + if (cls == classOf[AdvancedDataSourceV2]) { + val reader = getReader(q1) + assert(reader.filters.isEmpty) + assert(reader.requiredSchema.fieldNames === Seq("j")) + } else { + val reader = getJavaReader(q1) + assert(reader.filters.isEmpty) + assert(reader.requiredSchema.fieldNames === Seq("j")) + } + + val q2 = df.filter('i > 3) + checkAnswer(q2, (4 until 10).map(i => Row(i, -i))) + if (cls == classOf[AdvancedDataSourceV2]) { + val reader = getReader(q2) + assert(reader.filters.flatMap(_.references).toSet == Set("i")) + assert(reader.requiredSchema.fieldNames === Seq("i", "j")) + } else { + val reader = getJavaReader(q2) + assert(reader.filters.flatMap(_.references).toSet == Set("i")) + assert(reader.requiredSchema.fieldNames === Seq("i", "j")) + } + + val q3 = df.select('i).filter('i > 6) + checkAnswer(q3, (7 until 10).map(i => Row(i))) + if (cls == classOf[AdvancedDataSourceV2]) { + val reader = getReader(q3) + assert(reader.filters.flatMap(_.references).toSet == Set("i")) + assert(reader.requiredSchema.fieldNames === Seq("i")) + } else { + val reader = getJavaReader(q3) + assert(reader.filters.flatMap(_.references).toSet == Set("i")) + assert(reader.requiredSchema.fieldNames === Seq("i")) + } + + val q4 = df.select('j).filter('j < -10) + checkAnswer(q4, Nil) + if (cls == classOf[AdvancedDataSourceV2]) { + val reader = getReader(q4) + // 'j < 10 is not supported by the testing data source. + assert(reader.filters.isEmpty) + assert(reader.requiredSchema.fieldNames === Seq("j")) + } else { + val reader = getJavaReader(q4) + // 'j < 10 is not supported by the testing data source. 
+ assert(reader.filters.isEmpty) + assert(reader.requiredSchema.fieldNames === Seq("j")) + } } } } @@ -204,6 +264,39 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { val df2 = df.select(($"i" + 1).as("k"), $"j") checkAnswer(df.join(df2, "j"), (0 until 10).map(i => Row(-i, i, i + 1))) } + + test("SPARK-23301: column pruning with arbitrary expressions") { + def getReader(query: DataFrame): AdvancedDataSourceV2#Reader = { + query.queryExecution.executedPlan.collect { + case d: DataSourceV2ScanExec => d.reader.asInstanceOf[AdvancedDataSourceV2#Reader] + }.head + } + + val df = spark.read.format(classOf[AdvancedDataSourceV2].getName).load() + + val q1 = df.select('i + 1) + checkAnswer(q1, (1 until 11).map(i => Row(i))) + val reader1 = getReader(q1) + assert(reader1.requiredSchema.fieldNames === Seq("i")) + + val q2 = df.select(lit(1)) + checkAnswer(q2, (0 until 10).map(i => Row(1))) + val reader2 = getReader(q2) + assert(reader2.requiredSchema.isEmpty) + + // 'j === 1 can't be pushed down, but we should still be able do column pruning + val q3 = df.filter('j === -1).select('j * 2) + checkAnswer(q3, Row(-2)) + val reader3 = getReader(q3) + assert(reader3.filters.isEmpty) + assert(reader3.requiredSchema.fieldNames === Seq("j")) + + // column pruning should work with other operators. + val q4 = df.sort('i).limit(1).select('i + 1) + checkAnswer(q4, Row(1)) + val reader4 = getReader(q4) + assert(reader4.requiredSchema.fieldNames === Seq("i")) + } } class SimpleDataSourceV2 extends DataSourceV2 with ReadSupport { @@ -251,8 +344,12 @@ class AdvancedDataSourceV2 extends DataSourceV2 with ReadSupport { } override def pushFilters(filters: Array[Filter]): Array[Filter] = { - this.filters = filters - Array.empty + val (supported, unsupported) = filters.partition { + case GreaterThan("i", _: Int) => true + case _ => false + } + this.filters = supported + unsupported } override def pushedFilters(): Array[Filter] = filters From e5e9f9a430c827669ecfe9d5c13cc555fc89c980 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 2 Feb 2018 22:43:28 +0800 Subject: [PATCH 240/566] [SPARK-23312][SQL] add a config to turn off vectorized cache reader ## What changes were proposed in this pull request? https://issues.apache.org/jira/browse/SPARK-23309 reported a performance regression about cached table in Spark 2.3. While the investigating is still going on, this PR adds a conf to turn off the vectorized cache reader, to unblock the 2.3 release. ## How was this patch tested? a new test Author: Wenchen Fan Closes #20483 from cloud-fan/cache. 
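A minimal sketch of how the new flag would be used to fall back to the row-based cache reader while the regression is investigated (the session setup is illustrative; the config key is the one added above):

```
import org.apache.spark.sql.SparkSession

object DisableVectorizedCacheReader {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("disable-vectorized-cache-reader")
      .getOrCreate()

    // Turn off the vectorized in-memory columnar reader; the default remains true.
    spark.conf.set("spark.sql.inMemoryColumnarStorage.enableVectorizedReader", "false")

    val df = spark.range(10).cache()
    df.count() // the cached scan now takes the non-vectorized path (supportsBatch == false)

    spark.stop()
  }
}
```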
(cherry picked from commit b9503fcbb3f4a3ce263164d1f11a8e99b9ca5710) Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/internal/SQLConf.scala | 8 ++++++++ .../columnar/InMemoryTableScanExec.scala | 2 +- .../org/apache/spark/sql/CachedTableSuite.scala | 15 +++++++++++++-- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 7394a0d7cf983..e498f5520f6b4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -141,6 +141,12 @@ object SQLConf { .booleanConf .createWithDefault(true) + val CACHE_VECTORIZED_READER_ENABLED = + buildConf("spark.sql.inMemoryColumnarStorage.enableVectorizedReader") + .doc("Enables vectorized reader for columnar caching.") + .booleanConf + .createWithDefault(true) + val COLUMN_VECTOR_OFFHEAP_ENABLED = buildConf("spark.sql.columnVector.offheap.enabled") .internal() @@ -1256,6 +1262,8 @@ class SQLConf extends Serializable with Logging { def columnBatchSize: Int = getConf(COLUMN_BATCH_SIZE) + def cacheVectorizedReaderEnabled: Boolean = getConf(CACHE_VECTORIZED_READER_ENABLED) + def numShufflePartitions: Int = getConf(SHUFFLE_PARTITIONS) def targetPostShuffleInputSize: Long = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index c167f1e7dc621..e972f8b30d87c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -54,7 +54,7 @@ case class InMemoryTableScanExec( override val supportsBatch: Boolean = { // In the initial implementation, for ease of review // support only primitive data types and # of fields is less than wholeStageMaxNumFields - relation.schema.fields.forall(f => f.dataType match { + conf.cacheVectorizedReaderEnabled && relation.schema.fields.forall(f => f.dataType match { case BooleanType | ByteType | ShortType | IntegerType | LongType | FloatType | DoubleType => true case _ => false diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 72fe0f42801f1..9f27fa09127af 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -21,8 +21,6 @@ import scala.collection.mutable.HashSet import scala.concurrent.duration._ import scala.language.postfixOps -import org.scalatest.concurrent.Eventually._ - import org.apache.spark.CleanerListener import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.SubqueryExpression @@ -30,6 +28,7 @@ import org.apache.spark.sql.execution.{RDDScanExec, SparkPlan} import org.apache.spark.sql.execution.columnar._ import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.functions._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils} import org.apache.spark.storage.{RDDBlockId, StorageLevel} import org.apache.spark.util.{AccumulatorContext, Utils} @@ -782,4 +781,16 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with 
SharedSQLContext assert(getNumInMemoryRelations(cachedDs2) == 1) } } + + test("SPARK-23312: vectorized cache reader can be disabled") { + Seq(true, false).foreach { vectorized => + withSQLConf(SQLConf.CACHE_VECTORIZED_READER_ENABLED.key -> vectorized.toString) { + val df = spark.range(10).cache() + df.queryExecution.executedPlan.foreach { + case i: InMemoryTableScanExec => assert(i.supportsBatch == vectorized) + case _ => + } + } + } + } } From 56eb9a310217a5372bdba1e24e4af0d4de1829ca Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Fri, 2 Feb 2018 17:37:51 -0800 Subject: [PATCH 241/566] [SPARK-23064][SS][DOCS] Stream-stream joins Documentation - follow up ## What changes were proposed in this pull request? Further clarification of caveats in using stream-stream outer joins. ## How was this patch tested? N/A Author: Tathagata Das Closes #20494 from tdas/SPARK-23064-2. (cherry picked from commit eaf35de2471fac4337dd2920026836d52b1ec847) Signed-off-by: Tathagata Das --- docs/structured-streaming-programming-guide.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 62589a62ac4c4..48d6d0b542cc0 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -1346,10 +1346,20 @@ joined <- join( -However, note that the outer NULL results will be generated with a delay (depends on the specified -watermark delay and the time range condition) because the engine has to wait for that long to ensure + +There are a few points to note regarding outer joins. + +- *The outer NULL results will be generated with a delay that depends on the specified watermark +delay and the time range condition.* This is because the engine has to wait for that long to ensure there were no matches and there will be no more matches in future. +- In the current implementation in the micro-batch engine, watermarks are advanced at the end of a +micro-batch, and the next micro-batch uses the updated watermark to clean up state and output +outer results. Since we trigger a micro-batch only when there is new data to be processed, the +generation of the outer result may get delayed if there is no new data being received in the stream. +*In short, if any of the two input streams being joined does not receive data for a while, the +outer (both cases, left or right) output may get delayed.* + ##### Support matrix for joins in streaming queries
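To make the caveat above concrete, here is a self-contained sketch of a left outer stream-stream join (not taken from the guide; the rate-source columns, names and intervals are illustrative):
```
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

val spark = SparkSession.builder().master("local[2]").appName("outer-join-sketch").getOrCreate()
import spark.implicits._

// Two toy streams built from the rate source; the ad-id/timestamp column names are made up.
val impressions = spark.readStream.format("rate").option("rowsPerSecond", "5").load()
  .select($"value".as("impressionAdId"), $"timestamp".as("impressionTime"))
val clicks = spark.readStream.format("rate").option("rowsPerSecond", "5").load()
  .select($"value".as("clickAdId"), $"timestamp".as("clickTime"))

// Watermarks plus a time-range condition are required for stream-stream outer joins.
val joined = impressions.withWatermark("impressionTime", "10 seconds")
  .join(
    clicks.withWatermark("clickTime", "20 seconds"),
    expr("clickAdId = impressionAdId AND " +
      "clickTime >= impressionTime AND clickTime <= impressionTime + interval 1 minute"),
    "leftOuter")

// Per the note above: unmatched impressions show up with NULL click columns only after the
// watermark has moved past impressionTime + 1 minute, and only once a new micro-batch is
// triggered by fresh data arriving on either stream.
val query = joined.writeStream.format("console").start()
```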
From dcd0af4be752ab61b8caf36f70d98e97c6925473 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Fri, 2 Feb 2018 20:36:27 -0800 Subject: [PATCH 242/566] [SQL] Minor doc update: Add an example in DataFrameReader.schema ## What changes were proposed in this pull request? This patch adds a small example to the schema string definition of schema function. It isn't obvious how to use it, so an example would be useful. ## How was this patch tested? N/A - doc only. Author: Reynold Xin Closes #20491 from rxin/schema-doc. (cherry picked from commit 3ff83ad43a704cc3354ef9783e711c065e2a1a22) Signed-off-by: gatorsmile --- .../src/main/scala/org/apache/spark/sql/DataFrameReader.scala | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 46b5f54a33f74..fcaf8d618c168 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -74,6 +74,10 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * infer the input schema automatically from data. By specifying the schema here, the underlying * data source can skip the schema inference step, and thus speed up data loading. * + * {{{ + * spark.read.schema("a INT, b STRING, c DOUBLE").csv("test.csv") + * }}} + * * @since 2.3.0 */ def schema(schemaString: String): DataFrameReader = { From b614c083a4875c874180a93b08ea5031fa90cfec Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 2 Feb 2018 20:49:08 -0800 Subject: [PATCH 243/566] [SPARK-23317][SQL] rename ContinuousReader.setOffset to setStartOffset ## What changes were proposed in this pull request? In the document of `ContinuousReader.setOffset`, we say this method is used to specify the start offset. We also have a `ContinuousReader.getStartOffset` to get the value back. I think it makes more sense to rename `ContinuousReader.setOffset` to `setStartOffset`. ## How was this patch tested? N/A Author: Wenchen Fan Closes #20486 from cloud-fan/rename. 
(cherry picked from commit fe73cb4b439169f16cc24cd851a11fd398ce7edf) Signed-off-by: gatorsmile --- .../org/apache/spark/sql/kafka010/KafkaContinuousReader.scala | 2 +- .../sql/sources/v2/reader/streaming/ContinuousReader.java | 4 ++-- .../execution/streaming/continuous/ContinuousExecution.scala | 2 +- .../streaming/continuous/ContinuousRateStreamSource.scala | 2 +- .../spark/sql/execution/streaming/RateSourceV2Suite.scala | 2 +- .../sql/streaming/sources/StreamingDataSourceV2Suite.scala | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala index 41c443bc12120..b049a054cb40e 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala @@ -71,7 +71,7 @@ class KafkaContinuousReader( override def readSchema: StructType = KafkaOffsetReader.kafkaSchema private var offset: Offset = _ - override def setOffset(start: ju.Optional[Offset]): Unit = { + override def setStartOffset(start: ju.Optional[Offset]): Unit = { offset = start.orElse { val offsets = initialOffsets match { case EarliestOffsetRangeLimit => KafkaSourceOffset(offsetReader.fetchEarliestOffsets()) diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/ContinuousReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/ContinuousReader.java index d1d1e7ffd1dd4..7fe7f00ac2fa8 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/ContinuousReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/ContinuousReader.java @@ -51,12 +51,12 @@ public interface ContinuousReader extends BaseStreamingSource, DataSourceReader * start from the first record after the provided offset, or from an implementation-defined * inferred starting point if no offset is provided. */ - void setOffset(Optional start); + void setStartOffset(Optional start); /** * Return the specified or inferred start offset for this reader. 
* - * @throws IllegalStateException if setOffset has not been called + * @throws IllegalStateException if setStartOffset has not been called */ Offset getStartOffset(); diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index 08c81419a9d34..ed22b9100497a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -181,7 +181,7 @@ class ContinuousExecution( val loggedOffset = offsets.offsets(0) val realOffset = loggedOffset.map(off => reader.deserializeOffset(off.json)) - reader.setOffset(java.util.Optional.ofNullable(realOffset.orNull)) + reader.setStartOffset(java.util.Optional.ofNullable(realOffset.orNull)) new StreamingDataSourceV2Relation(newOutput, reader) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala index 0eaaa4889ba9e..b63d8d3e20650 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala @@ -61,7 +61,7 @@ class RateStreamContinuousReader(options: DataSourceOptions) private var offset: Offset = _ - override def setOffset(offset: java.util.Optional[Offset]): Unit = { + override def setStartOffset(offset: java.util.Optional[Offset]): Unit = { this.offset = offset.orElse(RateStreamSourceV2.createInitialOffset(numPartitions, creationTime)) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala index 3158995ec62f1..0d68d9c3138aa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala @@ -160,7 +160,7 @@ class RateSourceV2Suite extends StreamTest { test("continuous data") { val reader = new RateStreamContinuousReader( new DataSourceOptions(Map("numPartitions" -> "2", "rowsPerSecond" -> "20").asJava)) - reader.setOffset(Optional.empty()) + reader.setStartOffset(Optional.empty()) val tasks = reader.createDataReaderFactories() assert(tasks.size == 2) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala index cb873ab688e96..51f44fa6285e4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala @@ -43,7 +43,7 @@ case class FakeReader() extends MicroBatchReader with ContinuousReader { def readSchema(): StructType = StructType(Seq()) def stop(): Unit = {} def mergeOffsets(offsets: Array[PartitionOffset]): Offset = RateStreamOffset(Map()) - def setOffset(start: Optional[Offset]): Unit = {} + def setStartOffset(start: Optional[Offset]): Unit = {} def createDataReaderFactories(): java.util.ArrayList[DataReaderFactory[Row]] = { 
throw new IllegalStateException("fake source - cannot actually read") From 1bcb3728db11be6e34060eff670fc8245ad571c6 Mon Sep 17 00:00:00 2001 From: caoxuewen Date: Sat, 3 Feb 2018 00:02:03 -0800 Subject: [PATCH 244/566] [SPARK-23311][SQL][TEST] add FilterFunction test case for test CombineTypedFilters ## What changes were proposed in this pull request? In the current test case for CombineTypedFilters, we lack the test of FilterFunction, so let's add it. In addition, in TypedFilterOptimizationSuite's existing test cases, Let's extract a common LocalRelation. ## How was this patch tested? add new test cases. Author: caoxuewen Closes #20482 from heary-cao/TypedFilterOptimizationSuite. (cherry picked from commit 63b49fa2e599080c2ba7d5189f9dde20a2e01fb4) Signed-off-by: gatorsmile --- .../spark/sql/catalyst/dsl/package.scala | 3 + .../TypedFilterOptimizationSuite.scala | 95 ++++++++++++++++--- 2 files changed, 84 insertions(+), 14 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 59cb26d5e6c36..efb2eba655e15 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -21,6 +21,7 @@ import java.sql.{Date, Timestamp} import scala.language.implicitConversions +import org.apache.spark.api.java.function.FilterFunction import org.apache.spark.sql.Encoder import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions._ @@ -301,6 +302,8 @@ package object dsl { def filter[T : Encoder](func: T => Boolean): LogicalPlan = TypedFilter(func, logicalPlan) + def filter[T : Encoder](func: FilterFunction[T]): LogicalPlan = TypedFilter(func, logicalPlan) + def serialize[T : Encoder]: LogicalPlan = CatalystSerde.serialize[T](logicalPlan) def deserialize[T : Encoder]: LogicalPlan = CatalystSerde.deserialize[T](logicalPlan) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/TypedFilterOptimizationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/TypedFilterOptimizationSuite.scala index 56f096f3ecf8c..5fc99a3a57c0f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/TypedFilterOptimizationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/TypedFilterOptimizationSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.optimizer import scala.reflect.runtime.universe.TypeTag +import org.apache.spark.api.java.function.FilterFunction import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder @@ -38,18 +39,19 @@ class TypedFilterOptimizationSuite extends PlanTest { implicit private def productEncoder[T <: Product : TypeTag] = ExpressionEncoder[T]() + val testRelation = LocalRelation('_1.int, '_2.int) + test("filter after serialize with the same object type") { - val input = LocalRelation('_1.int, '_2.int) val f = (i: (Int, Int)) => i._1 > 0 - val query = input + val query = testRelation .deserialize[(Int, Int)] .serialize[(Int, Int)] .filter(f).analyze val optimized = Optimize.execute(query) - val expected = input + val expected = testRelation .deserialize[(Int, Int)] .where(callFunction(f, BooleanType, 'obj)) .serialize[(Int, Int)].analyze @@ -58,10 +60,9 @@ class TypedFilterOptimizationSuite extends 
PlanTest { } test("filter after serialize with different object types") { - val input = LocalRelation('_1.int, '_2.int) val f = (i: OtherTuple) => i._1 > 0 - val query = input + val query = testRelation .deserialize[(Int, Int)] .serialize[(Int, Int)] .filter(f).analyze @@ -70,17 +71,16 @@ class TypedFilterOptimizationSuite extends PlanTest { } test("filter before deserialize with the same object type") { - val input = LocalRelation('_1.int, '_2.int) val f = (i: (Int, Int)) => i._1 > 0 - val query = input + val query = testRelation .filter(f) .deserialize[(Int, Int)] .serialize[(Int, Int)].analyze val optimized = Optimize.execute(query) - val expected = input + val expected = testRelation .deserialize[(Int, Int)] .where(callFunction(f, BooleanType, 'obj)) .serialize[(Int, Int)].analyze @@ -89,10 +89,9 @@ class TypedFilterOptimizationSuite extends PlanTest { } test("filter before deserialize with different object types") { - val input = LocalRelation('_1.int, '_2.int) val f = (i: OtherTuple) => i._1 > 0 - val query = input + val query = testRelation .filter(f) .deserialize[(Int, Int)] .serialize[(Int, Int)].analyze @@ -101,21 +100,89 @@ class TypedFilterOptimizationSuite extends PlanTest { } test("back to back filter with the same object type") { - val input = LocalRelation('_1.int, '_2.int) val f1 = (i: (Int, Int)) => i._1 > 0 val f2 = (i: (Int, Int)) => i._2 > 0 - val query = input.filter(f1).filter(f2).analyze + val query = testRelation.filter(f1).filter(f2).analyze val optimized = Optimize.execute(query) assert(optimized.collect { case t: TypedFilter => t }.length == 1) } test("back to back filter with different object types") { - val input = LocalRelation('_1.int, '_2.int) val f1 = (i: (Int, Int)) => i._1 > 0 val f2 = (i: OtherTuple) => i._2 > 0 - val query = input.filter(f1).filter(f2).analyze + val query = testRelation.filter(f1).filter(f2).analyze + val optimized = Optimize.execute(query) + assert(optimized.collect { case t: TypedFilter => t }.length == 2) + } + + test("back to back FilterFunction with the same object type") { + val f1 = new FilterFunction[(Int, Int)] { + override def call(value: (Int, Int)): Boolean = value._1 > 0 + } + val f2 = new FilterFunction[(Int, Int)] { + override def call(value: (Int, Int)): Boolean = value._2 > 0 + } + + val query = testRelation.filter(f1).filter(f2).analyze + val optimized = Optimize.execute(query) + assert(optimized.collect { case t: TypedFilter => t }.length == 1) + } + + test("back to back FilterFunction with different object types") { + val f1 = new FilterFunction[(Int, Int)] { + override def call(value: (Int, Int)): Boolean = value._1 > 0 + } + val f2 = new FilterFunction[OtherTuple] { + override def call(value: OtherTuple): Boolean = value._2 > 0 + } + + val query = testRelation.filter(f1).filter(f2).analyze + val optimized = Optimize.execute(query) + assert(optimized.collect { case t: TypedFilter => t }.length == 2) + } + + test("FilterFunction and filter with the same object type") { + val f1 = new FilterFunction[(Int, Int)] { + override def call(value: (Int, Int)): Boolean = value._1 > 0 + } + val f2 = (i: (Int, Int)) => i._2 > 0 + + val query = testRelation.filter(f1).filter(f2).analyze + val optimized = Optimize.execute(query) + assert(optimized.collect { case t: TypedFilter => t }.length == 1) + } + + test("FilterFunction and filter with different object types") { + val f1 = new FilterFunction[(Int, Int)] { + override def call(value: (Int, Int)): Boolean = value._1 > 0 + } + val f2 = (i: OtherTuple) => i._2 > 0 + + val query = 
testRelation.filter(f1).filter(f2).analyze + val optimized = Optimize.execute(query) + assert(optimized.collect { case t: TypedFilter => t }.length == 2) + } + + test("filter and FilterFunction with the same object type") { + val f2 = (i: (Int, Int)) => i._1 > 0 + val f1 = new FilterFunction[(Int, Int)] { + override def call(value: (Int, Int)): Boolean = value._2 > 0 + } + + val query = testRelation.filter(f1).filter(f2).analyze + val optimized = Optimize.execute(query) + assert(optimized.collect { case t: TypedFilter => t }.length == 1) + } + + test("filter and FilterFunction with different object types") { + val f2 = (i: (Int, Int)) => i._1 > 0 + val f1 = new FilterFunction[OtherTuple] { + override def call(value: OtherTuple): Boolean = value._2 > 0 + } + + val query = testRelation.filter(f1).filter(f2).analyze val optimized = Optimize.execute(query) assert(optimized.collect { case t: TypedFilter => t }.length == 2) } From 4de206182c8a1f76e1e5e6b597c4b3890e2ca255 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sat, 3 Feb 2018 00:04:00 -0800 Subject: [PATCH 245/566] [SPARK-23305][SQL][TEST] Test `spark.sql.files.ignoreMissingFiles` for all file-based data sources ## What changes were proposed in this pull request? Like Parquet, all file-based data source handles `spark.sql.files.ignoreMissingFiles` correctly. We had better have a test coverage for feature parity and in order to prevent future accidental regression for all data sources. ## How was this patch tested? Pass Jenkins with a newly added test case. Author: Dongjoon Hyun Closes #20479 from dongjoon-hyun/SPARK-23305. (cherry picked from commit 522e0b1866a0298669c83de5a47ba380dc0b7c84) Signed-off-by: gatorsmile --- .../spark/sql/FileBasedDataSourceSuite.scala | 37 +++++++++++++++++++ .../parquet/ParquetQuerySuite.scala | 33 ----------------- 2 files changed, 37 insertions(+), 33 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala index c272c99ae45a8..640d6b1583663 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala @@ -17,6 +17,10 @@ package org.apache.spark.sql +import org.apache.hadoop.fs.Path + +import org.apache.spark.SparkException +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { @@ -92,4 +96,37 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { } } } + + allFileBasedDataSources.foreach { format => + testQuietly(s"Enabling/disabling ignoreMissingFiles using $format") { + def testIgnoreMissingFiles(): Unit = { + withTempDir { dir => + val basePath = dir.getCanonicalPath + Seq("0").toDF("a").write.format(format).save(new Path(basePath, "first").toString) + Seq("1").toDF("a").write.format(format).save(new Path(basePath, "second").toString) + val thirdPath = new Path(basePath, "third") + Seq("2").toDF("a").write.format(format).save(thirdPath.toString) + val df = spark.read.format(format).load( + new Path(basePath, "first").toString, + new Path(basePath, "second").toString, + new Path(basePath, "third").toString) + + val fs = thirdPath.getFileSystem(spark.sparkContext.hadoopConfiguration) + assert(fs.delete(thirdPath, true)) + checkAnswer(df, Seq(Row("0"), Row("1"))) + } + } + + withSQLConf(SQLConf.IGNORE_MISSING_FILES.key -> "true") { + 
testIgnoreMissingFiles() + } + + withSQLConf(SQLConf.IGNORE_MISSING_FILES.key -> "false") { + val exception = intercept[SparkException] { + testIgnoreMissingFiles() + } + assert(exception.getMessage().contains("does not exist")) + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index 6ad88ed997ce7..55b0f729be8ce 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -355,39 +355,6 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext } } - testQuietly("Enabling/disabling ignoreMissingFiles") { - def testIgnoreMissingFiles(): Unit = { - withTempDir { dir => - val basePath = dir.getCanonicalPath - spark.range(1).toDF("a").write.parquet(new Path(basePath, "first").toString) - spark.range(1, 2).toDF("a").write.parquet(new Path(basePath, "second").toString) - val thirdPath = new Path(basePath, "third") - spark.range(2, 3).toDF("a").write.parquet(thirdPath.toString) - val df = spark.read.parquet( - new Path(basePath, "first").toString, - new Path(basePath, "second").toString, - new Path(basePath, "third").toString) - - val fs = thirdPath.getFileSystem(spark.sparkContext.hadoopConfiguration) - fs.delete(thirdPath, true) - checkAnswer( - df, - Seq(Row(0), Row(1))) - } - } - - withSQLConf(SQLConf.IGNORE_MISSING_FILES.key -> "true") { - testIgnoreMissingFiles() - } - - withSQLConf(SQLConf.IGNORE_MISSING_FILES.key -> "false") { - val exception = intercept[SparkException] { - testIgnoreMissingFiles() - } - assert(exception.getMessage().contains("does not exist")) - } - } - /** * this is part of test 'Enabling/disabling ignoreCorruptFiles' but run in a loop * to increase the chance of failure From be3de87914f29a56137e391d0cf494c0d1a7ba12 Mon Sep 17 00:00:00 2001 From: Shashwat Anand Date: Sat, 3 Feb 2018 10:31:04 -0800 Subject: [PATCH 246/566] [MINOR][DOC] Use raw triple double quotes around docstrings where there are occurrences of backslashes. From [PEP 257](https://www.python.org/dev/peps/pep-0257/): > For consistency, always use """triple double quotes""" around docstrings. Use r"""raw triple double quotes""" if you use any backslashes in your docstrings. For Unicode docstrings, use u"""Unicode triple-quoted strings""". For example, this is what help (kafka_wordcount) shows: ``` DESCRIPTION Counts words in UTF8 encoded, ' ' delimited text received from the network every second. Usage: kafka_wordcount.py To run this on your local machine, you need to setup Kafka and create a producer first, see http://kafka.apache.org/documentation.html#quickstart and then run the example `$ bin/spark-submit --jars external/kafka-assembly/target/scala-*/spark-streaming-kafka-assembly-*.jar examples/src/main/python/streaming/kafka_wordcount.py localhost:2181 test` ``` This is what it shows, after the fix: ``` DESCRIPTION Counts words in UTF8 encoded, '\n' delimited text received from the network every second. 
Usage: kafka_wordcount.py To run this on your local machine, you need to setup Kafka and create a producer first, see http://kafka.apache.org/documentation.html#quickstart and then run the example `$ bin/spark-submit --jars \ external/kafka-assembly/target/scala-*/spark-streaming-kafka-assembly-*.jar \ examples/src/main/python/streaming/kafka_wordcount.py \ localhost:2181 test` ``` The thing worth noticing is no linebreak here in the help. ## What changes were proposed in this pull request? Change triple double quotes to raw triple double quotes when there are occurrences of backslashes in docstrings. ## How was this patch tested? Manually as this is a doc fix. Author: Shashwat Anand Closes #20497 from ashashwat/docstring-fixes. (cherry picked from commit 4aaa7d40bf495317e740b6d6f9c2a55dfd03521b) Signed-off-by: gatorsmile --- .../main/python/sql/streaming/structured_network_wordcount.py | 2 +- .../sql/streaming/structured_network_wordcount_windowed.py | 2 +- examples/src/main/python/streaming/direct_kafka_wordcount.py | 2 +- examples/src/main/python/streaming/flume_wordcount.py | 2 +- examples/src/main/python/streaming/kafka_wordcount.py | 2 +- examples/src/main/python/streaming/network_wordcount.py | 2 +- .../src/main/python/streaming/network_wordjoinsentiments.py | 2 +- examples/src/main/python/streaming/sql_network_wordcount.py | 2 +- .../src/main/python/streaming/stateful_network_wordcount.py | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/src/main/python/sql/streaming/structured_network_wordcount.py b/examples/src/main/python/sql/streaming/structured_network_wordcount.py index afde2550587ca..c3284c1d01017 100644 --- a/examples/src/main/python/sql/streaming/structured_network_wordcount.py +++ b/examples/src/main/python/sql/streaming/structured_network_wordcount.py @@ -15,7 +15,7 @@ # limitations under the License. # -""" +r""" Counts words in UTF8 encoded, '\n' delimited text received from the network. Usage: structured_network_wordcount.py and describe the TCP server that Structured Streaming diff --git a/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py b/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py index 02a7d3363d780..db672551504b5 100644 --- a/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py +++ b/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py @@ -15,7 +15,7 @@ # limitations under the License. # -""" +r""" Counts words in UTF8 encoded, '\n' delimited text received from the network over a sliding window of configurable duration. Each line from the network is tagged with a timestamp that is used to determine the windows into which it falls. diff --git a/examples/src/main/python/streaming/direct_kafka_wordcount.py b/examples/src/main/python/streaming/direct_kafka_wordcount.py index 7097f7f4502bd..425df309011a0 100644 --- a/examples/src/main/python/streaming/direct_kafka_wordcount.py +++ b/examples/src/main/python/streaming/direct_kafka_wordcount.py @@ -15,7 +15,7 @@ # limitations under the License. # -""" +r""" Counts words in UTF8 encoded, '\n' delimited text directly received from Kafka in every 2 seconds. 
Usage: direct_kafka_wordcount.py diff --git a/examples/src/main/python/streaming/flume_wordcount.py b/examples/src/main/python/streaming/flume_wordcount.py index d75bc6daac138..5d6e6dc36d6f9 100644 --- a/examples/src/main/python/streaming/flume_wordcount.py +++ b/examples/src/main/python/streaming/flume_wordcount.py @@ -15,7 +15,7 @@ # limitations under the License. # -""" +r""" Counts words in UTF8 encoded, '\n' delimited text received from the network every second. Usage: flume_wordcount.py diff --git a/examples/src/main/python/streaming/kafka_wordcount.py b/examples/src/main/python/streaming/kafka_wordcount.py index 8d697f620f467..704f6602e2297 100644 --- a/examples/src/main/python/streaming/kafka_wordcount.py +++ b/examples/src/main/python/streaming/kafka_wordcount.py @@ -15,7 +15,7 @@ # limitations under the License. # -""" +r""" Counts words in UTF8 encoded, '\n' delimited text received from the network every second. Usage: kafka_wordcount.py diff --git a/examples/src/main/python/streaming/network_wordcount.py b/examples/src/main/python/streaming/network_wordcount.py index 2b48bcfd55db0..9010fafb425e6 100644 --- a/examples/src/main/python/streaming/network_wordcount.py +++ b/examples/src/main/python/streaming/network_wordcount.py @@ -15,7 +15,7 @@ # limitations under the License. # -""" +r""" Counts words in UTF8 encoded, '\n' delimited text received from the network every second. Usage: network_wordcount.py and describe the TCP server that Spark Streaming would connect to receive data. diff --git a/examples/src/main/python/streaming/network_wordjoinsentiments.py b/examples/src/main/python/streaming/network_wordjoinsentiments.py index b309d9fad33f5..d51a380a5d5f9 100644 --- a/examples/src/main/python/streaming/network_wordjoinsentiments.py +++ b/examples/src/main/python/streaming/network_wordjoinsentiments.py @@ -15,7 +15,7 @@ # limitations under the License. # -""" +r""" Shows the most positive words in UTF8 encoded, '\n' delimited text directly received the network every 5 seconds. The streaming data is joined with a static RDD of the AFINN word list (http://neuro.imm.dtu.dk/wiki/AFINN) diff --git a/examples/src/main/python/streaming/sql_network_wordcount.py b/examples/src/main/python/streaming/sql_network_wordcount.py index 398ac8d2d8f5e..7f12281c0e3fe 100644 --- a/examples/src/main/python/streaming/sql_network_wordcount.py +++ b/examples/src/main/python/streaming/sql_network_wordcount.py @@ -15,7 +15,7 @@ # limitations under the License. # -""" +r""" Use DataFrames and SQL to count words in UTF8 encoded, '\n' delimited text received from the network every second. diff --git a/examples/src/main/python/streaming/stateful_network_wordcount.py b/examples/src/main/python/streaming/stateful_network_wordcount.py index f8bbc659c2ea7..d7bb61e729f18 100644 --- a/examples/src/main/python/streaming/stateful_network_wordcount.py +++ b/examples/src/main/python/streaming/stateful_network_wordcount.py @@ -15,7 +15,7 @@ # limitations under the License. # -""" +r""" Counts words in UTF8 encoded, '\n' delimited text received from the network every second. From 45f0f4ff76accab3387b08b3773a0b127333ea3a Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 3 Feb 2018 10:40:21 -0800 Subject: [PATCH 247/566] [SPARK-21658][SQL][PYSPARK] Revert "[] Add default None for value in na.replace in PySpark" This reverts commit 0fcde87aadc9a92e138f11583119465ca4b5c518. 
See the discussion in [SPARK-21658](https://issues.apache.org/jira/browse/SPARK-21658), [SPARK-19454](https://issues.apache.org/jira/browse/SPARK-19454) and https://github.com/apache/spark/pull/16793 Author: hyukjinkwon Closes #20496 from HyukjinKwon/revert-SPARK-21658. (cherry picked from commit 551dff2bccb65e9b3f77b986f167aec90d9a6016) Signed-off-by: gatorsmile --- python/pyspark/sql/dataframe.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 1496cba91b90e..2e55407b5397b 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1577,16 +1577,6 @@ def replace(self, to_replace, value=None, subset=None): |null| null|null| +----+------+----+ - >>> df4.na.replace('Alice').show() - +----+------+----+ - | age|height|name| - +----+------+----+ - | 10| 80|null| - | 5| null| Bob| - |null| null| Tom| - |null| null|null| - +----+------+----+ - >>> df4.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show() +----+------+----+ | age|height|name| @@ -2055,7 +2045,7 @@ def fill(self, value, subset=None): fill.__doc__ = DataFrame.fillna.__doc__ - def replace(self, to_replace, value=None, subset=None): + def replace(self, to_replace, value, subset=None): return self.df.replace(to_replace, value, subset) replace.__doc__ = DataFrame.replace.__doc__ From 430025cba1ca8cc652fd11f894cef96203921dab Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Sun, 4 Feb 2018 09:15:48 -0800 Subject: [PATCH 248/566] [SPARK-22036][SQL][FOLLOWUP] Fix decimalArithmeticOperations.sql ## What changes were proposed in this pull request? Fix decimalArithmeticOperations.sql test ## How was this patch tested? N/A Author: Yuming Wang Author: wangyum Author: Yuming Wang Closes #20498 from wangyum/SPARK-22036. 
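For context on what the regenerated golden file below encodes, a small sketch (assuming an active `SparkSession` named `spark`; the query and both results are taken verbatim from the fixture): the same multiplication truncates under the default setting and returns NULL once precision loss is disallowed.
```
// Default behaviour: precision loss is allowed and the result is truncated.
spark.conf.set("spark.sql.decimalOperations.allowPrecisionLoss", "true")
spark.sql("select 123456789123456789.1234567890 * 1.123456789123456789").show(false)
// 138698367904130467.654320988515622621

// With precision loss disallowed, the same expression yields NULL instead of rounding.
spark.conf.set("spark.sql.decimalOperations.allowPrecisionLoss", "false")
spark.sql("select 123456789123456789.1234567890 * 1.123456789123456789").show(false)
// NULL
```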
(cherry picked from commit 6fb3fd15365d43733aefdb396db205d7ccf57f75) Signed-off-by: gatorsmile --- .../native/decimalArithmeticOperations.sql | 6 +- .../decimalArithmeticOperations.sql.out | 140 ++++++++++-------- 2 files changed, 80 insertions(+), 66 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql index c6d8a49d4b93a..9be7fcdadfea8 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql @@ -48,8 +48,9 @@ select 12345678901234567890.0 * 12345678901234567890.0; select 1e35 / 0.1; -- arithmetic operations causing a precision loss are truncated +select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345; select 123456789123456789.1234567890 * 1.123456789123456789; -select 0.001 / 9876543210987654321098765432109876543.2 +select 12345678912345.123456789123 / 0.000000012345678; -- return NULL instead of rounding, according to old Spark versions' behavior set spark.sql.decimalOperations.allowPrecisionLoss=false; @@ -74,7 +75,8 @@ select 12345678901234567890.0 * 12345678901234567890.0; select 1e35 / 0.1; -- arithmetic operations causing a precision loss return NULL +select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345; select 123456789123456789.1234567890 * 1.123456789123456789; -select 0.001 / 9876543210987654321098765432109876543.2 +select 12345678912345.123456789123 / 0.000000012345678; drop table decimals_test; diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out index 4d70fe19d539f..6bfdb84548d4d 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 32 +-- Number of queries: 36 -- !query 0 @@ -146,146 +146,158 @@ NULL -- !query 17 -select 123456789123456789.1234567890 * 1.123456789123456789 +select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345 -- !query 17 schema -struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,18)> +struct<(CAST(12345678912345678912345678912.1234567 AS DECIMAL(38,6)) + CAST(9999999999999999999999999999999.12345 AS DECIMAL(38,6))):decimal(38,6)> -- !query 17 output -138698367904130467.654320988515622621 +10012345678912345678912345678911.246907 -- !query 18 -select 0.001 / 9876543210987654321098765432109876543.2 - -set spark.sql.decimalOperations.allowPrecisionLoss=false +select 123456789123456789.1234567890 * 1.123456789123456789 -- !query 18 schema -struct<> +struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,18)> -- !query 18 output -org.apache.spark.sql.catalyst.parser.ParseException - -mismatched input 'spark' expecting (line 3, pos 4) - -== SQL == -select 0.001 / 9876543210987654321098765432109876543.2 - -set spark.sql.decimalOperations.allowPrecisionLoss=false -----^^^ 
+138698367904130467.654320988515622621 -- !query 19 -select id, a+b, a-b, a*b, a/b from decimals_test order by id +select 12345678912345.123456789123 / 0.000000012345678 -- !query 19 schema -struct +struct<(CAST(12345678912345.123456789123 AS DECIMAL(29,15)) / CAST(1.2345678E-8 AS DECIMAL(29,15))):decimal(38,9)> -- !query 19 output -1 1099 -899 99900 0.1001 -2 24690.246 0 152402061.885129 1 -3 1234.2234567891011 -1233.9765432108989 152.358023 0.0001 -4 123456789123456790.12345678912345679 123456789123456787.87654321087654321 138698367904130467.515623 109890109097814272.043109 +1000000073899961059796.725866332 -- !query 20 -select id, a*10, b/10 from decimals_test order by id +set spark.sql.decimalOperations.allowPrecisionLoss=false -- !query 20 schema -struct +struct -- !query 20 output -1 1000 99.9 -2 123451.23 1234.5123 -3 1.234567891011 123.41 -4 1234567891234567890 0.112345678912345679 +spark.sql.decimalOperations.allowPrecisionLoss false -- !query 21 -select 10.3 * 3.0 +select id, a+b, a-b, a*b, a/b from decimals_test order by id -- !query 21 schema -struct<(CAST(10.3 AS DECIMAL(3,1)) * CAST(3.0 AS DECIMAL(3,1))):decimal(6,2)> +struct -- !query 21 output -30.9 +1 1099 -899 NULL 0.1001001001001001 +2 24690.246 0 NULL 1 +3 1234.2234567891011 -1233.9765432108989 NULL 0.000100037913541123 +4 123456789123456790.123456789123456789 123456789123456787.876543210876543211 NULL 109890109097814272.043109406191131436 -- !query 22 -select 10.3000 * 3.0 +select id, a*10, b/10 from decimals_test order by id -- !query 22 schema -struct<(CAST(10.3000 AS DECIMAL(6,4)) * CAST(3.0 AS DECIMAL(6,4))):decimal(9,5)> +struct -- !query 22 output -30.9 +1 1000 99.9 +2 123451.23 1234.5123 +3 1.234567891011 123.41 +4 1234567891234567890 0.1123456789123456789 -- !query 23 -select 10.30000 * 30.0 +select 10.3 * 3.0 -- !query 23 schema -struct<(CAST(10.30000 AS DECIMAL(7,5)) * CAST(30.0 AS DECIMAL(7,5))):decimal(11,6)> +struct<(CAST(10.3 AS DECIMAL(3,1)) * CAST(3.0 AS DECIMAL(3,1))):decimal(6,2)> -- !query 23 output -309 +30.9 -- !query 24 -select 10.300000000000000000 * 3.000000000000000000 +select 10.3000 * 3.0 -- !query 24 schema -struct<(CAST(10.300000000000000000 AS DECIMAL(20,18)) * CAST(3.000000000000000000 AS DECIMAL(20,18))):decimal(38,34)> +struct<(CAST(10.3000 AS DECIMAL(6,4)) * CAST(3.0 AS DECIMAL(6,4))):decimal(9,5)> -- !query 24 output 30.9 -- !query 25 -select 10.300000000000000000 * 3.0000000000000000000 +select 10.30000 * 30.0 -- !query 25 schema -struct<(CAST(10.300000000000000000 AS DECIMAL(21,19)) * CAST(3.0000000000000000000 AS DECIMAL(21,19))):decimal(38,34)> +struct<(CAST(10.30000 AS DECIMAL(7,5)) * CAST(30.0 AS DECIMAL(7,5))):decimal(11,6)> -- !query 25 output -30.9 +309 -- !query 26 -select (5e36 + 0.1) + 5e36 +select 10.300000000000000000 * 3.000000000000000000 -- !query 26 schema -struct<(CAST((CAST(5E+36 AS DECIMAL(38,1)) + CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) + CAST(5E+36 AS DECIMAL(38,1))):decimal(38,1)> +struct<(CAST(10.300000000000000000 AS DECIMAL(20,18)) * CAST(3.000000000000000000 AS DECIMAL(20,18))):decimal(38,36)> -- !query 26 output -NULL +30.9 -- !query 27 -select (-4e36 - 0.1) - 7e36 +select 10.300000000000000000 * 3.0000000000000000000 -- !query 27 schema -struct<(CAST((CAST(-4E+36 AS DECIMAL(38,1)) - CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) - CAST(7E+36 AS DECIMAL(38,1))):decimal(38,1)> +struct<(CAST(10.300000000000000000 AS DECIMAL(21,19)) * CAST(3.0000000000000000000 AS DECIMAL(21,19))):decimal(38,37)> -- !query 27 output NULL -- !query 28 -select 
12345678901234567890.0 * 12345678901234567890.0 +select (5e36 + 0.1) + 5e36 -- !query 28 schema -struct<(12345678901234567890.0 * 12345678901234567890.0):decimal(38,2)> +struct<(CAST((CAST(5E+36 AS DECIMAL(38,1)) + CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) + CAST(5E+36 AS DECIMAL(38,1))):decimal(38,1)> -- !query 28 output NULL -- !query 29 -select 1e35 / 0.1 +select (-4e36 - 0.1) - 7e36 -- !query 29 schema -struct<(CAST(1E+35 AS DECIMAL(37,1)) / CAST(0.1 AS DECIMAL(37,1))):decimal(38,6)> +struct<(CAST((CAST(-4E+36 AS DECIMAL(38,1)) - CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) - CAST(7E+36 AS DECIMAL(38,1))):decimal(38,1)> -- !query 29 output NULL -- !query 30 -select 123456789123456789.1234567890 * 1.123456789123456789 +select 12345678901234567890.0 * 12345678901234567890.0 -- !query 30 schema -struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,18)> +struct<(12345678901234567890.0 * 12345678901234567890.0):decimal(38,2)> -- !query 30 output -138698367904130467.654320988515622621 +NULL -- !query 31 -select 0.001 / 9876543210987654321098765432109876543.2 - -drop table decimals_test +select 1e35 / 0.1 -- !query 31 schema -struct<> +struct<(CAST(1E+35 AS DECIMAL(37,1)) / CAST(0.1 AS DECIMAL(37,1))):decimal(38,3)> -- !query 31 output -org.apache.spark.sql.catalyst.parser.ParseException +NULL -mismatched input 'table' expecting (line 3, pos 5) -== SQL == -select 0.001 / 9876543210987654321098765432109876543.2 +-- !query 32 +select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345 +-- !query 32 schema +struct<(CAST(12345678912345678912345678912.1234567 AS DECIMAL(38,7)) + CAST(9999999999999999999999999999999.12345 AS DECIMAL(38,7))):decimal(38,7)> +-- !query 32 output +NULL + + +-- !query 33 +select 123456789123456789.1234567890 * 1.123456789123456789 +-- !query 33 schema +struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,28)> +-- !query 33 output +NULL + +-- !query 34 +select 12345678912345.123456789123 / 0.000000012345678 +-- !query 34 schema +struct<(CAST(12345678912345.123456789123 AS DECIMAL(29,15)) / CAST(1.2345678E-8 AS DECIMAL(29,15))):decimal(38,18)> +-- !query 34 output +NULL + + +-- !query 35 drop table decimals_test ------^^^ +-- !query 35 schema +struct<> +-- !query 35 output + From e688ffee20cf9d7124e03b28521e31e10d0bb7f3 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Mon, 5 Feb 2018 18:41:49 +0800 Subject: [PATCH 249/566] [SPARK-23307][WEBUI] Sort jobs/stages/tasks/queries with the completed timestamp before cleaning up them ## What changes were proposed in this pull request? Sort jobs/stages/tasks/queries with the completed timestamp before cleaning up them to make the behavior consistent with 2.2. ## How was this patch tested? - Jenkins. - Manually ran the following codes and checked the UI for jobs/stages/tasks/queries. ``` spark.ui.retainedJobs 10 spark.ui.retainedStages 10 spark.sql.ui.retainedExecutions 10 spark.ui.retainedTasks 10 ``` ``` new Thread() { override def run() { spark.range(1, 2).foreach { i => Thread.sleep(10000) } } }.start() Thread.sleep(5000) for (_ <- 1 to 20) { new Thread() { override def run() { spark.range(1, 2).foreach { i => } } }.start() } Thread.sleep(15000) spark.range(1, 2).foreach { i => } sc.makeRDD(1 to 100, 100).foreach { i => } ``` Author: Shixiong Zhu Closes #20481 from zsxwing/SPARK-23307. 
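The reproduction above sets the retention limits through configuration properties; an equivalent programmatic sketch (using only the keys quoted in the steps above) would be:
```
import org.apache.spark.SparkConf

// Keep only the 10 most recently completed entries of each kind in the UI store,
// matching the reproduction settings quoted above.
val conf = new SparkConf()
  .set("spark.ui.retainedJobs", "10")
  .set("spark.ui.retainedStages", "10")
  .set("spark.ui.retainedTasks", "10")
  .set("spark.sql.ui.retainedExecutions", "10")
```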
(cherry picked from commit a6bf3db20773ba65cbc4f2775db7bd215e78829a) Signed-off-by: Wenchen Fan --- .../spark/status/AppStatusListener.scala | 13 +-- .../org/apache/spark/status/storeTypes.scala | 7 ++ .../spark/status/AppStatusListenerSuite.scala | 90 +++++++++++++++++++ .../execution/ui/SQLAppStatusListener.scala | 4 +- .../sql/execution/ui/SQLAppStatusStore.scala | 9 +- .../ui/SQLAppStatusListenerSuite.scala | 45 ++++++++++ 6 files changed, 158 insertions(+), 10 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index b4edcf23abc09..8ac029191b1eb 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -850,8 +850,8 @@ private[spark] class AppStatusListener( return } - val toDelete = KVUtils.viewToSeq(kvstore.view(classOf[JobDataWrapper]), - countToDelete.toInt) { j => + val view = kvstore.view(classOf[JobDataWrapper]).index("completionTime").first(0L) + val toDelete = KVUtils.viewToSeq(view, countToDelete.toInt) { j => j.info.status != JobExecutionStatus.RUNNING && j.info.status != JobExecutionStatus.UNKNOWN } toDelete.foreach { j => kvstore.delete(j.getClass(), j.info.jobId) } @@ -863,8 +863,8 @@ private[spark] class AppStatusListener( return } - val stages = KVUtils.viewToSeq(kvstore.view(classOf[StageDataWrapper]), - countToDelete.toInt) { s => + val view = kvstore.view(classOf[StageDataWrapper]).index("completionTime").first(0L) + val stages = KVUtils.viewToSeq(view, countToDelete.toInt) { s => s.info.status != v1.StageStatus.ACTIVE && s.info.status != v1.StageStatus.PENDING } @@ -920,8 +920,9 @@ private[spark] class AppStatusListener( val countToDelete = calculateNumberToRemove(stage.savedTasks.get(), maxTasksPerStage).toInt if (countToDelete > 0) { val stageKey = Array(stage.info.stageId, stage.info.attemptNumber) - val view = kvstore.view(classOf[TaskDataWrapper]).index("stage").first(stageKey) - .last(stageKey) + val view = kvstore.view(classOf[TaskDataWrapper]) + .index(TaskIndexNames.COMPLETION_TIME) + .parent(stageKey) // Try to delete finished tasks only. 
val toDelete = KVUtils.viewToSeq(view, countToDelete) { t => diff --git a/core/src/main/scala/org/apache/spark/status/storeTypes.scala b/core/src/main/scala/org/apache/spark/status/storeTypes.scala index c9cb996a55fcc..412644d3657b5 100644 --- a/core/src/main/scala/org/apache/spark/status/storeTypes.scala +++ b/core/src/main/scala/org/apache/spark/status/storeTypes.scala @@ -73,6 +73,8 @@ private[spark] class JobDataWrapper( @JsonIgnore @KVIndex private def id: Int = info.jobId + @JsonIgnore @KVIndex("completionTime") + private def completionTime: Long = info.completionTime.map(_.getTime).getOrElse(-1L) } private[spark] class StageDataWrapper( @@ -90,6 +92,8 @@ private[spark] class StageDataWrapper( @JsonIgnore @KVIndex("active") private def active: Boolean = info.status == StageStatus.ACTIVE + @JsonIgnore @KVIndex("completionTime") + private def completionTime: Long = info.completionTime.map(_.getTime).getOrElse(-1L) } /** @@ -134,6 +138,7 @@ private[spark] object TaskIndexNames { final val STAGE = "stage" final val STATUS = "sta" final val TASK_INDEX = "idx" + final val COMPLETION_TIME = "ct" } /** @@ -337,6 +342,8 @@ private[spark] class TaskDataWrapper( @JsonIgnore @KVIndex(value = TaskIndexNames.ERROR, parent = TaskIndexNames.STAGE) private def error: String = if (errorMessage.isDefined) errorMessage.get else "" + @JsonIgnore @KVIndex(value = TaskIndexNames.COMPLETION_TIME, parent = TaskIndexNames.STAGE) + private def completionTime: Long = launchTime + duration } private[spark] class RDDStorageInfoWrapper(val info: RDDStorageInfo) { diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index e7981bec6d64b..01d76a2cfb2a8 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -967,6 +967,96 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } } + test("eviction should respect job completion time") { + val testConf = conf.clone().set(MAX_RETAINED_JOBS, 2) + val listener = new AppStatusListener(store, testConf, true) + + // Start job 1 and job 2 + time += 1 + listener.onJobStart(SparkListenerJobStart(1, time, Nil, null)) + time += 1 + listener.onJobStart(SparkListenerJobStart(2, time, Nil, null)) + + // Stop job 2 before job 1 + time += 1 + listener.onJobEnd(SparkListenerJobEnd(2, time, JobSucceeded)) + time += 1 + listener.onJobEnd(SparkListenerJobEnd(1, time, JobSucceeded)) + + // Start job 3 and job 2 should be evicted. 
+ time += 1 + listener.onJobStart(SparkListenerJobStart(3, time, Nil, null)) + assert(store.count(classOf[JobDataWrapper]) === 2) + intercept[NoSuchElementException] { + store.read(classOf[JobDataWrapper], 2) + } + } + + test("eviction should respect stage completion time") { + val testConf = conf.clone().set(MAX_RETAINED_STAGES, 2) + val listener = new AppStatusListener(store, testConf, true) + + val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1") + val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2") + val stage3 = new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3") + + // Start stage 1 and stage 2 + time += 1 + stage1.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(stage1, new Properties())) + time += 1 + stage2.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(stage2, new Properties())) + + // Stop stage 2 before stage 1 + time += 1 + stage2.completionTime = Some(time) + listener.onStageCompleted(SparkListenerStageCompleted(stage2)) + time += 1 + stage1.completionTime = Some(time) + listener.onStageCompleted(SparkListenerStageCompleted(stage1)) + + // Start stage 3 and stage 2 should be evicted. + stage3.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(stage3, new Properties())) + assert(store.count(classOf[StageDataWrapper]) === 2) + intercept[NoSuchElementException] { + store.read(classOf[StageDataWrapper], Array(2, 0)) + } + } + + test("eviction should respect task completion time") { + val testConf = conf.clone().set(MAX_RETAINED_TASKS_PER_STAGE, 2) + val listener = new AppStatusListener(store, testConf, true) + + val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1") + stage1.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(stage1, new Properties())) + + // Start task 1 and task 2 + val tasks = createTasks(3, Array("1")) + tasks.take(2).foreach { task => + listener.onTaskStart(SparkListenerTaskStart(stage1.stageId, stage1.attemptNumber, task)) + } + + // Stop task 2 before task 1 + time += 1 + tasks(1).markFinished(TaskState.FINISHED, time) + listener.onTaskEnd( + SparkListenerTaskEnd(stage1.stageId, stage1.attemptId, "taskType", Success, tasks(1), null)) + time += 1 + tasks(0).markFinished(TaskState.FINISHED, time) + listener.onTaskEnd( + SparkListenerTaskEnd(stage1.stageId, stage1.attemptId, "taskType", Success, tasks(0), null)) + + // Start task 3 and task 2 should be evicted. 
+ listener.onTaskStart(SparkListenerTaskStart(stage1.stageId, stage1.attemptNumber, tasks(2))) + assert(store.count(classOf[TaskDataWrapper]) === 2) + intercept[NoSuchElementException] { + store.read(classOf[TaskDataWrapper], tasks(1).id) + } + } + test("driver logs") { val listener = new AppStatusListener(store, conf, true) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index 73a105266e1c1..53fb9a0cc21cf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -332,8 +332,8 @@ class SQLAppStatusListener( return } - val toDelete = KVUtils.viewToSeq(kvstore.view(classOf[SQLExecutionUIData]), - countToDelete.toInt) { e => e.completionTime.isDefined } + val view = kvstore.view(classOf[SQLExecutionUIData]).index("completionTime").first(0L) + val toDelete = KVUtils.viewToSeq(view, countToDelete.toInt)(_.completionTime.isDefined) toDelete.foreach { e => kvstore.delete(e.getClass(), e.executionId) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala index 910f2e52fdbb3..9a76584717f42 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala @@ -23,11 +23,12 @@ import java.util.Date import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer +import com.fasterxml.jackson.annotation.JsonIgnore import com.fasterxml.jackson.databind.annotation.JsonDeserialize import org.apache.spark.JobExecutionStatus import org.apache.spark.status.KVUtils.KVIndexParam -import org.apache.spark.util.kvstore.KVStore +import org.apache.spark.util.kvstore.{KVIndex, KVStore} /** * Provides a view of a KVStore with methods that make it easy to query SQL-specific state. There's @@ -90,7 +91,11 @@ class SQLExecutionUIData( * from the SQL listener instance. 
*/ @JsonDeserialize(keyAs = classOf[JLong]) - val metricValues: Map[Long, String]) + val metricValues: Map[Long, String]) { + + @JsonIgnore @KVIndex("completionTime") + private def completionTimeIndex: Long = completionTime.map(_.getTime).getOrElse(-1L) +} class SparkPlanGraphWrapper( @KVIndexParam val executionId: Long, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala index 7d84f45d36bee..85face3994fd4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala @@ -35,6 +35,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.catalyst.util.quietly import org.apache.spark.sql.execution.{LeafExecNode, QueryExecution, SparkPlanInfo, SQLExecution} import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} +import org.apache.spark.sql.internal.StaticSQLConf.UI_RETAINED_EXECUTIONS import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.status.ElementTrackingStore import org.apache.spark.status.config._ @@ -510,6 +511,50 @@ class SQLAppStatusListenerSuite extends SparkFunSuite with SharedSQLContext with } } + test("eviction should respect execution completion time") { + val conf = sparkContext.conf.clone().set(UI_RETAINED_EXECUTIONS.key, "2") + val store = new ElementTrackingStore(new InMemoryStore, conf) + val listener = new SQLAppStatusListener(conf, store, live = true) + val statusStore = new SQLAppStatusStore(store, Some(listener)) + + var time = 0 + val df = createTestDataFrame + // Start execution 1 and execution 2 + time += 1 + listener.onOtherEvent(SparkListenerSQLExecutionStart( + 1, + "test", + "test", + df.queryExecution.toString, + SparkPlanInfo.fromSparkPlan(df.queryExecution.executedPlan), + time)) + time += 1 + listener.onOtherEvent(SparkListenerSQLExecutionStart( + 2, + "test", + "test", + df.queryExecution.toString, + SparkPlanInfo.fromSparkPlan(df.queryExecution.executedPlan), + time)) + + // Stop execution 2 before execution 1 + time += 1 + listener.onOtherEvent(SparkListenerSQLExecutionEnd(2, time)) + time += 1 + listener.onOtherEvent(SparkListenerSQLExecutionEnd(1, time)) + + // Start execution 3 and execution 2 should be evicted. + time += 1 + listener.onOtherEvent(SparkListenerSQLExecutionStart( + 3, + "test", + "test", + df.queryExecution.toString, + SparkPlanInfo.fromSparkPlan(df.queryExecution.executedPlan), + time)) + assert(statusStore.executionsCount === 2) + assert(statusStore.execution(2) === None) + } } From 173449c2bd454a87487f8eebf7d74ee6ed505290 Mon Sep 17 00:00:00 2001 From: Sital Kedia Date: Mon, 5 Feb 2018 10:19:18 -0800 Subject: [PATCH 250/566] [SPARK-23310][CORE] Turn off read ahead input stream for unshafe shuffle reader To fix regression for TPC-DS queries Author: Sital Kedia Closes #20492 from sitalkedia/turn_off_async_inputstream. 
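Since this patch only flips the default, users who still want the asynchronous read-ahead spill stream can opt back in explicitly; a sketch, with the key taken from the `UnsafeSorterSpillReader` hunk below:
```
import org.apache.spark.SparkConf

// Re-enable the read-ahead input stream for spill reads, which this patch
// turns off by default to avoid the reported lock contention.
val conf = new SparkConf()
  .set("spark.unsafe.sorter.spill.read.ahead.enabled", "true")
```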
(cherry picked from commit 03b7e120dd7ff7848c936c7a23644da5bd7219ab) Signed-off-by: Sameer Agarwal --- .../util/collection/unsafe/sort/UnsafeSorterSpillReader.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java index e2f48e5508af6..71e7c7a95ebdb 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java @@ -76,8 +76,10 @@ public UnsafeSorterSpillReader( SparkEnv.get() == null ? 0.5 : SparkEnv.get().conf().getDouble("spark.unsafe.sorter.spill.read.ahead.fraction", 0.5); + // SPARK-23310: Disable read-ahead input stream, because it is causing lock contention and perf regression for + // TPC-DS queries. final boolean readAheadEnabled = SparkEnv.get() != null && - SparkEnv.get().conf().getBoolean("spark.unsafe.sorter.spill.read.ahead.enabled", true); + SparkEnv.get().conf().getBoolean("spark.unsafe.sorter.spill.read.ahead.enabled", false); final InputStream bs = new NioBufferedFileInputStream(file, (int) bufferSizeBytes); From 4aa9aafcd542d5f28b7e6bb756c2e965010a757c Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Mon, 5 Feb 2018 14:17:11 -0800 Subject: [PATCH 251/566] [SPARK-23330][WEBUI] Spark UI SQL executions page throws NPE ## What changes were proposed in this pull request? Spark SQL executions page throws the following error and the page crashes: ``` HTTP ERROR 500 Problem accessing /SQL/. Reason: Server Error Caused by: java.lang.NullPointerException at scala.collection.immutable.StringOps$.length$extension(StringOps.scala:47) at scala.collection.immutable.StringOps.length(StringOps.scala:47) at scala.collection.IndexedSeqOptimized$class.isEmpty(IndexedSeqOptimized.scala:27) at scala.collection.immutable.StringOps.isEmpty(StringOps.scala:29) at scala.collection.TraversableOnce$class.nonEmpty(TraversableOnce.scala:111) at scala.collection.immutable.StringOps.nonEmpty(StringOps.scala:29) at org.apache.spark.sql.execution.ui.ExecutionTable.descriptionCell(AllExecutionsPage.scala:182) at org.apache.spark.sql.execution.ui.ExecutionTable.row(AllExecutionsPage.scala:155) at org.apache.spark.sql.execution.ui.ExecutionTable$$anonfun$8.apply(AllExecutionsPage.scala:204) at org.apache.spark.sql.execution.ui.ExecutionTable$$anonfun$8.apply(AllExecutionsPage.scala:204) at org.apache.spark.ui.UIUtils$$anonfun$listingTable$2.apply(UIUtils.scala:339) at org.apache.spark.ui.UIUtils$$anonfun$listingTable$2.apply(UIUtils.scala:339) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) at scala.collection.TraversableLike$class.map(TraversableLike.scala:234) at scala.collection.AbstractTraversable.map(Traversable.scala:104) at org.apache.spark.ui.UIUtils$.listingTable(UIUtils.scala:339) at org.apache.spark.sql.execution.ui.ExecutionTable.toNodeSeq(AllExecutionsPage.scala:203) at org.apache.spark.sql.execution.ui.AllExecutionsPage.render(AllExecutionsPage.scala:67) at org.apache.spark.ui.WebUI$$anonfun$2.apply(WebUI.scala:82) at org.apache.spark.ui.WebUI$$anonfun$2.apply(WebUI.scala:82) at 
org.apache.spark.ui.JettyUtils$$anon$3.doGet(JettyUtils.scala:90) at javax.servlet.http.HttpServlet.service(HttpServlet.java:687) at javax.servlet.http.HttpServlet.service(HttpServlet.java:790) at org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:848) at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:584) at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1180) at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:512) at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1112) at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141) at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:213) at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:134) at org.eclipse.jetty.server.Server.handle(Server.java:534) at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:320) at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:251) at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:283) at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:108) at org.eclipse.jetty.io.SelectChannelEndPoint$2.run(SelectChannelEndPoint.java:93) at org.eclipse.jetty.util.thread.strategy.ExecuteProduceConsume.executeProduceConsume(ExecuteProduceConsume.java:303) at org.eclipse.jetty.util.thread.strategy.ExecuteProduceConsume.produceConsume(ExecuteProduceConsume.java:148) at org.eclipse.jetty.util.thread.strategy.ExecuteProduceConsume.run(ExecuteProduceConsume.java:136) at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:671) at org.eclipse.jetty.util.thread.QueuedThreadPool$2.run(QueuedThreadPool.java:589) at java.lang.Thread.run(Thread.java:748) ``` One of the possible reason that this page fails may be the `SparkListenerSQLExecutionStart` event get dropped before processed, so the execution description and details don't get updated. This was not a issue in 2.2 because it would ignore any job start event that arrives before the corresponding execution start event, which doesn't sound like a good decision. We shall try to handle the null values in the front page side, that is, try to give a default value when `execution.details` or `execution.description` is null. Another possible approach is not to spill the `LiveExecutionData` in `SQLAppStatusListener.update(exec: LiveExecutionData)` if `exec.details` is null. This is not ideal because this way you will not see the execution if `SparkListenerSQLExecutionStart` event is lost, because `AllExecutionsPage` only read executions from KVStore. ## How was this patch tested? After the change, the page shows the following: ![image](https://user-images.githubusercontent.com/4784782/35775480-28cc5fde-093e-11e8-8ccc-f58c2ef4a514.png) Author: Xingbo Jiang Closes #20502 from jiangxb1987/executionPage. 
(cherry picked from commit c2766b07b4b9ed976931966a79c65043e81cf694) Signed-off-by: Marcelo Vanzin --- .../apache/spark/sql/execution/ui/AllExecutionsPage.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala index 7019d98e1619f..e751ce39cd5d7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala @@ -179,7 +179,7 @@ private[ui] abstract class ExecutionTable( } private def descriptionCell(execution: SQLExecutionUIData): Seq[Node] = { - val details = if (execution.details.nonEmpty) { + val details = if (execution.details != null && execution.details.nonEmpty) { +details ++ @@ -190,8 +190,10 @@ private[ui] abstract class ExecutionTable( Nil } - val desc = { + val desc = if (execution.description != null && execution.description.nonEmpty) { {execution.description} + } else { + {execution.executionId} }
{desc} {details}
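The essence of the fix above is a null guard: when the `SparkListenerSQLExecutionStart` event is dropped, `description` and `details` stay null, and calling `nonEmpty` on a null String is what throws the NPE in the stack trace. A standalone sketch of the same defensive pattern (illustrative only; the helper name is hypothetical):

```
// Hypothetical helper showing the guard applied in descriptionCell above:
// fall back to the execution id when the description was never populated.
def safeDescription(description: String, executionId: Long): String = {
  if (description != null && description.nonEmpty) description
  else executionId.toString
}
```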
From 521494d7bdcbb6699e0b12cd3ff60fc27908938f Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Tue, 6 Feb 2018 14:42:42 +0800 Subject: [PATCH 252/566] [SPARK-23326][WEBUI] schedulerDelay should return 0 when the task is running ## What changes were proposed in this pull request? When a task is still running, metrics like executorRunTime are not available. Then `schedulerDelay` will be almost the same as `duration` and that's confusing. This PR makes `schedulerDelay` return 0 when the task is running which is the same behavior as 2.2. ## How was this patch tested? `AppStatusUtilsSuite.schedulerDelay` Author: Shixiong Zhu Closes #20493 from zsxwing/SPARK-23326. (cherry picked from commit f3f1e14bb73dfdd2927d95b12d7d61d22de8a0ac) Signed-off-by: Wenchen Fan --- .../apache/spark/status/AppStatusUtils.scala | 11 ++- .../spark/status/AppStatusUtilsSuite.scala | 89 +++++++++++++++++++ 2 files changed, 98 insertions(+), 2 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/status/AppStatusUtilsSuite.scala diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusUtils.scala b/core/src/main/scala/org/apache/spark/status/AppStatusUtils.scala index 341bd4e0cd016..87f434daf4870 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusUtils.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusUtils.scala @@ -17,16 +17,23 @@ package org.apache.spark.status -import org.apache.spark.status.api.v1.{TaskData, TaskMetrics} +import org.apache.spark.status.api.v1.TaskData private[spark] object AppStatusUtils { + private val TASK_FINISHED_STATES = Set("FAILED", "KILLED", "SUCCESS") + + private def isTaskFinished(task: TaskData): Boolean = { + TASK_FINISHED_STATES.contains(task.status) + } + def schedulerDelay(task: TaskData): Long = { - if (task.taskMetrics.isDefined && task.duration.isDefined) { + if (isTaskFinished(task) && task.taskMetrics.isDefined && task.duration.isDefined) { val m = task.taskMetrics.get schedulerDelay(task.launchTime.getTime(), fetchStart(task), task.duration.get, m.executorDeserializeTime, m.resultSerializationTime, m.executorRunTime) } else { + // The task is still running and the metrics like executorRunTime are not available. 0L } } diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusUtilsSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusUtilsSuite.scala new file mode 100644 index 0000000000000..9e74e86ad54b9 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/status/AppStatusUtilsSuite.scala @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.status + +import java.util.Date + +import org.apache.spark.SparkFunSuite +import org.apache.spark.status.api.v1.{TaskData, TaskMetrics} + +class AppStatusUtilsSuite extends SparkFunSuite { + + test("schedulerDelay") { + val runningTask = new TaskData( + taskId = 0, + index = 0, + attempt = 0, + launchTime = new Date(1L), + resultFetchStart = None, + duration = Some(100L), + executorId = "1", + host = "localhost", + status = "RUNNING", + taskLocality = "PROCESS_LOCAL", + speculative = false, + accumulatorUpdates = Nil, + errorMessage = None, + taskMetrics = Some(new TaskMetrics( + executorDeserializeTime = 0L, + executorDeserializeCpuTime = 0L, + executorRunTime = 0L, + executorCpuTime = 0L, + resultSize = 0L, + jvmGcTime = 0L, + resultSerializationTime = 0L, + memoryBytesSpilled = 0L, + diskBytesSpilled = 0L, + peakExecutionMemory = 0L, + inputMetrics = null, + outputMetrics = null, + shuffleReadMetrics = null, + shuffleWriteMetrics = null))) + assert(AppStatusUtils.schedulerDelay(runningTask) === 0L) + + val finishedTask = new TaskData( + taskId = 0, + index = 0, + attempt = 0, + launchTime = new Date(1L), + resultFetchStart = None, + duration = Some(100L), + executorId = "1", + host = "localhost", + status = "SUCCESS", + taskLocality = "PROCESS_LOCAL", + speculative = false, + accumulatorUpdates = Nil, + errorMessage = None, + taskMetrics = Some(new TaskMetrics( + executorDeserializeTime = 5L, + executorDeserializeCpuTime = 3L, + executorRunTime = 90L, + executorCpuTime = 10L, + resultSize = 100L, + jvmGcTime = 10L, + resultSerializationTime = 2L, + memoryBytesSpilled = 0L, + diskBytesSpilled = 0L, + peakExecutionMemory = 100L, + inputMetrics = null, + outputMetrics = null, + shuffleReadMetrics = null, + shuffleWriteMetrics = null))) + assert(AppStatusUtils.schedulerDelay(finishedTask) === 3L) + } +} From 44933033e9216ccb2e533b9dc6e6cb03cce39817 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Tue, 6 Feb 2018 18:29:37 +0900 Subject: [PATCH 253/566] [SPARK-23290][SQL][PYTHON][BACKPORT-2.3] Use datetime.date for date type when converting Spark DataFrame to Pandas DataFrame. ## What changes were proposed in this pull request? This is a backport of #20506. In #18664, there was a change in how `DateType` is being returned to users ([line 1968 in dataframe.py](https://github.com/apache/spark/pull/18664/files#diff-6fc344560230bf0ef711bb9b5573f1faR1968)). This can cause client code which works in Spark 2.2 to fail. See [SPARK-23290](https://issues.apache.org/jira/browse/SPARK-23290?focusedCommentId=16350917&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-16350917) for an example. This pr modifies to use `datetime.date` for date type as Spark 2.2 does. ## How was this patch tested? Tests modified to fit the new behavior and existing tests. Author: Takuya UESHIN Closes #20515 from ueshin/issues/SPARK-23290_2.3. --- python/pyspark/serializers.py | 9 ++++-- python/pyspark/sql/dataframe.py | 7 ++-- python/pyspark/sql/tests.py | 57 ++++++++++++++++++++++++--------- python/pyspark/sql/types.py | 15 +++++++++ 4 files changed, 66 insertions(+), 22 deletions(-) diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 88d6a191babca..e870325d202ca 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -267,12 +267,15 @@ def load_stream(self, stream): """ Deserialize ArrowRecordBatches to an Arrow table and return as a list of pandas.Series. 
""" - from pyspark.sql.types import _check_dataframe_localize_timestamps + from pyspark.sql.types import from_arrow_schema, _check_dataframe_convert_date, \ + _check_dataframe_localize_timestamps import pyarrow as pa reader = pa.open_stream(stream) + schema = from_arrow_schema(reader.schema) for batch in reader: - # NOTE: changed from pa.Columns.to_pandas, timezone issue in conversion fixed in 0.7.1 - pdf = _check_dataframe_localize_timestamps(batch.to_pandas(), self._timezone) + pdf = batch.to_pandas() + pdf = _check_dataframe_convert_date(pdf, schema) + pdf = _check_dataframe_localize_timestamps(pdf, self._timezone) yield [c for _, c in pdf.iteritems()] def __repr__(self): diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 2e55407b5397b..59a417015b949 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1923,7 +1923,8 @@ def toPandas(self): if self.sql_ctx.getConf("spark.sql.execution.arrow.enabled", "false").lower() == "true": try: - from pyspark.sql.types import _check_dataframe_localize_timestamps + from pyspark.sql.types import _check_dataframe_convert_date, \ + _check_dataframe_localize_timestamps from pyspark.sql.utils import require_minimum_pyarrow_version import pyarrow require_minimum_pyarrow_version() @@ -1931,6 +1932,7 @@ def toPandas(self): if tables: table = pyarrow.concat_tables(tables) pdf = table.to_pandas() + pdf = _check_dataframe_convert_date(pdf, self.schema) return _check_dataframe_localize_timestamps(pdf, timezone) else: return pd.DataFrame.from_records([], columns=self.columns) @@ -2009,7 +2011,6 @@ def _to_corrected_pandas_type(dt): """ When converting Spark SQL records to Pandas DataFrame, the inferred data type may be wrong. This method gets the corrected data type for Pandas if that type may be inferred uncorrectly. 
- NOTE: DateType is inferred incorrectly as 'object', TimestampType is correct with datetime64[ns] """ import numpy as np if type(dt) == ByteType: @@ -2020,8 +2021,6 @@ def _to_corrected_pandas_type(dt): return np.int32 elif type(dt) == FloatType: return np.float32 - elif type(dt) == DateType: - return 'datetime64[ns]' else: return None diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 0d5bc137abf47..95b9c0e4412dd 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -2810,7 +2810,7 @@ def test_to_pandas(self): self.assertEquals(types[1], np.object) self.assertEquals(types[2], np.bool) self.assertEquals(types[3], np.float32) - self.assertEquals(types[4], 'datetime64[ns]') + self.assertEquals(types[4], np.object) # datetime.date self.assertEquals(types[5], 'datetime64[ns]') @unittest.skipIf(not _have_old_pandas, "Old Pandas not installed") @@ -3356,7 +3356,7 @@ class ArrowTests(ReusedSQLTestCase): @classmethod def setUpClass(cls): - from datetime import datetime + from datetime import date, datetime from decimal import Decimal ReusedSQLTestCase.setUpClass() @@ -3378,11 +3378,11 @@ def setUpClass(cls): StructField("7_date_t", DateType(), True), StructField("8_timestamp_t", TimestampType(), True)]) cls.data = [(u"a", 1, 10, 0.2, 2.0, Decimal("2.0"), - datetime(1969, 1, 1), datetime(1969, 1, 1, 1, 1, 1)), + date(1969, 1, 1), datetime(1969, 1, 1, 1, 1, 1)), (u"b", 2, 20, 0.4, 4.0, Decimal("4.0"), - datetime(2012, 2, 2), datetime(2012, 2, 2, 2, 2, 2)), + date(2012, 2, 2), datetime(2012, 2, 2, 2, 2, 2)), (u"c", 3, 30, 0.8, 6.0, Decimal("6.0"), - datetime(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3))] + date(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3))] @classmethod def tearDownClass(cls): @@ -3435,7 +3435,9 @@ def _toPandas_arrow_toggle(self, df): def test_toPandas_arrow_toggle(self): df = self.spark.createDataFrame(self.data, schema=self.schema) pdf, pdf_arrow = self._toPandas_arrow_toggle(df) - self.assertFramesEqual(pdf_arrow, pdf) + expected = self.create_pandas_data_frame() + self.assertFramesEqual(expected, pdf) + self.assertFramesEqual(expected, pdf_arrow) def test_toPandas_respect_session_timezone(self): df = self.spark.createDataFrame(self.data, schema=self.schema) @@ -4036,18 +4038,42 @@ def test_vectorized_udf_unsupported_types(self): with self.assertRaisesRegexp(Exception, 'Unsupported data type'): df.select(f(col('map'))).collect() - def test_vectorized_udf_null_date(self): + def test_vectorized_udf_dates(self): from pyspark.sql.functions import pandas_udf, col from datetime import date - schema = StructType().add("date", DateType()) - data = [(date(1969, 1, 1),), - (date(2012, 2, 2),), - (None,), - (date(2100, 4, 4),)] + schema = StructType().add("idx", LongType()).add("date", DateType()) + data = [(0, date(1969, 1, 1),), + (1, date(2012, 2, 2),), + (2, None,), + (3, date(2100, 4, 4),)] df = self.spark.createDataFrame(data, schema=schema) - date_f = pandas_udf(lambda t: t, returnType=DateType()) - res = df.select(date_f(col("date"))) - self.assertEquals(df.collect(), res.collect()) + + date_copy = pandas_udf(lambda t: t, returnType=DateType()) + df = df.withColumn("date_copy", date_copy(col("date"))) + + @pandas_udf(returnType=StringType()) + def check_data(idx, date, date_copy): + import pandas as pd + msgs = [] + is_equal = date.isnull() + for i in range(len(idx)): + if (is_equal[i] and data[idx[i]][1] is None) or \ + date[i] == data[idx[i]][1]: + msgs.append(None) + else: + msgs.append( + "date values are not equal (date='%s': 
data[%d][1]='%s')" + % (date[i], idx[i], data[idx[i]][1])) + return pd.Series(msgs) + + result = df.withColumn("check_data", + check_data(col("idx"), col("date"), col("date_copy"))).collect() + + self.assertEquals(len(data), len(result)) + for i in range(len(result)): + self.assertEquals(data[i][1], result[i][1]) # "date" col + self.assertEquals(data[i][1], result[i][2]) # "date_copy" col + self.assertIsNone(result[i][3]) # "check_data" col def test_vectorized_udf_timestamps(self): from pyspark.sql.functions import pandas_udf, col @@ -4088,6 +4114,7 @@ def check_data(idx, timestamp, timestamp_copy): self.assertEquals(len(data), len(result)) for i in range(len(result)): self.assertEquals(data[i][1], result[i][1]) # "timestamp" col + self.assertEquals(data[i][1], result[i][2]) # "timestamp_copy" col self.assertIsNone(result[i][3]) # "check_data" col def test_vectorized_udf_return_timestamp_tz(self): diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 0dc5823f72a3c..093dae5a22e1f 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1694,6 +1694,21 @@ def from_arrow_schema(arrow_schema): for field in arrow_schema]) +def _check_dataframe_convert_date(pdf, schema): + """ Correct date type value to use datetime.date. + + Pandas DataFrame created from PyArrow uses datetime64[ns] for date type values, but we should + use datetime.date to match the behavior with when Arrow optimization is disabled. + + :param pdf: pandas.DataFrame + :param schema: a Spark schema of the pandas.DataFrame + """ + for field in schema: + if type(field.dataType) == DateType: + pdf[field.name] = pdf[field.name].dt.date + return pdf + + def _check_dataframe_localize_timestamps(pdf, timezone): """ Convert timezone aware timestamps to timezone-naive in the specified timezone or local timezone From a511544822be6e3fc9c6bb5080a163b9acbb41f2 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Tue, 6 Feb 2018 18:30:50 +0900 Subject: [PATCH 254/566] [SPARK-23334][SQL][PYTHON] Fix pandas_udf with return type StringType() to handle str type properly in Python 2. ## What changes were proposed in this pull request? In Python 2, when `pandas_udf` tries to return string type value created in the udf with `".."`, the execution fails. E.g., ```python from pyspark.sql.functions import pandas_udf, col import pandas as pd df = spark.range(10) str_f = pandas_udf(lambda x: pd.Series(["%s" % i for i in x]), "string") df.select(str_f(col('id'))).show() ``` raises the following exception: ``` ... java.lang.AssertionError: assertion failed: Invalid schema from pandas_udf: expected StringType, got BinaryType at scala.Predef$.assert(Predef.scala:170) at org.apache.spark.sql.execution.python.ArrowEvalPythonExec$$anon$2.(ArrowEvalPythonExec.scala:93) ... ``` Seems like pyarrow ignores `type` parameter for `pa.Array.from_pandas()` and consider it as binary type when the type is string type and the string values are `str` instead of `unicode` in Python 2. This pr adds a workaround for the case. ## How was this patch tested? Added a test and existing tests. Author: Takuya UESHIN Closes #20507 from ueshin/issues/SPARK-23334. 
(cherry picked from commit 63c5bf13ce5cd3b8d7e7fb88de881ed207fde720) Signed-off-by: hyukjinkwon --- python/pyspark/serializers.py | 4 ++++ python/pyspark/sql/tests.py | 9 +++++++++ 2 files changed, 13 insertions(+) diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index e870325d202ca..91a7f093cec19 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -230,6 +230,10 @@ def create_array(s, t): s = _check_series_convert_timestamps_internal(s.fillna(0), timezone) # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2 return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False) + elif t is not None and pa.types.is_string(t) and sys.version < '3': + # TODO: need decode before converting to Arrow in Python 2 + return pa.Array.from_pandas(s.apply( + lambda v: v.decode("utf-8") if isinstance(v, str) else v), mask=mask, type=t) return pa.Array.from_pandas(s, mask=mask, type=t) arrs = [create_array(s, t) for s, t in series] diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 95b9c0e4412dd..2577ed7f282ff 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3896,6 +3896,15 @@ def test_vectorized_udf_null_string(self): res = df.select(str_f(col('str'))) self.assertEquals(df.collect(), res.collect()) + def test_vectorized_udf_string_in_udf(self): + from pyspark.sql.functions import pandas_udf, col + import pandas as pd + df = self.spark.range(10) + str_f = pandas_udf(lambda x: pd.Series(map(str, x)), StringType()) + actual = df.select(str_f(col('id'))) + expected = df.select(col('id').cast('string')) + self.assertEquals(expected.collect(), actual.collect()) + def test_vectorized_udf_datatype_string(self): from pyspark.sql.functions import pandas_udf, col df = self.spark.range(10).select( From 7782fd03ab95552dff1d1477887632bbc8f6ee51 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Tue, 6 Feb 2018 10:46:48 -0800 Subject: [PATCH 255/566] [SPARK-23310][CORE][FOLLOWUP] Fix Java style check issues. ## What changes were proposed in this pull request? This is a follow-up of #20492 which broke lint-java checks. This pr fixes the lint-java issues. ``` [ERROR] src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java:[79] (sizes) LineLength: Line is longer than 100 characters (found 114). ``` ## How was this patch tested? Checked manually in my local environment. Author: Takuya UESHIN Closes #20514 from ueshin/issues/SPARK-23310/fup1. (cherry picked from commit 7db9979babe52d15828967c86eb77e3fb2791579) Signed-off-by: Marcelo Vanzin --- .../util/collection/unsafe/sort/UnsafeSorterSpillReader.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java index 71e7c7a95ebdb..2c53c8d809d2e 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java @@ -76,8 +76,8 @@ public UnsafeSorterSpillReader( SparkEnv.get() == null ? 0.5 : SparkEnv.get().conf().getDouble("spark.unsafe.sorter.spill.read.ahead.fraction", 0.5); - // SPARK-23310: Disable read-ahead input stream, because it is causing lock contention and perf regression for - // TPC-DS queries. 
+ // SPARK-23310: Disable read-ahead input stream, because it is causing lock contention and perf + // regression for TPC-DS queries. final boolean readAheadEnabled = SparkEnv.get() != null && SparkEnv.get().conf().getBoolean("spark.unsafe.sorter.spill.read.ahead.enabled", false); From 036a04b29c818ddbe695f7833577781e8bb16d3f Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 6 Feb 2018 12:27:37 -0800 Subject: [PATCH 256/566] [SPARK-23312][SQL][FOLLOWUP] add a config to turn off vectorized cache reader ## What changes were proposed in this pull request? https://github.com/apache/spark/pull/20483 tried to provide a way to turn off the new columnar cache reader, to restore the behavior in 2.2. However even we turn off that config, the behavior is still different than 2.2. If the output data are rows, we still enable whole stage codegen for the scan node, which is different with 2.2, we should also fix it. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #20513 from cloud-fan/cache. (cherry picked from commit ac7454cac04a1d9252b3856360eda5c3e8bcb8da) Signed-off-by: gatorsmile --- .../spark/sql/execution/columnar/InMemoryTableScanExec.scala | 3 +++ .../src/test/scala/org/apache/spark/sql/CachedTableSuite.scala | 3 ++- .../apache/spark/sql/execution/WholeStageCodegenSuite.scala | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index e972f8b30d87c..a93e8a1ad954d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -61,6 +61,9 @@ case class InMemoryTableScanExec( }) && !WholeStageCodegenExec.isTooManyFields(conf, relation.schema) } + // TODO: revisit this. Shall we always turn off whole stage codegen if the output data are rows? 
+ override def supportCodegen: Boolean = supportsBatch + override protected def needsUnsafeRowConversion: Boolean = false private val columnIndices = diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 9f27fa09127af..669e5f2bf4e65 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -787,7 +787,8 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext withSQLConf(SQLConf.CACHE_VECTORIZED_READER_ENABLED.key -> vectorized.toString) { val df = spark.range(10).cache() df.queryExecution.executedPlan.foreach { - case i: InMemoryTableScanExec => assert(i.supportsBatch == vectorized) + case i: InMemoryTableScanExec => + assert(i.supportsBatch == vectorized && i.supportCodegen == vectorized) case _ => } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index 6e8d5a70d5a8f..ef16292a8e75c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -137,7 +137,7 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext { val dsStringFilter = dsString.filter(_ == "1") val planString = dsStringFilter.queryExecution.executedPlan assert(planString.collect { - case WholeStageCodegenExec(FilterExec(_, i: InMemoryTableScanExec)) if !i.supportsBatch => () + case i: InMemoryTableScanExec if !i.supportsBatch => () }.length == 1) assert(dsStringFilter.collect() === Array("1")) } From 77cccc5e154b14f8a9cad829d7fd476e3b6405ce Mon Sep 17 00:00:00 2001 From: Li Jin Date: Tue, 6 Feb 2018 12:30:04 -0800 Subject: [PATCH 257/566] [MINOR][TEST] Fix class name for Pandas UDF tests In https://github.com/apache/spark/commit/b2ce17b4c9fea58140a57ca1846b2689b15c0d61, I mistakenly renamed `VectorizedUDFTests` to `ScalarPandasUDF`. This PR fixes the mistake. Existing tests. Author: Li Jin Closes #20489 from icexelloss/fix-scalar-udf-tests. (cherry picked from commit caf30445632de6aec810309293499199e7a20892) Signed-off-by: gatorsmile --- python/pyspark/sql/tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 2577ed7f282ff..878d402e57546 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3740,7 +3740,7 @@ def foo(k, v): @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") -class VectorizedUDFTests(ReusedSQLTestCase): +class ScalarPandasUDFTests(ReusedSQLTestCase): @classmethod def setUpClass(cls): @@ -4253,7 +4253,7 @@ def test_register_vectorized_udf_basic(self): @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") -class GroupbyApplyTests(ReusedSQLTestCase): +class GroupedMapPandasUDFTests(ReusedSQLTestCase): def assertFramesEqual(self, expected, result): msg = ("DataFrames are not equal: " + From f9c913263219f5e8a375542994142645dd0f6c6a Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 6 Feb 2018 12:43:45 -0800 Subject: [PATCH 258/566] [SPARK-23315][SQL] failed to get output from canonicalized data source v2 related plans ## What changes were proposed in this pull request? 
`DataSourceV2Relation` keeps a `fullOutput` and resolves the real output on demand by column name lookup. i.e. ``` lazy val output: Seq[Attribute] = reader.readSchema().map(_.name).map { name => fullOutput.find(_.name == name).get } ``` This will be broken after we canonicalize the plan, because all attribute names become "None", see https://github.com/apache/spark/blob/v2.3.0-rc1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala#L42 To fix this, `DataSourceV2Relation` should just keep `output`, and update the `output` when doing column pruning. ## How was this patch tested? a new test case Author: Wenchen Fan Closes #20485 from cloud-fan/canonicalize. (cherry picked from commit b96a083b1c6ff0d2c588be9499b456e1adce97dc) Signed-off-by: gatorsmile --- .../v2/DataSourceReaderHolder.scala | 12 +++----- .../datasources/v2/DataSourceV2Relation.scala | 8 ++--- .../datasources/v2/DataSourceV2ScanExec.scala | 4 +-- .../v2/PushDownOperatorsToDataSource.scala | 29 +++++++++++++------ .../sql/sources/v2/DataSourceV2Suite.scala | 20 ++++++++++++- 5 files changed, 48 insertions(+), 25 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala index 6460c97abe344..81219e9771bd8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.v2 import java.util.Objects -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} +import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.sources.v2.reader._ /** @@ -28,9 +28,9 @@ import org.apache.spark.sql.sources.v2.reader._ trait DataSourceReaderHolder { /** - * The full output of the data source reader, without column pruning. + * The output of the data source reader, w.r.t. column pruning. */ - def fullOutput: Seq[AttributeReference] + def output: Seq[Attribute] /** * The held data source reader. 
@@ -46,7 +46,7 @@ trait DataSourceReaderHolder { case s: SupportsPushDownFilters => s.pushedFilters().toSet case _ => Nil } - Seq(fullOutput, reader.getClass, reader.readSchema(), filters) + Seq(output, reader.getClass, filters) } def canEqual(other: Any): Boolean @@ -61,8 +61,4 @@ trait DataSourceReaderHolder { override def hashCode(): Int = { metadata.map(Objects.hashCode).foldLeft(0)((a, b) => 31 * a + b) } - - lazy val output: Seq[Attribute] = reader.readSchema().map(_.name).map { name => - fullOutput.find(_.name == name).get - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala index eebfa29f91b99..38f6b15224788 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics} import org.apache.spark.sql.sources.v2.reader._ case class DataSourceV2Relation( - fullOutput: Seq[AttributeReference], + output: Seq[AttributeReference], reader: DataSourceReader) extends LeafNode with MultiInstanceRelation with DataSourceReaderHolder { @@ -37,7 +37,7 @@ case class DataSourceV2Relation( } override def newInstance(): DataSourceV2Relation = { - copy(fullOutput = fullOutput.map(_.newInstance())) + copy(output = output.map(_.newInstance())) } } @@ -46,8 +46,8 @@ case class DataSourceV2Relation( * to the non-streaming relation. */ class StreamingDataSourceV2Relation( - fullOutput: Seq[AttributeReference], - reader: DataSourceReader) extends DataSourceV2Relation(fullOutput, reader) { + output: Seq[AttributeReference], + reader: DataSourceReader) extends DataSourceV2Relation(output, reader) { override def isStreaming: Boolean = true } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala index df469af2c262a..7d9581be4db89 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala @@ -35,14 +35,12 @@ import org.apache.spark.sql.types.StructType * Physical plan node for scanning data from a data source. 
*/ case class DataSourceV2ScanExec( - fullOutput: Seq[AttributeReference], + output: Seq[AttributeReference], @transient reader: DataSourceReader) extends LeafExecNode with DataSourceReaderHolder with ColumnarBatchScan { override def canEqual(other: Any): Boolean = other.isInstanceOf[DataSourceV2ScanExec] - override def producedAttributes: AttributeSet = AttributeSet(fullOutput) - override def outputPartitioning: physical.Partitioning = reader match { case s: SupportsReportPartitioning => new DataSourcePartitioning( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala index 566a48394f02e..1ca6cbf061b4e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala @@ -81,33 +81,44 @@ object PushDownOperatorsToDataSource extends Rule[LogicalPlan] with PredicateHel // TODO: add more push down rules. - pushDownRequiredColumns(filterPushed, filterPushed.outputSet) + val columnPruned = pushDownRequiredColumns(filterPushed, filterPushed.outputSet) // After column pruning, we may have redundant PROJECT nodes in the query plan, remove them. - RemoveRedundantProject(filterPushed) + RemoveRedundantProject(columnPruned) } // TODO: nested fields pruning - private def pushDownRequiredColumns(plan: LogicalPlan, requiredByParent: AttributeSet): Unit = { + private def pushDownRequiredColumns( + plan: LogicalPlan, requiredByParent: AttributeSet): LogicalPlan = { plan match { - case Project(projectList, child) => + case p @ Project(projectList, child) => val required = projectList.flatMap(_.references) - pushDownRequiredColumns(child, AttributeSet(required)) + p.copy(child = pushDownRequiredColumns(child, AttributeSet(required))) - case Filter(condition, child) => + case f @ Filter(condition, child) => val required = requiredByParent ++ condition.references - pushDownRequiredColumns(child, required) + f.copy(child = pushDownRequiredColumns(child, required)) case relation: DataSourceV2Relation => relation.reader match { case reader: SupportsPushDownRequiredColumns => + // TODO: Enable the below assert after we make `DataSourceV2Relation` immutable. Fow now + // it's possible that the mutable reader being updated by someone else, and we need to + // always call `reader.pruneColumns` here to correct it. + // assert(relation.output.toStructType == reader.readSchema(), + // "Schema of data source reader does not match the relation plan.") + val requiredColumns = relation.output.filter(requiredByParent.contains) reader.pruneColumns(requiredColumns.toStructType) - case _ => + val nameToAttr = relation.output.map(_.name).zip(relation.output).toMap + val newOutput = reader.readSchema().map(_.name).map(nameToAttr) + relation.copy(output = newOutput) + + case _ => relation } // TODO: there may be more operators that can be used to calculate the required columns. We // can add more and more in the future. 
- case _ => plan.children.foreach(child => pushDownRequiredColumns(child, child.outputSet)) + case _ => plan.mapChildren(c => pushDownRequiredColumns(c, c.outputSet)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index e0e034d734f27..6ad0e5f79bc40 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala @@ -24,7 +24,7 @@ import test.org.apache.spark.sql.sources.v2._ import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row} import org.apache.spark.sql.catalyst.expressions.UnsafeRow -import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanExec +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, DataSourceV2ScanExec} import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector import org.apache.spark.sql.functions._ @@ -297,6 +297,24 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { val reader4 = getReader(q4) assert(reader4.requiredSchema.fieldNames === Seq("i")) } + + test("SPARK-23315: get output from canonicalized data source v2 related plans") { + def checkCanonicalizedOutput(df: DataFrame, numOutput: Int): Unit = { + val logical = df.queryExecution.optimizedPlan.collect { + case d: DataSourceV2Relation => d + }.head + assert(logical.canonicalized.output.length == numOutput) + + val physical = df.queryExecution.executedPlan.collect { + case d: DataSourceV2ScanExec => d + }.head + assert(physical.canonicalized.output.length == numOutput) + } + + val df = spark.read.format(classOf[AdvancedDataSourceV2].getName).load() + checkCanonicalizedOutput(df, 2) + checkCanonicalizedOutput(df.select('i), 1) + } } class SimpleDataSourceV2 extends DataSourceV2 with ReadSupport { From 874d3f89fe0f903a6465520c3e6c4788a6865d9a Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 6 Feb 2018 16:46:43 -0800 Subject: [PATCH 259/566] [SPARK-23327][SQL] Update the description and tests of three external API or functions ## What changes were proposed in this pull request? Update the description and tests of three external API or functions `createFunction `, `length` and `repartitionByRange ` ## How was this patch tested? N/A Author: gatorsmile Closes #20495 from gatorsmile/updateFunc. (cherry picked from commit c36fecc3b416c38002779c3cf40b6a665ac4bf13) Signed-off-by: gatorsmile --- R/pkg/R/functions.R | 4 +++- python/pyspark/sql/functions.py | 8 ++++--- .../sql/catalyst/catalog/SessionCatalog.scala | 7 ++++-- .../expressions/stringExpressions.scala | 23 ++++++++++--------- .../scala/org/apache/spark/sql/Dataset.scala | 2 ++ .../sql/execution/command/functions.scala | 14 +++++++---- .../org/apache/spark/sql/functions.scala | 4 +++- .../execution/command/DDLParserSuite.scala | 10 ++++---- 8 files changed, 44 insertions(+), 28 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 55365a41d774b..9f7c6317cd924 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -1026,7 +1026,9 @@ setMethod("last_day", }) #' @details -#' \code{length}: Computes the length of a given string or binary column. +#' \code{length}: Computes the character length of a string data or number of bytes +#' of a binary data. The length of string data includes the trailing spaces. 
+#' The length of binary data includes binary zeros. #' #' @rdname column_string_functions #' @aliases length length,Column-method diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index b637707ac3cf5..52fab7c086efa 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1705,10 +1705,12 @@ def unhex(col): @ignore_unicode_prefix @since(1.5) def length(col): - """Calculates the length of a string or binary expression. + """Computes the character length of string data or number of bytes of binary data. + The length of character data includes the trailing spaces. The length of binary data + includes binary zeros. - >>> spark.createDataFrame([('ABC',)], ['a']).select(length('a').alias('length')).collect() - [Row(length=3)] + >>> spark.createDataFrame([('ABC ',)], ['a']).select(length('a').alias('length')).collect() + [Row(length=4)] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.length(_to_java_column(col))) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index a129896230775..4b119c75260a7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -988,8 +988,11 @@ class SessionCatalog( // ------------------------------------------------------- /** - * Create a metastore function in the database specified in `funcDefinition`. + * Create a function in the database specified in `funcDefinition`. * If no such database is specified, create it in the current database. + * + * @param ignoreIfExists: When true, ignore if the function with the specified name exists + * in the specified database. */ def createFunction(funcDefinition: CatalogFunction, ignoreIfExists: Boolean): Unit = { val db = formatDatabaseName(funcDefinition.identifier.database.getOrElse(getCurrentDatabase)) @@ -1061,7 +1064,7 @@ class SessionCatalog( } /** - * Check if the specified function exists. + * Check if the function with the specified name exists */ def functionExists(name: FunctionIdentifier): Boolean = { val db = formatDatabaseName(name.database.getOrElse(getCurrentDatabase)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 5cf783f1a5979..d7612e30b4c57 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1653,19 +1653,19 @@ case class Left(str: Expression, len: Expression, child: Expression) extends Run * A function that returns the char length of the given string expression or * number of bytes of the given binary expression. */ -// scalastyle:off line.size.limit @ExpressionDescription( - usage = "_FUNC_(expr) - Returns the character length of `expr` or number of bytes in binary data.", + usage = "_FUNC_(expr) - Returns the character length of string data or number of bytes of " + + "binary data. The length of string data includes the trailing spaces. 
The length of binary " + + "data includes binary zeros.", examples = """ Examples: - > SELECT _FUNC_('Spark SQL'); - 9 - > SELECT CHAR_LENGTH('Spark SQL'); - 9 - > SELECT CHARACTER_LENGTH('Spark SQL'); - 9 + > SELECT _FUNC_('Spark SQL '); + 10 + > SELECT CHAR_LENGTH('Spark SQL '); + 10 + > SELECT CHARACTER_LENGTH('Spark SQL '); + 10 """) -// scalastyle:on line.size.limit case class Length(child: Expression) extends UnaryExpression with ImplicitCastInputTypes { override def dataType: DataType = IntegerType override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType)) @@ -1687,7 +1687,7 @@ case class Length(child: Expression) extends UnaryExpression with ImplicitCastIn * A function that returns the bit length of the given string or binary expression. */ @ExpressionDescription( - usage = "_FUNC_(expr) - Returns the bit length of `expr` or number of bits in binary data.", + usage = "_FUNC_(expr) - Returns the bit length of string data or number of bits of binary data.", examples = """ Examples: > SELECT _FUNC_('Spark SQL'); @@ -1716,7 +1716,8 @@ case class BitLength(child: Expression) extends UnaryExpression with ImplicitCas * A function that returns the byte length of the given string or binary expression. */ @ExpressionDescription( - usage = "_FUNC_(expr) - Returns the byte length of `expr` or number of bytes in binary data.", + usage = "_FUNC_(expr) - Returns the byte length of string data or number of bytes of binary " + + "data.", examples = """ Examples: > SELECT _FUNC_('Spark SQL'); diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index d47cd0aecf56a..0aee1d7be5788 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -2825,6 +2825,7 @@ class Dataset[T] private[sql]( * * At least one partition-by expression must be specified. * When no explicit sort order is specified, "ascending nulls first" is assumed. + * Note, the rows are not sorted in each partition of the resulting Dataset. * * @group typedrel * @since 2.3.0 @@ -2848,6 +2849,7 @@ class Dataset[T] private[sql]( * * At least one partition-by expression must be specified. * When no explicit sort order is specified, "ascending nulls first" is assumed. + * Note, the rows are not sorted in each partition of the resulting Dataset. * * @group typedrel * @since 2.3.0 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala index 4f92ffee687aa..1f7808c2f8e80 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala @@ -40,6 +40,10 @@ import org.apache.spark.sql.types.{StringType, StructField, StructType} * CREATE [OR REPLACE] FUNCTION [IF NOT EXISTS] [databaseName.]functionName * AS className [USING JAR\FILE 'uri' [, JAR|FILE 'uri']] * }}} + * + * @param ignoreIfExists: When true, ignore if the function with the specified name exists + * in the specified database. 
+ * @param replace: When true, alter the function with the specified name */ case class CreateFunctionCommand( databaseName: Option[String], @@ -47,17 +51,17 @@ case class CreateFunctionCommand( className: String, resources: Seq[FunctionResource], isTemp: Boolean, - ifNotExists: Boolean, + ignoreIfExists: Boolean, replace: Boolean) extends RunnableCommand { - if (ifNotExists && replace) { + if (ignoreIfExists && replace) { throw new AnalysisException("CREATE FUNCTION with both IF NOT EXISTS and REPLACE" + " is not allowed.") } // Disallow to define a temporary function with `IF NOT EXISTS` - if (ifNotExists && isTemp) { + if (ignoreIfExists && isTemp) { throw new AnalysisException( "It is not allowed to define a TEMPORARY function with IF NOT EXISTS.") } @@ -79,12 +83,12 @@ case class CreateFunctionCommand( // Handles `CREATE OR REPLACE FUNCTION AS ... USING ...` if (replace && catalog.functionExists(func.identifier)) { // alter the function in the metastore - catalog.alterFunction(CatalogFunction(func.identifier, className, resources)) + catalog.alterFunction(func) } else { // For a permanent, we will store the metadata into underlying external catalog. // This function will be loaded into the FunctionRegistry when a query uses it. // We do not load it into FunctionRegistry right now. - catalog.createFunction(CatalogFunction(func.identifier, className, resources), ifNotExists) + catalog.createFunction(func, ignoreIfExists) } } Seq.empty[Row] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 0d11682d80a3c..0d54c02c3d06f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -2267,7 +2267,9 @@ object functions { } /** - * Computes the length of a given string or binary column. + * Computes the character length of a given string or number of bytes of a binary string. + * The length of character strings include the trailing spaces. The length of binary strings + * includes binary zeros. 
* * @group string_funcs * @since 1.5.0 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala index 2b1aea08b1223..e0ccae15f1d05 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala @@ -236,7 +236,7 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { Seq( FunctionResource(FunctionResourceType.fromString("jar"), "/path/to/jar1"), FunctionResource(FunctionResourceType.fromString("jar"), "/path/to/jar2")), - isTemp = true, ifNotExists = false, replace = false) + isTemp = true, ignoreIfExists = false, replace = false) val expected2 = CreateFunctionCommand( Some("hello"), "world", @@ -244,7 +244,7 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { Seq( FunctionResource(FunctionResourceType.fromString("archive"), "/path/to/archive"), FunctionResource(FunctionResourceType.fromString("file"), "/path/to/file")), - isTemp = false, ifNotExists = false, replace = false) + isTemp = false, ignoreIfExists = false, replace = false) val expected3 = CreateFunctionCommand( None, "helloworld3", @@ -252,7 +252,7 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { Seq( FunctionResource(FunctionResourceType.fromString("jar"), "/path/to/jar1"), FunctionResource(FunctionResourceType.fromString("jar"), "/path/to/jar2")), - isTemp = true, ifNotExists = false, replace = true) + isTemp = true, ignoreIfExists = false, replace = true) val expected4 = CreateFunctionCommand( Some("hello"), "world1", @@ -260,7 +260,7 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { Seq( FunctionResource(FunctionResourceType.fromString("archive"), "/path/to/archive"), FunctionResource(FunctionResourceType.fromString("file"), "/path/to/file")), - isTemp = false, ifNotExists = false, replace = true) + isTemp = false, ignoreIfExists = false, replace = true) val expected5 = CreateFunctionCommand( Some("hello"), "world2", @@ -268,7 +268,7 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { Seq( FunctionResource(FunctionResourceType.fromString("archive"), "/path/to/archive"), FunctionResource(FunctionResourceType.fromString("file"), "/path/to/file")), - isTemp = false, ifNotExists = true, replace = false) + isTemp = false, ignoreIfExists = true, replace = false) comparePlans(parsed1, expected1) comparePlans(parsed2, expected2) comparePlans(parsed3, expected3) From cb22e830b0af3f2d760beffea9a79a6d349e4661 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 7 Feb 2018 23:24:16 +0900 Subject: [PATCH 260/566] [SPARK-23122][PYSPARK][FOLLOWUP] Replace registerTempTable by createOrReplaceTempView ## What changes were proposed in this pull request? Replace `registerTempTable` by `createOrReplaceTempView`. ## How was this patch tested? N/A Author: gatorsmile Closes #20523 from gatorsmile/updateExamples. 
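The same rename applies across the language APIs; a short sketch (illustrative only, assuming an existing SparkSession named `spark`) of the replacement call used in the doctest below:

```
// Illustrative only: registerTempTable is deprecated since Spark 2.0;
// createOrReplaceTempView is the supported equivalent.
val df = spark.range(1, 10).toDF("value")
df.createOrReplaceTempView("df")
spark.sql("SELECT count(*) AS cnt FROM df").show()
```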
(cherry picked from commit 9775df67f924663598d51723a878557ddafb8cfd) Signed-off-by: hyukjinkwon --- python/pyspark/sql/udf.py | 2 +- .../src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 82a28c859525c..5a848c26b5fff 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -348,7 +348,7 @@ def registerJavaUDAF(self, name, javaClassName): >>> spark.udf.registerJavaUDAF("javaUDAF", "test.org.apache.spark.sql.MyDoubleAvg") >>> df = spark.createDataFrame([(1, "a"),(2, "b"), (3, "a")],["id", "name"]) - >>> df.registerTempTable("df") + >>> df.createOrReplaceTempView("df") >>> spark.sql("SELECT name, javaUDAF(id) as avg from df group by name").collect() [Row(name=u'b', avg=102.0), Row(name=u'a', avg=102.0)] """ diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java index ddbaa45a483cb..08dc129f27a0c 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java @@ -46,7 +46,7 @@ public void tearDown() { @SuppressWarnings("unchecked") @Test public void udf1Test() { - spark.range(1, 10).toDF("value").registerTempTable("df"); + spark.range(1, 10).toDF("value").createOrReplaceTempView("df"); spark.udf().registerJavaUDAF("myDoubleAvg", MyDoubleAvg.class.getName()); Row result = spark.sql("SELECT myDoubleAvg(value) as my_avg from df").head(); Assert.assertEquals(105.0, result.getDouble(0), 1.0e-6); From 05239afc9e62ef4c71c9f22a930e73888985510a Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 7 Feb 2018 09:48:49 -0800 Subject: [PATCH 261/566] [SPARK-23345][SQL] Remove open stream record even closing it fails ## What changes were proposed in this pull request? When `DebugFilesystem` closes opened stream, if any exception occurs, we still need to remove the open stream record from `DebugFilesystem`. Otherwise, it goes to report leaked filesystem connection. ## How was this patch tested? Existing tests. Author: Liang-Chi Hsieh Closes #20524 from viirya/SPARK-23345. 
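The change below is the standard close-in-finally pattern: the bookkeeping must run even when `close()` throws, otherwise the stream is reported as leaked. A generic sketch of the pattern (illustrative only; this is not the DebugFilesystem code itself):

```
import java.io.InputStream
import scala.collection.mutable

// Illustrative tracker: the open-stream record is removed even when close()
// throws, mirroring the try/finally added to DebugFilesystem below.
object StreamTracker {
  private val openStreams = mutable.Set.empty[InputStream]

  def register(in: InputStream): Unit = synchronized { openStreams += in }

  def closeAndForget(in: InputStream): Unit = {
    try {
      in.close()
    } finally {
      synchronized { openStreams -= in }
    }
  }
}
```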
(cherry picked from commit 9841ae0313cbee1f083f131f9446808c90ed5a7b) Signed-off-by: gatorsmile --- core/src/test/scala/org/apache/spark/DebugFilesystem.scala | 7 +++++-- .../org/apache/spark/sql/test/SharedSparkSession.scala | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/DebugFilesystem.scala b/core/src/test/scala/org/apache/spark/DebugFilesystem.scala index 91355f7362900..a5bdc95790722 100644 --- a/core/src/test/scala/org/apache/spark/DebugFilesystem.scala +++ b/core/src/test/scala/org/apache/spark/DebugFilesystem.scala @@ -103,8 +103,11 @@ class DebugFilesystem extends LocalFileSystem { override def markSupported(): Boolean = wrapped.markSupported() override def close(): Unit = { - wrapped.close() - removeOpenStream(wrapped) + try { + wrapped.close() + } finally { + removeOpenStream(wrapped) + } } override def read(): Int = wrapped.read() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala index 0b4629a51b425..e758c865b908f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala @@ -111,7 +111,7 @@ trait SharedSparkSession spark.sharedState.cacheManager.clearCache() // files can be closed from other threads, so wait a bit // normally this doesn't take more than 1s - eventually(timeout(10.seconds)) { + eventually(timeout(10.seconds), interval(2.seconds)) { DebugFilesystem.assertNoOpenStreams() } } From 2ba07d5b101c44382e0db6d660da756c2f5ce627 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Thu, 8 Feb 2018 09:29:31 +0900 Subject: [PATCH 262/566] [SPARK-23300][TESTS][BRANCH-2.3] Prints out if Pandas and PyArrow are installed or not in PySpark SQL tests This PR backports https://github.com/apache/spark/pull/20473 to branch-2.3. Author: hyukjinkwon Closes #20533 from HyukjinKwon/backport-20473. --- python/run-tests.py | 56 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/python/run-tests.py b/python/run-tests.py index 1341086f02db0..3539c76b911a4 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -31,6 +31,7 @@ import Queue else: import queue as Queue +from distutils.version import LooseVersion # Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module @@ -39,7 +40,7 @@ from sparktestsupport import SPARK_HOME # noqa (suppress pep8 warnings) from sparktestsupport.shellutils import which, subprocess_check_output # noqa -from sparktestsupport.modules import all_modules # noqa +from sparktestsupport.modules import all_modules, pyspark_sql # noqa python_modules = dict((m.name, m) for m in all_modules if m.python_test_goals if m.name != 'root') @@ -151,6 +152,55 @@ def parse_opts(): return opts +def _check_dependencies(python_exec, modules_to_test): + # If we should test 'pyspark-sql', it checks if PyArrow and Pandas are installed and + # explicitly prints out. See SPARK-23300. + if pyspark_sql in modules_to_test: + # TODO(HyukjinKwon): Relocate and deduplicate these version specifications. 
+ minimum_pyarrow_version = '0.8.0' + minimum_pandas_version = '0.19.2' + + try: + pyarrow_version = subprocess_check_output( + [python_exec, "-c", "import pyarrow; print(pyarrow.__version__)"], + universal_newlines=True, + stderr=open(os.devnull, 'w')).strip() + if LooseVersion(pyarrow_version) >= LooseVersion(minimum_pyarrow_version): + LOGGER.info("Will test PyArrow related features against Python executable " + "'%s' in '%s' module." % (python_exec, pyspark_sql.name)) + else: + LOGGER.warning( + "Will skip PyArrow related features against Python executable " + "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow " + "%s was found." % ( + python_exec, pyspark_sql.name, minimum_pyarrow_version, pyarrow_version)) + except: + LOGGER.warning( + "Will skip PyArrow related features against Python executable " + "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow " + "was not found." % (python_exec, pyspark_sql.name, minimum_pyarrow_version)) + + try: + pandas_version = subprocess_check_output( + [python_exec, "-c", "import pandas; print(pandas.__version__)"], + universal_newlines=True, + stderr=open(os.devnull, 'w')).strip() + if LooseVersion(pandas_version) >= LooseVersion(minimum_pandas_version): + LOGGER.info("Will test Pandas related features against Python executable " + "'%s' in '%s' module." % (python_exec, pyspark_sql.name)) + else: + LOGGER.warning( + "Will skip Pandas related features against Python executable " + "'%s' in '%s' module. Pandas >= %s is required; however, Pandas " + "%s was found." % ( + python_exec, pyspark_sql.name, minimum_pandas_version, pandas_version)) + except: + LOGGER.warning( + "Will skip Pandas related features against Python executable " + "'%s' in '%s' module. Pandas >= %s is required; however, Pandas " + "was not found." % (python_exec, pyspark_sql.name, minimum_pandas_version)) + + def main(): opts = parse_opts() if (opts.verbose): @@ -175,6 +225,10 @@ def main(): task_queue = Queue.PriorityQueue() for python_exec in python_execs: + # Check if the python executable has proper dependencies installed to run tests + # for given modules properly. + _check_dependencies(python_exec, modules_to_test) + python_implementation = subprocess_check_output( [python_exec, "-c", "import platform; print(platform.python_implementation())"], universal_newlines=True).strip() From db59e554273fe0a54a3223079ff39106fdd1442e Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Thu, 8 Feb 2018 12:21:18 +0800 Subject: [PATCH 263/566] Revert [SPARK-22279][SQL] Turn on spark.sql.hive.convertMetastoreOrc by default ## What changes were proposed in this pull request? This is to revert the changes made in https://github.com/apache/spark/pull/19499 , because this causes a regression. We should not ignore the table-specific compression conf when the Hive serde tables are converted to the data source tables. ## How was this patch tested? The existing tests. Author: gatorsmile Closes #20536 from gatorsmile/revert22279. 
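With the default back to `false`, users who still want the built-in ORC reader and writer for HiveQL-created ORC tables can opt in explicitly. A hedged sketch (this assumes the conf is honoured at session level, as the `HiveUtils` change below suggests):

```scala
// Opt back in to the native ORC path for Hive serde ORC tables.
spark.conf.set("spark.sql.hive.convertMetastoreOrc", "true")
```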
(cherry picked from commit 3473fda6dc77bdfd84b3de95d2082856ad4f8626) Signed-off-by: Wenchen Fan --- .../src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala index d9627eb9790eb..93f3f38e52aa9 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala @@ -109,7 +109,7 @@ private[spark] object HiveUtils extends Logging { .doc("When set to true, the built-in ORC reader and writer are used to process " + "ORC tables created by using the HiveQL syntax, instead of Hive serde.") .booleanConf - .createWithDefault(true) + .createWithDefault(false) val HIVE_METASTORE_SHARED_PREFIXES = buildConf("spark.sql.hive.metastore.sharedPrefixes") .doc("A comma separated list of class prefixes that should be loaded using the classloader " + From 0538302561c4d77b2856b1ce73b3ccbcb6688ac6 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Thu, 8 Feb 2018 16:47:12 +0900 Subject: [PATCH 264/566] [SPARK-23319][TESTS][BRANCH-2.3] Explicitly specify Pandas and PyArrow versions in PySpark tests (to skip or test) This PR backports https://github.com/apache/spark/pull/20487 to branch-2.3. Author: hyukjinkwon Author: Takuya UESHIN Closes #20534 from HyukjinKwon/PR_TOOL_PICK_PR_20487_BRANCH-2.3. --- pom.xml | 4 ++ python/pyspark/sql/dataframe.py | 3 ++ python/pyspark/sql/session.py | 3 ++ python/pyspark/sql/tests.py | 83 ++++++++++++++++++--------------- python/pyspark/sql/utils.py | 30 ++++++++---- python/setup.py | 10 +++- 6 files changed, 86 insertions(+), 47 deletions(-) diff --git a/pom.xml b/pom.xml index a8e448a80c13e..9aa531ec65746 100644 --- a/pom.xml +++ b/pom.xml @@ -185,6 +185,10 @@ 2.8 1.8 1.0.0 + 0.8.0 ${java.home} diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 59a417015b949..8ec24db8717b2 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1913,6 +1913,9 @@ def toPandas(self): 0 2 Alice 1 5 Bob """ + from pyspark.sql.utils import require_minimum_pandas_version + require_minimum_pandas_version() + import pandas as pd if self.sql_ctx.getConf("spark.sql.execution.pandas.respectSessionTimeZone").lower() \ diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 6c84023c43fb6..2ac2ec2fe6328 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -640,6 +640,9 @@ def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=Tr except Exception: has_pandas = False if has_pandas and isinstance(data, pandas.DataFrame): + from pyspark.sql.utils import require_minimum_pandas_version + require_minimum_pandas_version() + if self.conf.get("spark.sql.execution.pandas.respectSessionTimeZone").lower() \ == "true": timezone = self.conf.get("spark.sql.session.timeZone") diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 878d402e57546..0e1b2ec6f35e0 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -48,19 +48,26 @@ else: import unittest -_have_pandas = False -_have_old_pandas = False +_pandas_requirement_message = None try: - import pandas - try: - from pyspark.sql.utils import require_minimum_pandas_version - require_minimum_pandas_version() - _have_pandas = True - except: - _have_old_pandas = True -except: - # No Pandas, but that's okay, we'll skip 
those tests - pass + from pyspark.sql.utils import require_minimum_pandas_version + require_minimum_pandas_version() +except ImportError as e: + from pyspark.util import _exception_message + # If Pandas version requirement is not satisfied, skip related tests. + _pandas_requirement_message = _exception_message(e) + +_pyarrow_requirement_message = None +try: + from pyspark.sql.utils import require_minimum_pyarrow_version + require_minimum_pyarrow_version() +except ImportError as e: + from pyspark.util import _exception_message + # If Arrow version requirement is not satisfied, skip related tests. + _pyarrow_requirement_message = _exception_message(e) + +_have_pandas = _pandas_requirement_message is None +_have_pyarrow = _pyarrow_requirement_message is None from pyspark import SparkContext from pyspark.sql import SparkSession, SQLContext, HiveContext, Column, Row @@ -75,15 +82,6 @@ from pyspark.sql.utils import AnalysisException, ParseException, IllegalArgumentException -_have_arrow = False -try: - import pyarrow - _have_arrow = True -except: - # No Arrow, but that's okay, we'll skip those tests - pass - - class UTCOffsetTimezone(datetime.tzinfo): """ Specifies timezone in UTC offset @@ -2788,7 +2786,6 @@ def count_bucketed_cols(names, table="pyspark_bucket"): def _to_pandas(self): from datetime import datetime, date - import numpy as np schema = StructType().add("a", IntegerType()).add("b", StringType())\ .add("c", BooleanType()).add("d", FloatType())\ .add("dt", DateType()).add("ts", TimestampType()) @@ -2801,7 +2798,7 @@ def _to_pandas(self): df = self.spark.createDataFrame(data, schema) return df.toPandas() - @unittest.skipIf(not _have_pandas, "Pandas not installed") + @unittest.skipIf(not _have_pandas, _pandas_requirement_message) def test_to_pandas(self): import numpy as np pdf = self._to_pandas() @@ -2813,13 +2810,13 @@ def test_to_pandas(self): self.assertEquals(types[4], np.object) # datetime.date self.assertEquals(types[5], 'datetime64[ns]') - @unittest.skipIf(not _have_old_pandas, "Old Pandas not installed") - def test_to_pandas_old(self): + @unittest.skipIf(_have_pandas, "Required Pandas was found.") + def test_to_pandas_required_pandas_not_found(self): with QuietTest(self.sc): with self.assertRaisesRegexp(ImportError, 'Pandas >= .* must be installed'): self._to_pandas() - @unittest.skipIf(not _have_pandas, "Pandas not installed") + @unittest.skipIf(not _have_pandas, _pandas_requirement_message) def test_to_pandas_avoid_astype(self): import numpy as np schema = StructType().add("a", IntegerType()).add("b", StringType())\ @@ -2837,7 +2834,7 @@ def test_create_dataframe_from_array_of_long(self): df = self.spark.createDataFrame(data) self.assertEqual(df.first(), Row(longarray=[-9223372036854775808, 0, 9223372036854775807])) - @unittest.skipIf(not _have_pandas, "Pandas not installed") + @unittest.skipIf(not _have_pandas, _pandas_requirement_message) def test_create_dataframe_from_pandas_with_timestamp(self): import pandas as pd from datetime import datetime @@ -2852,14 +2849,16 @@ def test_create_dataframe_from_pandas_with_timestamp(self): self.assertTrue(isinstance(df.schema['ts'].dataType, TimestampType)) self.assertTrue(isinstance(df.schema['d'].dataType, DateType)) - @unittest.skipIf(not _have_old_pandas, "Old Pandas not installed") - def test_create_dataframe_from_old_pandas(self): - import pandas as pd - from datetime import datetime - pdf = pd.DataFrame({"ts": [datetime(2017, 10, 31, 1, 1, 1)], - "d": [pd.Timestamp.now().date()]}) + @unittest.skipIf(_have_pandas, "Required 
Pandas was found.") + def test_create_dataframe_required_pandas_not_found(self): with QuietTest(self.sc): - with self.assertRaisesRegexp(ImportError, 'Pandas >= .* must be installed'): + with self.assertRaisesRegexp( + ImportError, + "(Pandas >= .* must be installed|No module named '?pandas'?)"): + import pandas as pd + from datetime import datetime + pdf = pd.DataFrame({"ts": [datetime(2017, 10, 31, 1, 1, 1)], + "d": [pd.Timestamp.now().date()]}) self.spark.createDataFrame(pdf) @@ -3351,7 +3350,9 @@ def __init__(self, **kwargs): _make_type_verifier(data_type, nullable=False)(obj) -@unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") +@unittest.skipIf( + not _have_pandas or not _have_pyarrow, + _pandas_requirement_message or _pyarrow_requirement_message) class ArrowTests(ReusedSQLTestCase): @classmethod @@ -3615,7 +3616,9 @@ def test_createDataFrame_with_int_col_names(self): self.assertEqual(pdf_col_names, df_arrow.columns) -@unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") +@unittest.skipIf( + not _have_pandas or not _have_pyarrow, + _pandas_requirement_message or _pyarrow_requirement_message) class PandasUDFTests(ReusedSQLTestCase): def test_pandas_udf_basic(self): from pyspark.rdd import PythonEvalType @@ -3739,7 +3742,9 @@ def foo(k, v): return k -@unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") +@unittest.skipIf( + not _have_pandas or not _have_pyarrow, + _pandas_requirement_message or _pyarrow_requirement_message) class ScalarPandasUDFTests(ReusedSQLTestCase): @classmethod @@ -4252,7 +4257,9 @@ def test_register_vectorized_udf_basic(self): self.assertEquals(expected.collect(), res2.collect()) -@unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") +@unittest.skipIf( + not _have_pandas or not _have_pyarrow, + _pandas_requirement_message or _pyarrow_requirement_message) class GroupedMapPandasUDFTests(ReusedSQLTestCase): def assertFramesEqual(self, expected, result): diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index 08c34c6dccc5e..578298632dd4c 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -115,18 +115,32 @@ def toJArray(gateway, jtype, arr): def require_minimum_pandas_version(): """ Raise ImportError if minimum version of Pandas is not installed """ + # TODO(HyukjinKwon): Relocate and deduplicate the version specification. + minimum_pandas_version = "0.19.2" + from distutils.version import LooseVersion - import pandas - if LooseVersion(pandas.__version__) < LooseVersion('0.19.2'): - raise ImportError("Pandas >= 0.19.2 must be installed on calling Python process; " - "however, your version was %s." % pandas.__version__) + try: + import pandas + except ImportError: + raise ImportError("Pandas >= %s must be installed; however, " + "it was not found." % minimum_pandas_version) + if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version): + raise ImportError("Pandas >= %s must be installed; however, " + "your version was %s." % (minimum_pandas_version, pandas.__version__)) def require_minimum_pyarrow_version(): """ Raise ImportError if minimum version of pyarrow is not installed """ + # TODO(HyukjinKwon): Relocate and deduplicate the version specification. 
+ minimum_pyarrow_version = "0.8.0" + from distutils.version import LooseVersion - import pyarrow - if LooseVersion(pyarrow.__version__) < LooseVersion('0.8.0'): - raise ImportError("pyarrow >= 0.8.0 must be installed on calling Python process; " - "however, your version was %s." % pyarrow.__version__) + try: + import pyarrow + except ImportError: + raise ImportError("PyArrow >= %s must be installed; however, " + "it was not found." % minimum_pyarrow_version) + if LooseVersion(pyarrow.__version__) < LooseVersion(minimum_pyarrow_version): + raise ImportError("PyArrow >= %s must be installed; however, " + "your version was %s." % (minimum_pyarrow_version, pyarrow.__version__)) diff --git a/python/setup.py b/python/setup.py index 251d4526d4dd0..6a98401941d8d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -100,6 +100,11 @@ def _supports_symlinks(): file=sys.stderr) exit(-1) +# If you are changing the versions here, please also change ./python/pyspark/sql/utils.py and +# ./python/run-tests.py. In case of Arrow, you should also check ./pom.xml. +_minimum_pandas_version = "0.19.2" +_minimum_pyarrow_version = "0.8.0" + try: # We copy the shell script to be under pyspark/python/pyspark so that the launcher scripts # find it where expected. The rest of the files aren't copied because they are accessed @@ -201,7 +206,10 @@ def _supports_symlinks(): extras_require={ 'ml': ['numpy>=1.7'], 'mllib': ['numpy>=1.7'], - 'sql': ['pandas>=0.19.2', 'pyarrow>=0.8.0'] + 'sql': [ + 'pandas>=%s' % _minimum_pandas_version, + 'pyarrow>=%s' % _minimum_pyarrow_version, + ] }, classifiers=[ 'Development Status :: 5 - Production/Stable', From 0c2a2100d0116776d2dcb2d48493f77a64aead0c Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 8 Feb 2018 00:08:54 -0800 Subject: [PATCH 265/566] [SPARK-23348][SQL] append data using saveAsTable should adjust the data types ## What changes were proposed in this pull request? For inserting/appending data to an existing table, Spark should adjust the data types of the input query according to the table schema, or fail fast if it's uncastable. There are several ways to insert/append data: SQL API, `DataFrameWriter.insertInto`, `DataFrameWriter.saveAsTable`. The first 2 ways create `InsertIntoTable` plan, and the last way creates `CreateTable` plan. However, we only adjust input query data types for `InsertIntoTable`, and users may hit weird errors when appending data using `saveAsTable`. See the JIRA for the error case. This PR fixes this bug by adjusting data types for `CreateTable` too. ## How was this patch tested? new test. Author: Wenchen Fan Closes #20527 from cloud-fan/saveAsTable. 
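A minimal sketch of the append path this fixes, taken from the new test below: after this change the mismatched input is cast to the table schema instead of failing with a confusing error, so the string "c" becomes null in the int column and 3 becomes "3":

```scala
import spark.implicits._  // as in spark-shell or the test suite

// Table t is created as (i: Int, j: String); the second write appends (String, Int).
Seq(1 -> "a").toDF("i", "j").write.saveAsTable("t")
Seq("c" -> 3).toDF("i", "j").write.mode("append").saveAsTable("t")
// t now contains Row(1, "a") and Row(null, "3")
```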
(cherry picked from commit 7f5f5fb1296275a38da0adfa05125dd8ebf729ff) Signed-off-by: gatorsmile --- .../sql/execution/datasources/rules.scala | 72 +++++++++++-------- .../sql/execution/command/DDLSuite.scala | 28 ++++++++ 2 files changed, 69 insertions(+), 31 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 5dbcf4a915cbf..5cc21eeaeaa94 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -22,7 +22,7 @@ import java.util.Locale import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession} import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog._ -import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, Expression, InputFileBlockLength, InputFileBlockStart, InputFileName, RowOrdering} +import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, Cast, Expression, InputFileBlockLength, InputFileBlockStart, InputFileName, RowOrdering} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.command.DDLUtils @@ -178,7 +178,8 @@ case class PreprocessTableCreation(sparkSession: SparkSession) extends Rule[Logi c.copy( tableDesc = existingTable, - query = Some(newQuery)) + query = Some(DDLPreprocessingUtils.castAndRenameQueryOutput( + newQuery, existingTable.schema.toAttributes, conf))) // Here we normalize partition, bucket and sort column names, w.r.t. the case sensitivity // config, and do various checks: @@ -316,7 +317,7 @@ case class PreprocessTableCreation(sparkSession: SparkSession) extends Rule[Logi * table. It also does data type casting and field renaming, to make sure that the columns to be * inserted have the correct data type and fields have the correct names. */ -case class PreprocessTableInsertion(conf: SQLConf) extends Rule[LogicalPlan] with CastSupport { +case class PreprocessTableInsertion(conf: SQLConf) extends Rule[LogicalPlan] { private def preprocess( insert: InsertIntoTable, tblName: String, @@ -336,6 +337,8 @@ case class PreprocessTableInsertion(conf: SQLConf) extends Rule[LogicalPlan] wit s"including ${staticPartCols.size} partition column(s) having constant value(s).") } + val newQuery = DDLPreprocessingUtils.castAndRenameQueryOutput( + insert.query, expectedColumns, conf) if (normalizedPartSpec.nonEmpty) { if (normalizedPartSpec.size != partColNames.length) { throw new AnalysisException( @@ -346,37 +349,11 @@ case class PreprocessTableInsertion(conf: SQLConf) extends Rule[LogicalPlan] wit """.stripMargin) } - castAndRenameChildOutput(insert.copy(partition = normalizedPartSpec), expectedColumns) + insert.copy(query = newQuery, partition = normalizedPartSpec) } else { // All partition columns are dynamic because the InsertIntoTable command does // not explicitly specify partitioning columns. 
- castAndRenameChildOutput(insert, expectedColumns) - .copy(partition = partColNames.map(_ -> None).toMap) - } - } - - private def castAndRenameChildOutput( - insert: InsertIntoTable, - expectedOutput: Seq[Attribute]): InsertIntoTable = { - val newChildOutput = expectedOutput.zip(insert.query.output).map { - case (expected, actual) => - if (expected.dataType.sameType(actual.dataType) && - expected.name == actual.name && - expected.metadata == actual.metadata) { - actual - } else { - // Renaming is needed for handling the following cases like - // 1) Column names/types do not match, e.g., INSERT INTO TABLE tab1 SELECT 1, 2 - // 2) Target tables have column metadata - Alias(cast(actual, expected.dataType), expected.name)( - explicitMetadata = Option(expected.metadata)) - } - } - - if (newChildOutput == insert.query.output) { - insert - } else { - insert.copy(query = Project(newChildOutput, insert.query)) + insert.copy(query = newQuery, partition = partColNames.map(_ -> None).toMap) } } @@ -491,3 +468,36 @@ object PreWriteCheck extends (LogicalPlan => Unit) { } } } + +object DDLPreprocessingUtils { + + /** + * Adjusts the name and data type of the input query output columns, to match the expectation. + */ + def castAndRenameQueryOutput( + query: LogicalPlan, + expectedOutput: Seq[Attribute], + conf: SQLConf): LogicalPlan = { + val newChildOutput = expectedOutput.zip(query.output).map { + case (expected, actual) => + if (expected.dataType.sameType(actual.dataType) && + expected.name == actual.name && + expected.metadata == actual.metadata) { + actual + } else { + // Renaming is needed for handling the following cases like + // 1) Column names/types do not match, e.g., INSERT INTO TABLE tab1 SELECT 1, 2 + // 2) Target tables have column metadata + Alias( + Cast(actual, expected.dataType, Option(conf.sessionLocalTimeZone)), + expected.name)(explicitMetadata = Option(expected.metadata)) + } + } + + if (newChildOutput == query.output) { + query + } else { + Project(newChildOutput, query) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 6ca21b5aa1595..c03f5d1545f52 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -37,6 +37,8 @@ import org.apache.spark.util.Utils class InMemoryCatalogedDDLSuite extends DDLSuite with SharedSQLContext with BeforeAndAfterEach { + import testImplicits._ + override def afterEach(): Unit = { try { // drop all databases, tables and functions after each test @@ -132,6 +134,32 @@ class InMemoryCatalogedDDLSuite extends DDLSuite with SharedSQLContext with Befo checkAnswer(spark.table("t"), Row(Row("a", 1)) :: Nil) } } + + // TODO: This test is copied from HiveDDLSuite, unify it later. 
+ test("SPARK-23348: append data to data source table with saveAsTable") { + withTable("t", "t1") { + Seq(1 -> "a").toDF("i", "j").write.saveAsTable("t") + checkAnswer(spark.table("t"), Row(1, "a")) + + sql("INSERT INTO t SELECT 2, 'b'") + checkAnswer(spark.table("t"), Row(1, "a") :: Row(2, "b") :: Nil) + + Seq(3 -> "c").toDF("i", "j").write.mode("append").saveAsTable("t") + checkAnswer(spark.table("t"), Row(1, "a") :: Row(2, "b") :: Row(3, "c") :: Nil) + + Seq("c" -> 3).toDF("i", "j").write.mode("append").saveAsTable("t") + checkAnswer(spark.table("t"), Row(1, "a") :: Row(2, "b") :: Row(3, "c") + :: Row(null, "3") :: Nil) + + Seq(4 -> "d").toDF("i", "j").write.saveAsTable("t1") + + val e = intercept[AnalysisException] { + Seq(5 -> "e").toDF("i", "j").write.mode("append").format("json").saveAsTable("t1") + } + assert(e.message.contains("The format of the existing table default.t1 is " + + "`ParquetFileFormat`. It doesn't match the specified format `JsonFileFormat`.")) + } + } } abstract class DDLSuite extends QueryTest with SQLTestUtils { From 68f3a070c728d0af95e9b5eec2c49be274b67a20 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 8 Feb 2018 19:20:11 +0800 Subject: [PATCH 266/566] [SPARK-23268][SQL][FOLLOWUP] Reorganize packages in data source V2 ## What changes were proposed in this pull request? This is a followup of https://github.com/apache/spark/pull/20435. While reorganizing the packages for streaming data source v2, the top level stream read/write support interfaces should not be in the reader/writer package, but should be in the `sources.v2` package, to follow the `ReadSupport`, `WriteSupport`, etc. ## How was this patch tested? N/A Author: Wenchen Fan Closes #20509 from cloud-fan/followup. (cherry picked from commit a75f927173632eee1316879447cb62c8cf30ae37) Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/kafka010/KafkaSourceProvider.scala | 4 +--- .../sql/sources/v2/{reader => }/ContinuousReadSupport.java | 4 +--- .../sql/sources/v2/{reader => }/MicroBatchReadSupport.java | 4 +--- .../sql/sources/v2/{writer => }/StreamWriteSupport.java | 5 ++--- .../apache/spark/sql/sources/v2/writer/DataSourceWriter.java | 1 + .../spark/sql/execution/streaming/MicroBatchExecution.scala | 5 ++--- .../spark/sql/execution/streaming/RateSourceProvider.scala | 1 - .../spark/sql/execution/streaming/StreamingRelation.scala | 3 +-- .../org/apache/spark/sql/execution/streaming/console.scala | 3 +-- .../execution/streaming/continuous/ContinuousExecution.scala | 4 +--- .../sql/execution/streaming/sources/RateStreamSourceV2.scala | 2 +- .../spark/sql/execution/streaming/sources/memoryV2.scala | 2 +- .../org/apache/spark/sql/streaming/DataStreamReader.scala | 3 +-- .../org/apache/spark/sql/streaming/DataStreamWriter.scala | 2 +- .../apache/spark/sql/streaming/StreamingQueryManager.scala | 2 +- .../spark/sql/execution/streaming/RateSourceV2Suite.scala | 2 +- .../sql/streaming/sources/StreamingDataSourceV2Suite.scala | 5 ++--- 17 files changed, 19 insertions(+), 33 deletions(-) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{reader => }/ContinuousReadSupport.java (92%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{reader => }/MicroBatchReadSupport.java (93%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{writer => }/StreamWriteSupport.java (93%) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala 
index 694ca76e24964..d4fa0359c12d6 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala @@ -30,9 +30,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SparkSession, SQLContext} import org.apache.spark.sql.execution.streaming.{Sink, Source} import org.apache.spark.sql.sources._ -import org.apache.spark.sql.sources.v2.DataSourceOptions -import org.apache.spark.sql.sources.v2.reader.ContinuousReadSupport -import org.apache.spark.sql.sources.v2.writer.StreamWriteSupport +import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceOptions, StreamWriteSupport} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousReadSupport.java similarity index 92% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousReadSupport.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousReadSupport.java index 0c1d5d1a9577a..7df5a451ae5f3 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousReadSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousReadSupport.java @@ -15,13 +15,11 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2.reader; +package org.apache.spark.sql.sources.v2; import java.util.Optional; import org.apache.spark.annotation.InterfaceStability; -import org.apache.spark.sql.sources.v2.DataSourceV2; -import org.apache.spark.sql.sources.v2.DataSourceOptions; import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader; import org.apache.spark.sql.types.StructType; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/MicroBatchReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchReadSupport.java similarity index 93% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/MicroBatchReadSupport.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchReadSupport.java index 5e8f0c0dafdcf..209ffa7a0b9fa 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/MicroBatchReadSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchReadSupport.java @@ -15,13 +15,11 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources.v2.reader; +package org.apache.spark.sql.sources.v2; import java.util.Optional; import org.apache.spark.annotation.InterfaceStability; -import org.apache.spark.sql.sources.v2.DataSourceOptions; -import org.apache.spark.sql.sources.v2.DataSourceV2; import org.apache.spark.sql.sources.v2.reader.streaming.MicroBatchReader; import org.apache.spark.sql.types.StructType; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/StreamWriteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/StreamWriteSupport.java similarity index 93% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/StreamWriteSupport.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/StreamWriteSupport.java index 1c0e2e12f8d51..a77b01497269e 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/StreamWriteSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/StreamWriteSupport.java @@ -15,12 +15,11 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2.writer; +package org.apache.spark.sql.sources.v2; import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.execution.streaming.BaseStreamingSink; -import org.apache.spark.sql.sources.v2.DataSourceOptions; -import org.apache.spark.sql.sources.v2.DataSourceV2; +import org.apache.spark.sql.sources.v2.writer.DataSourceWriter; import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter; import org.apache.spark.sql.streaming.OutputMode; import org.apache.spark.sql.types.StructType; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceWriter.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceWriter.java index 7096aec0d22c2..3da1be2e0ef76 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceWriter.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceWriter.java @@ -21,6 +21,7 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.sources.v2.DataSourceOptions; +import org.apache.spark.sql.sources.v2.StreamWriteSupport; import org.apache.spark.sql.sources.v2.WriteSupport; import org.apache.spark.sql.streaming.OutputMode; import org.apache.spark.sql.types.StructType; diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index d9aa8573ba930..2e6ecb8fdbdfa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -29,10 +29,9 @@ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.datasources.v2.{StreamingDataSourceV2Relation, WriteToDataSourceV2} import org.apache.spark.sql.execution.streaming.sources.{InternalRowMicroBatchWriter, MicroBatchWriter} -import org.apache.spark.sql.sources.v2.DataSourceOptions -import org.apache.spark.sql.sources.v2.reader.MicroBatchReadSupport +import org.apache.spark.sql.sources.v2.{DataSourceOptions, MicroBatchReadSupport, StreamWriteSupport} import org.apache.spark.sql.sources.v2.reader.streaming.{MicroBatchReader, Offset => OffsetV2} -import 
org.apache.spark.sql.sources.v2.writer.{StreamWriteSupport, SupportsWriteInternalRow} +import org.apache.spark.sql.sources.v2.writer.SupportsWriteInternalRow import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime, Trigger} import org.apache.spark.util.{Clock, Utils} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala index ce5e63f5bde85..649fbbfa184ec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala @@ -32,7 +32,6 @@ import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} import org.apache.spark.sql.execution.streaming.continuous.RateStreamContinuousReader import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} import org.apache.spark.sql.sources.v2._ -import org.apache.spark.sql.sources.v2.reader.ContinuousReadSupport import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader import org.apache.spark.sql.types._ import org.apache.spark.util.{ManualClock, SystemClock} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala index 845c8d2c14e43..7146190645b37 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala @@ -25,8 +25,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LeafNode import org.apache.spark.sql.catalyst.plans.logical.Statistics import org.apache.spark.sql.execution.LeafExecNode import org.apache.spark.sql.execution.datasources.DataSource -import org.apache.spark.sql.sources.v2.DataSourceV2 -import org.apache.spark.sql.sources.v2.reader.ContinuousReadSupport +import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2} object StreamingRelation { def apply(dataSource: DataSource): StreamingRelation = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala index db600866067bc..cfba1001c6de0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala @@ -20,8 +20,7 @@ package org.apache.spark.sql.execution.streaming import org.apache.spark.sql._ import org.apache.spark.sql.execution.streaming.sources.ConsoleWriter import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister} -import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2} -import org.apache.spark.sql.sources.v2.writer.StreamWriteSupport +import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, StreamWriteSupport} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index ed22b9100497a..c3294d64b10cd 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -31,10 +31,8 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, StreamingDataSourceV2Relation, WriteToDataSourceV2} import org.apache.spark.sql.execution.streaming.{ContinuousExecutionRelation, StreamingRelationV2, _} -import org.apache.spark.sql.sources.v2.DataSourceOptions -import org.apache.spark.sql.sources.v2.reader.ContinuousReadSupport +import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceOptions, StreamWriteSupport} import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousReader, PartitionOffset} -import org.apache.spark.sql.sources.v2.writer.StreamWriteSupport import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime, Trigger} import org.apache.spark.sql.types.StructType import org.apache.spark.util.{Clock, Utils} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala index 1315885da8a6f..6dcd331138fd8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.streaming.{RateStreamOffset, ValueRunTimeMsPair} import org.apache.spark.sql.sources.DataSourceRegister -import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2} +import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, MicroBatchReadSupport} import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.sources.v2.reader.streaming.{MicroBatchReader, Offset} import org.apache.spark.sql.types.{LongType, StructField, StructType, TimestampType} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala index 3411edbc53412..f960208155e3b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes.{Append, Complete, Update} import org.apache.spark.sql.execution.streaming.Sink -import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2} +import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, StreamWriteSupport} import org.apache.spark.sql.sources.v2.writer._ import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.OutputMode diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 116ac3da07b75..f23851655350a 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -28,8 +28,7 @@ import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.{StreamingRelation, StreamingRelationV2} import org.apache.spark.sql.sources.StreamSourceProvider -import org.apache.spark.sql.sources.v2.DataSourceOptions -import org.apache.spark.sql.sources.v2.reader.{ContinuousReadSupport, MicroBatchReadSupport} +import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceOptions, MicroBatchReadSupport} import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index 9aac360fd4bbc..2fc903168cfa0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger import org.apache.spark.sql.execution.streaming.sources.{MemoryPlanV2, MemorySinkV2} -import org.apache.spark.sql.sources.v2.writer.StreamWriteSupport +import org.apache.spark.sql.sources.v2.StreamWriteSupport /** * Interface used to write a streaming `Dataset` to external storage systems (e.g. file systems, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index ddb1edc433d5a..7cefd03e43bc3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, ContinuousTrigger} import org.apache.spark.sql.execution.streaming.state.StateStoreCoordinatorRef import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.writer.StreamWriteSupport +import org.apache.spark.sql.sources.v2.StreamWriteSupport import org.apache.spark.util.{Clock, SystemClock, Utils} /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala index 0d68d9c3138aa..983ba1668f58f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala @@ -26,8 +26,8 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.continuous._ import org.apache.spark.sql.execution.streaming.sources.{RateStreamBatchTask, RateStreamMicroBatchReader, RateStreamSourceV2} +import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, MicroBatchReadSupport} import org.apache.spark.sql.sources.v2.DataSourceOptions -import org.apache.spark.sql.sources.v2.reader.{ContinuousReadSupport, MicroBatchReadSupport} import 
org.apache.spark.sql.streaming.StreamTest import org.apache.spark.util.ManualClock diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala index 51f44fa6285e4..af4618bed5456 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala @@ -25,10 +25,9 @@ import org.apache.spark.sql.execution.streaming.{RateStreamOffset, Sink, Streami import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider} -import org.apache.spark.sql.sources.v2.DataSourceOptions -import org.apache.spark.sql.sources.v2.reader.{ContinuousReadSupport, DataReaderFactory, MicroBatchReadSupport} +import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceOptions, MicroBatchReadSupport, StreamWriteSupport} +import org.apache.spark.sql.sources.v2.reader.DataReaderFactory import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousReader, MicroBatchReader, Offset, PartitionOffset} -import org.apache.spark.sql.sources.v2.writer.StreamWriteSupport import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.{OutputMode, StreamTest, Trigger} import org.apache.spark.sql.types.StructType From dfb16147791ff87342ff852105420a5eac5c553b Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 9 Feb 2018 12:54:57 +0800 Subject: [PATCH 267/566] [SPARK-23186][SQL] Initialize DriverManager first before loading JDBC Drivers ## What changes were proposed in this pull request? Since some JDBC Drivers have class initialization code to call `DriverManager`, we need to initialize `DriverManager` first in order to avoid potential executor-side **deadlock** situations like the following (or [STORM-2527](https://issues.apache.org/jira/browse/STORM-2527)). 
``` Thread 9587: (state = BLOCKED) - sun.reflect.NativeConstructorAccessorImpl.newInstance0(java.lang.reflect.Constructor, java.lang.Object[]) bci=0 (Compiled frame; information may be imprecise) - sun.reflect.NativeConstructorAccessorImpl.newInstance(java.lang.Object[]) bci=85, line=62 (Compiled frame) - sun.reflect.DelegatingConstructorAccessorImpl.newInstance(java.lang.Object[]) bci=5, line=45 (Compiled frame) - java.lang.reflect.Constructor.newInstance(java.lang.Object[]) bci=79, line=423 (Compiled frame) - java.lang.Class.newInstance() bci=138, line=442 (Compiled frame) - java.util.ServiceLoader$LazyIterator.nextService() bci=119, line=380 (Interpreted frame) - java.util.ServiceLoader$LazyIterator.next() bci=11, line=404 (Interpreted frame) - java.util.ServiceLoader$1.next() bci=37, line=480 (Interpreted frame) - java.sql.DriverManager$2.run() bci=21, line=603 (Interpreted frame) - java.sql.DriverManager$2.run() bci=1, line=583 (Interpreted frame) - java.security.AccessController.doPrivileged(java.security.PrivilegedAction) bci=0 (Compiled frame) - java.sql.DriverManager.loadInitialDrivers() bci=27, line=583 (Interpreted frame) - java.sql.DriverManager.() bci=32, line=101 (Interpreted frame) - org.apache.phoenix.mapreduce.util.ConnectionUtil.getConnection(java.lang.String, java.lang.Integer, java.lang.String, java.util.Properties) bci=12, line=98 (Interpreted frame) - org.apache.phoenix.mapreduce.util.ConnectionUtil.getInputConnection(org.apache.hadoop.conf.Configuration, java.util.Properties) bci=22, line=57 (Interpreted frame) - org.apache.phoenix.mapreduce.PhoenixInputFormat.getQueryPlan(org.apache.hadoop.mapreduce.JobContext, org.apache.hadoop.conf.Configuration) bci=61, line=116 (Interpreted frame) - org.apache.phoenix.mapreduce.PhoenixInputFormat.createRecordReader(org.apache.hadoop.mapreduce.InputSplit, org.apache.hadoop.mapreduce.TaskAttemptContext) bci=10, line=71 (Interpreted frame) - org.apache.spark.rdd.NewHadoopRDD$$anon$1.(org.apache.spark.rdd.NewHadoopRDD, org.apache.spark.Partition, org.apache.spark.TaskContext) bci=233, line=156 (Interpreted frame) Thread 9170: (state = BLOCKED) - org.apache.phoenix.jdbc.PhoenixDriver.() bci=35, line=125 (Interpreted frame) - sun.reflect.NativeConstructorAccessorImpl.newInstance0(java.lang.reflect.Constructor, java.lang.Object[]) bci=0 (Compiled frame) - sun.reflect.NativeConstructorAccessorImpl.newInstance(java.lang.Object[]) bci=85, line=62 (Compiled frame) - sun.reflect.DelegatingConstructorAccessorImpl.newInstance(java.lang.Object[]) bci=5, line=45 (Compiled frame) - java.lang.reflect.Constructor.newInstance(java.lang.Object[]) bci=79, line=423 (Compiled frame) - java.lang.Class.newInstance() bci=138, line=442 (Compiled frame) - org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry$.register(java.lang.String) bci=89, line=46 (Interpreted frame) - org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$2.apply() bci=7, line=53 (Interpreted frame) - org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$2.apply() bci=1, line=52 (Interpreted frame) - org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$$anon$1.(org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD, org.apache.spark.Partition, org.apache.spark.TaskContext) bci=81, line=347 (Interpreted frame) - org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD.compute(org.apache.spark.Partition, org.apache.spark.TaskContext) bci=7, line=339 (Interpreted frame) ``` ## How was this patch tested? 
N/A Author: Dongjoon Hyun Closes #20359 from dongjoon-hyun/SPARK-23186. (cherry picked from commit 8cbcc33876c773722163b2259644037bbb259bd1) Signed-off-by: Wenchen Fan --- .../sql/execution/datasources/jdbc/DriverRegistry.scala | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/DriverRegistry.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/DriverRegistry.scala index 7a6c0f9fed2f9..1723596de1db2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/DriverRegistry.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/DriverRegistry.scala @@ -32,6 +32,13 @@ import org.apache.spark.util.Utils */ object DriverRegistry extends Logging { + /** + * Load DriverManager first to avoid any race condition between + * DriverManager static initialization block and specific driver class's + * static initialization block. e.g. PhoenixDriver + */ + DriverManager.getDrivers + private val wrapperMap: mutable.Map[String, DriverWrapper] = mutable.Map.empty def register(className: String): Unit = { From 196304a3a8ed15fd018e9c7b441954d17bd60124 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Fri, 9 Feb 2018 14:21:10 +0800 Subject: [PATCH 268/566] [SPARK-23328][PYTHON] Disallow default value None in na.replace/replace when 'to_replace' is not a dictionary ## What changes were proposed in this pull request? This PR proposes to disallow default value None when 'to_replace' is not a dictionary. It seems weird we set the default value of `value` to `None` and we ended up allowing the case as below: ```python >>> df.show() ``` ``` +----+------+-----+ | age|height| name| +----+------+-----+ | 10| 80|Alice| ... ``` ```python >>> df.na.replace('Alice').show() ``` ``` +----+------+----+ | age|height|name| +----+------+----+ | 10| 80|null| ... ``` **After** This PR targets to disallow the case above: ```python >>> df.na.replace('Alice').show() ``` ``` ... TypeError: value is required when to_replace is not a dictionary. ``` while we still allow when `to_replace` is a dictionary: ```python >>> df.na.replace({'Alice': None}).show() ``` ``` +----+------+----+ | age|height|name| +----+------+----+ | 10| 80|null| ... ``` ## How was this patch tested? Manually tested, tests were added in `python/pyspark/sql/tests.py` and doctests were fixed. Author: hyukjinkwon Closes #20499 from HyukjinKwon/SPARK-19454-followup. (cherry picked from commit 4b4ee2601079f12f8f410a38d2081793cbdedc14) Signed-off-by: Wenchen Fan --- docs/sql-programming-guide.md | 1 + python/pyspark/__init__.py | 1 + python/pyspark/_globals.py | 70 +++++++++++++++++++++++++++++++++ python/pyspark/sql/dataframe.py | 26 +++++++++--- python/pyspark/sql/tests.py | 11 +++--- 5 files changed, 99 insertions(+), 10 deletions(-) create mode 100644 python/pyspark/_globals.py diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index a0e221b39cc34..eab4030ee25d2 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1929,6 +1929,7 @@ working with timestamps in `pandas_udf`s to get the best performance, see - The rules to determine the result type of an arithmetic operation have been updated. In particular, if the precision / scale needed are out of the range of available values, the scale is reduced up to 6, in order to prevent the truncation of the integer part of the decimals. All the arithmetic operations are affected by the change, ie. 
addition (`+`), subtraction (`-`), multiplication (`*`), division (`/`), remainder (`%`) and positive module (`pmod`). - Literal values used in SQL operations are converted to DECIMAL with the exact precision and scale needed by them. - The configuration `spark.sql.decimalOperations.allowPrecisionLoss` has been introduced. It defaults to `true`, which means the new behavior described here; if set to `false`, Spark uses previous rules, ie. it doesn't adjust the needed scale to represent the values and it returns NULL if an exact representation of the value is not possible. + - In PySpark, `df.replace` does not allow to omit `value` when `to_replace` is not a dictionary. Previously, `value` could be omitted in the other cases and had `None` by default, which is counterintuitive and error prone. ## Upgrading From Spark SQL 2.1 to 2.2 diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index 4d142c91629cc..58218918693ca 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -54,6 +54,7 @@ from pyspark.taskcontext import TaskContext from pyspark.profiler import Profiler, BasicProfiler from pyspark.version import __version__ +from pyspark._globals import _NoValue def since(version): diff --git a/python/pyspark/_globals.py b/python/pyspark/_globals.py new file mode 100644 index 0000000000000..8e6099db09963 --- /dev/null +++ b/python/pyspark/_globals.py @@ -0,0 +1,70 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Module defining global singleton classes. + +This module raises a RuntimeError if an attempt to reload it is made. In that +way the identities of the classes defined here are fixed and will remain so +even if pyspark itself is reloaded. In particular, a function like the following +will still work correctly after pyspark is reloaded: + + def foo(arg=pyspark._NoValue): + if arg is pyspark._NoValue: + ... + +See gh-7844 for a discussion of the reload problem that motivated this module. + +Note that this approach is taken after from NumPy. +""" + +__ALL__ = ['_NoValue'] + + +# Disallow reloading this module so as to preserve the identities of the +# classes defined here. +if '_is_loaded' in globals(): + raise RuntimeError('Reloading pyspark._globals is not allowed') +_is_loaded = True + + +class _NoValueType(object): + """Special keyword value. + + The instance of this class may be used as the default value assigned to a + deprecated keyword in order to check if it has been given a user defined + value. + + This class was copied from NumPy. 
+ """ + __instance = None + + def __new__(cls): + # ensure that only one instance exists + if not cls.__instance: + cls.__instance = super(_NoValueType, cls).__new__(cls) + return cls.__instance + + # needed for python 2 to preserve identity through a pickle + def __reduce__(self): + return (self.__class__, ()) + + def __repr__(self): + return "" + + +_NoValue = _NoValueType() diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 8ec24db8717b2..faee870a2d2e2 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -27,7 +27,7 @@ import warnings -from pyspark import copy_func, since +from pyspark import copy_func, since, _NoValue from pyspark.rdd import RDD, _load_from_socket, ignore_unicode_prefix from pyspark.serializers import ArrowSerializer, BatchedSerializer, PickleSerializer, \ UTF8Deserializer @@ -1532,7 +1532,7 @@ def fillna(self, value, subset=None): return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sql_ctx) @since(1.4) - def replace(self, to_replace, value=None, subset=None): + def replace(self, to_replace, value=_NoValue, subset=None): """Returns a new :class:`DataFrame` replacing a value with another value. :func:`DataFrame.replace` and :func:`DataFrameNaFunctions.replace` are aliases of each other. @@ -1545,8 +1545,8 @@ def replace(self, to_replace, value=None, subset=None): :param to_replace: bool, int, long, float, string, list or dict. Value to be replaced. - If the value is a dict, then `value` is ignored and `to_replace` must be a - mapping between a value and a replacement. + If the value is a dict, then `value` is ignored or can be omitted, and `to_replace` + must be a mapping between a value and a replacement. :param value: bool, int, long, float, string, list or None. The replacement value must be a bool, int, long, float, string or None. If `value` is a list, `value` should be of the same length and type as `to_replace`. 
@@ -1577,6 +1577,16 @@ def replace(self, to_replace, value=None, subset=None): |null| null|null| +----+------+----+ + >>> df4.na.replace({'Alice': None}).show() + +----+------+----+ + | age|height|name| + +----+------+----+ + | 10| 80|null| + | 5| null| Bob| + |null| null| Tom| + |null| null|null| + +----+------+----+ + >>> df4.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show() +----+------+----+ | age|height|name| @@ -1587,6 +1597,12 @@ def replace(self, to_replace, value=None, subset=None): |null| null|null| +----+------+----+ """ + if value is _NoValue: + if isinstance(to_replace, dict): + value = None + else: + raise TypeError("value argument is required when to_replace is not a dictionary.") + # Helper functions def all_of(types): """Given a type or tuple of types and a sequence of xs @@ -2047,7 +2063,7 @@ def fill(self, value, subset=None): fill.__doc__ = DataFrame.fillna.__doc__ - def replace(self, to_replace, value, subset=None): + def replace(self, to_replace, value=_NoValue, subset=None): return self.df.replace(to_replace, value, subset) replace.__doc__ = DataFrame.replace.__doc__ diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 0e1b2ec6f35e0..59e08c72744b3 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -2237,11 +2237,6 @@ def test_replace(self): .replace(False, True).first()) self.assertTupleEqual(row, (True, True)) - # replace list while value is not given (default to None) - row = self.spark.createDataFrame( - [(u'Alice', 10, 80.0)], schema).replace(["Alice", "Bob"]).first() - self.assertTupleEqual(row, (None, 10, 80.0)) - # replace string with None and then drop None rows row = self.spark.createDataFrame( [(u'Alice', 10, 80.0)], schema).replace(u'Alice', None).dropna() @@ -2277,6 +2272,12 @@ def test_replace(self): self.spark.createDataFrame( [(u'Alice', 10, 80.1)], schema).replace({u"Alice": u"Bob", 10: 20}).first() + with self.assertRaisesRegexp( + TypeError, + 'value argument is required when to_replace is not a dictionary.'): + self.spark.createDataFrame( + [(u'Alice', 10, 80.0)], schema).replace(["Alice", "Bob"]).first() + def test_capture_analysis_exception(self): self.assertRaises(AnalysisException, lambda: self.spark.sql("select abc")) self.assertRaises(AnalysisException, lambda: self.df.selectExpr("a + b")) From 08eb95f609f5d356c89dedcefa768b12a7a8b96c Mon Sep 17 00:00:00 2001 From: liuxian Date: Fri, 9 Feb 2018 08:45:06 -0600 Subject: [PATCH 269/566] [SPARK-23358][CORE] When the number of partitions is greater than 2^28, it will result in an error result ## What changes were proposed in this pull request? In the `checkIndexAndDataFile`,the `blocks` is the ` Int` type, when it is greater than 2^28, `blocks*8` will overflow, and this will result in an error result. In fact, `blocks` is actually the number of partitions. ## How was this patch tested? Manual test Author: liuxian Closes #20544 from 10110346/overflow. 
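As a side note on the arithmetic (an illustration only, not part of the patch): with `blocks = 2^28`, the product `(blocks + 1) * 8` equals `2^31 + 8`, which is past `Int.MaxValue` (`2^31 - 1`), so the 32-bit result wraps to a negative number and can never match the real index file length. The snippet below emulates JVM `Int` arithmetic from Python; the helper `as_jvm_int` is hypothetical and exists only for this demonstration.

```python
def as_jvm_int(n):
    # Interpret n the way a 32-bit two's-complement JVM Int would store it.
    n &= 0xFFFFFFFF
    return n - (1 << 32) if n >= (1 << 31) else n

blocks = 1 << 28                       # 2^28 partitions, as in the report above
print((blocks + 1) * 8)                # 2147483656 -- the intended index file length
print(as_jvm_int((blocks + 1) * 8))    # -2147483640 -- what 32-bit Int math produces
```

Widening one operand, as the patch does with `(blocks + 1) * 8L`, forces the multiplication into 64-bit `Long` arithmetic and avoids the wraparound.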
(cherry picked from commit f77270b8811bbd8956d0c08fa556265d2c5ee20e) Signed-off-by: Sean Owen --- .../org/apache/spark/shuffle/IndexShuffleBlockResolver.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala index 266ee42e39cca..2414b94dcf379 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala @@ -84,7 +84,7 @@ private[spark] class IndexShuffleBlockResolver( */ private def checkIndexAndDataFile(index: File, data: File, blocks: Int): Array[Long] = { // the index file should have `block + 1` longs as offset. - if (index.length() != (blocks + 1) * 8) { + if (index.length() != (blocks + 1) * 8L) { return null } val lengths = new Array[Long](blocks) From 49771ac8da8e68e8412d9f5d181953eaf0de7973 Mon Sep 17 00:00:00 2001 From: Jacek Laskowski Date: Fri, 9 Feb 2018 18:18:30 -0600 Subject: [PATCH 270/566] [MINOR][HIVE] Typo fixes ## What changes were proposed in this pull request? Typo fixes (with expanding a Hive property) ## How was this patch tested? local build. Awaiting Jenkins Author: Jacek Laskowski Closes #20550 from jaceklaskowski/hiveutils-typos. (cherry picked from commit 557938e2839afce26a10a849a2a4be8fc4580427) Signed-off-by: Sean Owen --- .../src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala index 93f3f38e52aa9..c448c5a9821be 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala @@ -304,7 +304,7 @@ private[spark] object HiveUtils extends Logging { throw new IllegalArgumentException( "Builtin jars can only be used when hive execution version == hive metastore version. " + s"Execution: $builtinHiveVersion != Metastore: $hiveMetastoreVersion. " + - "Specify a vaild path to the correct hive jars using $HIVE_METASTORE_JARS " + + s"Specify a valid path to the correct hive jars using ${HIVE_METASTORE_JARS.key} " + s"or change ${HIVE_METASTORE_VERSION.key} to $builtinHiveVersion.") } @@ -324,7 +324,7 @@ private[spark] object HiveUtils extends Logging { if (jars.length == 0) { throw new IllegalArgumentException( "Unable to locate hive jars to connect to metastore. " + - "Please set spark.sql.hive.metastore.jars.") + s"Please set ${HIVE_METASTORE_JARS.key}.") } logInfo( From f3a9a7f6b6eac4421bd74ff73a74105982604ce6 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Fri, 9 Feb 2018 16:21:47 -0800 Subject: [PATCH 271/566] [SPARK-23275][SQL] fix the thread leaking in hive/tests ## What changes were proposed in this pull request? This is a follow up of https://github.com/apache/spark/pull/20441. The two lines actually can trigger the hive metastore bug: https://issues.apache.org/jira/browse/HIVE-16844 The two configs are not in the default `ObjectStore` properties, so any run hive commands after these two lines will set the `propsChanged` flag in the `ObjectStore.setConf` and then cause thread leaks. I don't think the two lines are very useful. They can be removed safely. ## How was this patch tested? (Please explain how this patch was tested. E.g. 
unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Feng Liu Closes #20562 from liufengdb/fix-omm. (cherry picked from commit 6d7c38330e68c7beb10f54eee8b4f607ee3c4136) Signed-off-by: gatorsmile --- .../main/scala/org/apache/spark/sql/hive/test/TestHive.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index 59708e7a0f2ff..19028939f3673 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -530,8 +530,6 @@ private[hive] class TestHiveSparkSession( // For some reason, RESET does not reset the following variables... // https://issues.apache.org/jira/browse/HIVE-9004 metadataHive.runSqlHive("set hive.table.parameters.default=") - metadataHive.runSqlHive("set datanucleus.cache.collections=true") - metadataHive.runSqlHive("set datanucleus.cache.collections.lazy=true") // Lots of tests fail if we do not change the partition whitelist from the default. metadataHive.runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*") From b7571b9bfcced2e08f87e815c2ea9474bfd5fe2a Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Sun, 11 Feb 2018 01:08:02 +0900 Subject: [PATCH 272/566] [SPARK-23360][SQL][PYTHON] Get local timezone from environment via pytz, or dateutil. ## What changes were proposed in this pull request? Currently we use `tzlocal()` to get Python local timezone, but it sometimes causes unexpected behavior. I changed the way to get Python local timezone to use pytz if the timezone is specified in environment variable, or timezone file via dateutil . ## How was this patch tested? Added a test and existing tests. Author: Takuya UESHIN Closes #20559 from ueshin/issues/SPARK-23360/master. 
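As a rough sketch of the idea (mirroring the `_get_local_timezone` helper added in the diff below; the `TZ` value and sample timestamp are chosen only for this demo): the timezone string handed to pandas is either the `TZ` environment variable, resolved through pytz, or the special string `'dateutil/:'`, which tells pandas to let dateutil read the system timezone configuration.

```python
import os
import pandas as pd

def _get_local_timezone():
    # Prefer the TZ environment variable (resolved by pytz via pandas); otherwise
    # fall back to 'dateutil/:', i.e. dateutil's view of the system local timezone.
    return os.environ.get('TZ', 'dateutil/:')

os.environ['TZ'] = 'America/Los_Angeles'   # demo value only
ts = pd.Series(pd.to_datetime(['2015-10-31 22:30:00']))
print(ts.dt.tz_localize(_get_local_timezone()).dt.tz_convert('UTC'))
# 0   2015-11-01 05:30:00+00:00
```

Unlike `tzlocal()`, an explicit timezone name keeps the conversion stable across environments, which is what the new DST test below exercises.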
(cherry picked from commit 97a224a855c4410b2dfb9c0bcc6aae583bd28e92) Signed-off-by: hyukjinkwon --- python/pyspark/sql/tests.py | 28 ++++++++++++++++++++++++++++ python/pyspark/sql/types.py | 23 +++++++++++++++++++---- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 59e08c72744b3..95a057d4921e8 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -2862,6 +2862,34 @@ def test_create_dataframe_required_pandas_not_found(self): "d": [pd.Timestamp.now().date()]}) self.spark.createDataFrame(pdf) + # Regression test for SPARK-23360 + @unittest.skipIf(not _have_pandas, _pandas_requirement_message) + def test_create_dateframe_from_pandas_with_dst(self): + import pandas as pd + from datetime import datetime + + pdf = pd.DataFrame({'time': [datetime(2015, 10, 31, 22, 30)]}) + + df = self.spark.createDataFrame(pdf) + self.assertPandasEqual(pdf, df.toPandas()) + + orig_env_tz = os.environ.get('TZ', None) + orig_session_tz = self.spark.conf.get('spark.sql.session.timeZone') + try: + tz = 'America/Los_Angeles' + os.environ['TZ'] = tz + time.tzset() + self.spark.conf.set('spark.sql.session.timeZone', tz) + + df = self.spark.createDataFrame(pdf) + self.assertPandasEqual(pdf, df.toPandas()) + finally: + del os.environ['TZ'] + if orig_env_tz is not None: + os.environ['TZ'] = orig_env_tz + time.tzset() + self.spark.conf.set('spark.sql.session.timeZone', orig_session_tz) + class HiveSparkSubmitTests(SparkSubmitTests): diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 093dae5a22e1f..2599dc5fdc599 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1709,6 +1709,21 @@ def _check_dataframe_convert_date(pdf, schema): return pdf +def _get_local_timezone(): + """ Get local timezone using pytz with environment variable, or dateutil. + + If there is a 'TZ' environment variable, pass it to pandas to use pytz and use it as timezone + string, otherwise use the special word 'dateutil/:' which means that pandas uses dateutil and + it reads system configuration to know the system local timezone. + + See also: + - https://github.com/pandas-dev/pandas/blob/0.19.x/pandas/tslib.pyx#L1753 + - https://github.com/dateutil/dateutil/blob/2.6.1/dateutil/tz/tz.py#L1338 + """ + import os + return os.environ.get('TZ', 'dateutil/:') + + def _check_dataframe_localize_timestamps(pdf, timezone): """ Convert timezone aware timestamps to timezone-naive in the specified timezone or local timezone @@ -1721,7 +1736,7 @@ def _check_dataframe_localize_timestamps(pdf, timezone): require_minimum_pandas_version() from pandas.api.types import is_datetime64tz_dtype - tz = timezone or 'tzlocal()' + tz = timezone or _get_local_timezone() for column, series in pdf.iteritems(): # TODO: handle nested timestamps, such as ArrayType(TimestampType())? if is_datetime64tz_dtype(series.dtype): @@ -1744,7 +1759,7 @@ def _check_series_convert_timestamps_internal(s, timezone): from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype # TODO: handle nested timestamps, such as ArrayType(TimestampType())? 
if is_datetime64_dtype(s.dtype): - tz = timezone or 'tzlocal()' + tz = timezone or _get_local_timezone() return s.dt.tz_localize(tz).dt.tz_convert('UTC') elif is_datetime64tz_dtype(s.dtype): return s.dt.tz_convert('UTC') @@ -1766,8 +1781,8 @@ def _check_series_convert_timestamps_localize(s, from_timezone, to_timezone): import pandas as pd from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype - from_tz = from_timezone or 'tzlocal()' - to_tz = to_timezone or 'tzlocal()' + from_tz = from_timezone or _get_local_timezone() + to_tz = to_timezone or _get_local_timezone() # TODO: handle nested timestamps, such as ArrayType(TimestampType())? if is_datetime64tz_dtype(s.dtype): return s.dt.tz_convert(to_tz).dt.tz_localize(None) From 9fa7b0e107c283557648160195ce179077752e4c Mon Sep 17 00:00:00 2001 From: Li Jin Date: Sun, 11 Feb 2018 17:31:35 +0900 Subject: [PATCH 273/566] [SPARK-23314][PYTHON] Add ambiguous=False when localizing tz-naive timestamps in Arrow codepath to deal with dst ## What changes were proposed in this pull request? When tz_localize a tz-naive timetamp, pandas will throw exception if the timestamp is during daylight saving time period, e.g., `2015-11-01 01:30:00`. This PR fixes this issue by setting `ambiguous=False` when calling tz_localize, which is the same default behavior of pytz. ## How was this patch tested? Add `test_timestamp_dst` Author: Li Jin Closes #20537 from icexelloss/SPARK-23314. (cherry picked from commit a34fce19bc0ee5a7e36c6ecba75d2aeb70fdcbc7) Signed-off-by: hyukjinkwon --- python/pyspark/sql/tests.py | 39 +++++++++++++++++++++++++++++++++++++ python/pyspark/sql/types.py | 37 ++++++++++++++++++++++++++++++++--- 2 files changed, 73 insertions(+), 3 deletions(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 95a057d4921e8..0f76c963664e1 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3644,6 +3644,21 @@ def test_createDataFrame_with_int_col_names(self): self.assertEqual(pdf_col_names, df.columns) self.assertEqual(pdf_col_names, df_arrow.columns) + # Regression test for SPARK-23314 + def test_timestamp_dst(self): + import pandas as pd + # Daylight saving time for Los Angeles for 2015 is Sun, Nov 1 at 2:00 am + dt = [datetime.datetime(2015, 11, 1, 0, 30), + datetime.datetime(2015, 11, 1, 1, 30), + datetime.datetime(2015, 11, 1, 2, 30)] + pdf = pd.DataFrame({'time': dt}) + + df_from_python = self.spark.createDataFrame(dt, 'timestamp').toDF('time') + df_from_pandas = self.spark.createDataFrame(pdf) + + self.assertPandasEqual(pdf, df_from_python.toPandas()) + self.assertPandasEqual(pdf, df_from_pandas.toPandas()) + @unittest.skipIf( not _have_pandas or not _have_pyarrow, @@ -4285,6 +4300,18 @@ def test_register_vectorized_udf_basic(self): self.assertEquals(expected.collect(), res1.collect()) self.assertEquals(expected.collect(), res2.collect()) + # Regression test for SPARK-23314 + def test_timestamp_dst(self): + from pyspark.sql.functions import pandas_udf + # Daylight saving time for Los Angeles for 2015 is Sun, Nov 1 at 2:00 am + dt = [datetime.datetime(2015, 11, 1, 0, 30), + datetime.datetime(2015, 11, 1, 1, 30), + datetime.datetime(2015, 11, 1, 2, 30)] + df = self.spark.createDataFrame(dt, 'timestamp').toDF('time') + foo_udf = pandas_udf(lambda x: x, 'timestamp') + result = df.withColumn('time', foo_udf(df.time)) + self.assertEquals(df.collect(), result.collect()) + @unittest.skipIf( not _have_pandas or not _have_pyarrow, @@ -4462,6 +4489,18 @@ def test_unsupported_types(self): with 
self.assertRaisesRegexp(Exception, 'Unsupported data type'): df.groupby('id').apply(f).collect() + # Regression test for SPARK-23314 + def test_timestamp_dst(self): + from pyspark.sql.functions import pandas_udf, PandasUDFType + # Daylight saving time for Los Angeles for 2015 is Sun, Nov 1 at 2:00 am + dt = [datetime.datetime(2015, 11, 1, 0, 30), + datetime.datetime(2015, 11, 1, 1, 30), + datetime.datetime(2015, 11, 1, 2, 30)] + df = self.spark.createDataFrame(dt, 'timestamp').toDF('time') + foo_udf = pandas_udf(lambda pdf: pdf, 'time timestamp', PandasUDFType.GROUPED_MAP) + result = df.groupby('time').apply(foo_udf).sort('time') + self.assertPandasEqual(df.toPandas(), result.toPandas()) + if __name__ == "__main__": from pyspark.sql.tests import * diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 2599dc5fdc599..f7141b4549e4e 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1759,8 +1759,38 @@ def _check_series_convert_timestamps_internal(s, timezone): from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype # TODO: handle nested timestamps, such as ArrayType(TimestampType())? if is_datetime64_dtype(s.dtype): + # When tz_localize a tz-naive timestamp, the result is ambiguous if the tz-naive + # timestamp is during the hour when the clock is adjusted backward during due to + # daylight saving time (dst). + # E.g., for America/New_York, the clock is adjusted backward on 2015-11-01 2:00 to + # 2015-11-01 1:00 from dst-time to standard time, and therefore, when tz_localize + # a tz-naive timestamp 2015-11-01 1:30 with America/New_York timezone, it can be either + # dst time (2015-01-01 1:30-0400) or standard time (2015-11-01 1:30-0500). + # + # Here we explicit choose to use standard time. This matches the default behavior of + # pytz. + # + # Here are some code to help understand this behavior: + # >>> import datetime + # >>> import pandas as pd + # >>> import pytz + # >>> + # >>> t = datetime.datetime(2015, 11, 1, 1, 30) + # >>> ts = pd.Series([t]) + # >>> tz = pytz.timezone('America/New_York') + # >>> + # >>> ts.dt.tz_localize(tz, ambiguous=True) + # 0 2015-11-01 01:30:00-04:00 + # dtype: datetime64[ns, America/New_York] + # >>> + # >>> ts.dt.tz_localize(tz, ambiguous=False) + # 0 2015-11-01 01:30:00-05:00 + # dtype: datetime64[ns, America/New_York] + # >>> + # >>> str(tz.localize(t)) + # '2015-11-01 01:30:00-05:00' tz = timezone or _get_local_timezone() - return s.dt.tz_localize(tz).dt.tz_convert('UTC') + return s.dt.tz_localize(tz, ambiguous=False).dt.tz_convert('UTC') elif is_datetime64tz_dtype(s.dtype): return s.dt.tz_convert('UTC') else: @@ -1788,8 +1818,9 @@ def _check_series_convert_timestamps_localize(s, from_timezone, to_timezone): return s.dt.tz_convert(to_tz).dt.tz_localize(None) elif is_datetime64_dtype(s.dtype) and from_tz != to_tz: # `s.dt.tz_localize('tzlocal()')` doesn't work properly when including NaT. - return s.apply(lambda ts: ts.tz_localize(from_tz).tz_convert(to_tz).tz_localize(None) - if ts is not pd.NaT else pd.NaT) + return s.apply( + lambda ts: ts.tz_localize(from_tz, ambiguous=False).tz_convert(to_tz).tz_localize(None) + if ts is not pd.NaT else pd.NaT) else: return s From 8875e47cec01ae8da4ffb855409b54089e1016fb Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Sun, 11 Feb 2018 22:16:47 +0900 Subject: [PATCH 274/566] [SPARK-23387][SQL][PYTHON][TEST][BRANCH-2.3] Backport assertPandasEqual to branch-2.3. ## What changes were proposed in this pull request? 
When backporting a pr with tests using `assertPandasEqual` from master to branch-2.3, the tests fail because `assertPandasEqual` doesn't exist in branch-2.3. We should backport `assertPandasEqual` to branch-2.3 to avoid the failures. ## How was this patch tested? Modified tests. Author: Takuya UESHIN Closes #20577 from ueshin/issues/SPARK-23387/branch-2.3. --- python/pyspark/sql/tests.py | 44 ++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 0f76c963664e1..5480144f33c9e 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -195,6 +195,12 @@ def tearDownClass(cls): ReusedPySparkTestCase.tearDownClass() cls.spark.stop() + def assertPandasEqual(self, expected, result): + msg = ("DataFrames are not equal: " + + "\n\nExpected:\n%s\n%s" % (expected, expected.dtypes) + + "\n\nResult:\n%s\n%s" % (result, result.dtypes)) + self.assertTrue(expected.equals(result), msg=msg) + class DataTypeTests(unittest.TestCase): # regression test for SPARK-6055 @@ -3422,12 +3428,6 @@ def tearDownClass(cls): time.tzset() ReusedSQLTestCase.tearDownClass() - def assertFramesEqual(self, df_with_arrow, df_without): - msg = ("DataFrame from Arrow is not equal" + - ("\n\nWith Arrow:\n%s\n%s" % (df_with_arrow, df_with_arrow.dtypes)) + - ("\n\nWithout:\n%s\n%s" % (df_without, df_without.dtypes))) - self.assertTrue(df_without.equals(df_with_arrow), msg=msg) - def create_pandas_data_frame(self): import pandas as pd import numpy as np @@ -3466,8 +3466,8 @@ def test_toPandas_arrow_toggle(self): df = self.spark.createDataFrame(self.data, schema=self.schema) pdf, pdf_arrow = self._toPandas_arrow_toggle(df) expected = self.create_pandas_data_frame() - self.assertFramesEqual(expected, pdf) - self.assertFramesEqual(expected, pdf_arrow) + self.assertPandasEqual(expected, pdf) + self.assertPandasEqual(expected, pdf_arrow) def test_toPandas_respect_session_timezone(self): df = self.spark.createDataFrame(self.data, schema=self.schema) @@ -3478,11 +3478,11 @@ def test_toPandas_respect_session_timezone(self): self.spark.conf.set("spark.sql.execution.pandas.respectSessionTimeZone", "false") try: pdf_la, pdf_arrow_la = self._toPandas_arrow_toggle(df) - self.assertFramesEqual(pdf_arrow_la, pdf_la) + self.assertPandasEqual(pdf_arrow_la, pdf_la) finally: self.spark.conf.set("spark.sql.execution.pandas.respectSessionTimeZone", "true") pdf_ny, pdf_arrow_ny = self._toPandas_arrow_toggle(df) - self.assertFramesEqual(pdf_arrow_ny, pdf_ny) + self.assertPandasEqual(pdf_arrow_ny, pdf_ny) self.assertFalse(pdf_ny.equals(pdf_la)) @@ -3492,7 +3492,7 @@ def test_toPandas_respect_session_timezone(self): if isinstance(field.dataType, TimestampType): pdf_la_corrected[field.name] = _check_series_convert_timestamps_local_tz( pdf_la_corrected[field.name], timezone) - self.assertFramesEqual(pdf_ny, pdf_la_corrected) + self.assertPandasEqual(pdf_ny, pdf_la_corrected) finally: self.spark.conf.set("spark.sql.session.timeZone", orig_tz) @@ -3500,7 +3500,7 @@ def test_pandas_round_trip(self): pdf = self.create_pandas_data_frame() df = self.spark.createDataFrame(self.data, schema=self.schema) pdf_arrow = df.toPandas() - self.assertFramesEqual(pdf_arrow, pdf) + self.assertPandasEqual(pdf_arrow, pdf) def test_filtered_frame(self): df = self.spark.range(3).toDF("i") @@ -3558,7 +3558,7 @@ def test_createDataFrame_with_schema(self): df = self.spark.createDataFrame(pdf, schema=self.schema) self.assertEquals(self.schema, df.schema) 
pdf_arrow = df.toPandas() - self.assertFramesEqual(pdf_arrow, pdf) + self.assertPandasEqual(pdf_arrow, pdf) def test_createDataFrame_with_incorrect_schema(self): pdf = self.create_pandas_data_frame() @@ -4318,12 +4318,6 @@ def test_timestamp_dst(self): _pandas_requirement_message or _pyarrow_requirement_message) class GroupedMapPandasUDFTests(ReusedSQLTestCase): - def assertFramesEqual(self, expected, result): - msg = ("DataFrames are not equal: " + - ("\n\nExpected:\n%s\n%s" % (expected, expected.dtypes)) + - ("\n\nResult:\n%s\n%s" % (result, result.dtypes))) - self.assertTrue(expected.equals(result), msg=msg) - @property def data(self): from pyspark.sql.functions import array, explode, col, lit @@ -4347,7 +4341,7 @@ def test_simple(self): result = df.groupby('id').apply(foo_udf).sort('id').toPandas() expected = df.toPandas().groupby('id').apply(foo_udf.func).reset_index(drop=True) - self.assertFramesEqual(expected, result) + self.assertPandasEqual(expected, result) def test_register_grouped_map_udf(self): from pyspark.sql.functions import pandas_udf, PandasUDFType @@ -4371,7 +4365,7 @@ def foo(pdf): result = df.groupby('id').apply(foo).sort('id').toPandas() expected = df.toPandas().groupby('id').apply(foo.func).reset_index(drop=True) - self.assertFramesEqual(expected, result) + self.assertPandasEqual(expected, result) def test_coerce(self): from pyspark.sql.functions import pandas_udf, PandasUDFType @@ -4386,7 +4380,7 @@ def test_coerce(self): result = df.groupby('id').apply(foo).sort('id').toPandas() expected = df.toPandas().groupby('id').apply(foo.func).reset_index(drop=True) expected = expected.assign(v=expected.v.astype('float64')) - self.assertFramesEqual(expected, result) + self.assertPandasEqual(expected, result) def test_complex_groupby(self): from pyspark.sql.functions import pandas_udf, col, PandasUDFType @@ -4405,7 +4399,7 @@ def normalize(pdf): expected = pdf.groupby(pdf['id'] % 2 == 0).apply(normalize.func) expected = expected.sort_values(['id', 'v']).reset_index(drop=True) expected = expected.assign(norm=expected.norm.astype('float64')) - self.assertFramesEqual(expected, result) + self.assertPandasEqual(expected, result) def test_empty_groupby(self): from pyspark.sql.functions import pandas_udf, col, PandasUDFType @@ -4424,7 +4418,7 @@ def normalize(pdf): expected = normalize.func(pdf) expected = expected.sort_values(['id', 'v']).reset_index(drop=True) expected = expected.assign(norm=expected.norm.astype('float64')) - self.assertFramesEqual(expected, result) + self.assertPandasEqual(expected, result) def test_datatype_string(self): from pyspark.sql.functions import pandas_udf, PandasUDFType @@ -4438,7 +4432,7 @@ def test_datatype_string(self): result = df.groupby('id').apply(foo_udf).sort('id').toPandas() expected = df.toPandas().groupby('id').apply(foo_udf.func).reset_index(drop=True) - self.assertFramesEqual(expected, result) + self.assertPandasEqual(expected, result) def test_wrong_return_type(self): from pyspark.sql.functions import pandas_udf, PandasUDFType From 7e2a2b33c0664b3638a1428688b28f68323994c1 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Mon, 12 Feb 2018 00:03:49 +0800 Subject: [PATCH 275/566] [SPARK-23376][SQL] creating UnsafeKVExternalSorter with BytesToBytesMap may fail ## What changes were proposed in this pull request? This is a long-standing bug in `UnsafeKVExternalSorter` and was reported in the dev list multiple times. 
When creating `UnsafeKVExternalSorter` with `BytesToBytesMap`, we need to create an `UnsafeInMemorySorter` to sort the data in the `BytesToBytesMap`. The data format of the sorter and the map is the same, so no data movement is required. However, both the sorter and the map need a pointer array for some bookkeeping work. There is an optimization in `UnsafeKVExternalSorter`: reuse the pointer array between the sorter and the map, to avoid an extra memory allocation. This sounds like a reasonable optimization: the length of the `BytesToBytesMap` pointer array is at least 4 times the number of keys (to avoid hash collisions, the hash table size should be at least 2 times the number of keys, and each key occupies 2 slots), and `UnsafeInMemorySorter` needs a pointer array 4 times the number of entries, so reusing the pointer array appears safe. However, the number of keys in the map doesn't equal the number of entries in the map, because `BytesToBytesMap` supports duplicated keys. This breaks the assumption of the above optimization, so we may run out of space when inserting data into the sorter and hit the error ``` java.lang.IllegalStateException: There is no space for new record at org.apache.spark.util.collection.unsafe.sort.UnsafeInMemorySorter.insertRecord(UnsafeInMemorySorter.java:239) at org.apache.spark.sql.execution.UnsafeKVExternalSorter.<init>(UnsafeKVExternalSorter.java:149) ... ``` This PR fixes the bug by creating a new pointer array if the existing one is not big enough. ## How was this patch tested? A new test. Author: Wenchen Fan Closes #20561 from cloud-fan/bug. (cherry picked from commit 4bbd7443ebb005f81ed6bc39849940ac8db3b3cc) Signed-off-by: Wenchen Fan --- .../sql/execution/UnsafeKVExternalSorter.java | 31 +++++++++++---- .../UnsafeKVExternalSorterSuite.scala | 39 +++++++++++++++++++ 2 files changed, 62 insertions(+), 8 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java index b0b5383a081a0..9eb03430a7db2 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java @@ -34,6 +34,7 @@ import org.apache.spark.storage.BlockManager; import org.apache.spark.unsafe.KVIterator; import org.apache.spark.unsafe.Platform; +import org.apache.spark.unsafe.array.LongArray; import org.apache.spark.unsafe.map.BytesToBytesMap; import org.apache.spark.unsafe.memory.MemoryBlock; import org.apache.spark.util.collection.unsafe.sort.*; @@ -98,19 +99,33 @@ public UnsafeKVExternalSorter( numElementsForSpillThreshold, canUseRadixSort); } else { - // The array will be used to do in-place sort, which require half of the space to be empty. - // Note: each record in the map takes two entries in the array, one is record pointer, - // another is the key prefix. - assert(map.numKeys() * 2 <= map.getArray().size() / 2); - // During spilling, the array in map will not be used, so we can borrow that and use it - // as the underlying array for in-memory sorter (it's always large enough). - // Since we will not grow the array, it's fine to pass `null` as consumer. + // During spilling, the pointer array in `BytesToBytesMap` will not be used, so we can borrow + // that and use it as the pointer array for `UnsafeInMemorySorter`. 
+ LongArray pointerArray = map.getArray(); + // `BytesToBytesMap`'s pointer array is only guaranteed to hold all the distinct keys, but + // `UnsafeInMemorySorter`'s pointer array need to hold all the entries. Since + // `BytesToBytesMap` can have duplicated keys, here we need a check to make sure the pointer + // array can hold all the entries in `BytesToBytesMap`. + // The pointer array will be used to do in-place sort, which requires half of the space to be + // empty. Note: each record in the map takes two entries in the pointer array, one is record + // pointer, another is key prefix. So the required size of pointer array is `numRecords * 4`. + // TODO: It's possible to change UnsafeInMemorySorter to have multiple entries with same key, + // so that we can always reuse the pointer array. + if (map.numValues() > pointerArray.size() / 4) { + // Here we ask the map to allocate memory, so that the memory manager won't ask the map + // to spill, if the memory is not enough. + pointerArray = map.allocateArray(map.numValues() * 4L); + } + + // Since the pointer array(either reuse the one in the map, or create a new one) is guaranteed + // to be large enough, it's fine to pass `null` as consumer because we won't allocate more + // memory. final UnsafeInMemorySorter inMemSorter = new UnsafeInMemorySorter( null, taskMemoryManager, comparatorSupplier.get(), prefixComparator, - map.getArray(), + pointerArray, canUseRadixSort); // We cannot use the destructive iterator here because we are reusing the existing memory diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala index 6af9f8b77f8d3..bf588d3bb7841 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.{InterpretedOrdering, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.map.BytesToBytesMap /** * Test suite for [[UnsafeKVExternalSorter]], with randomly generated test data. @@ -205,4 +206,42 @@ class UnsafeKVExternalSorterSuite extends SparkFunSuite with SharedSQLContext { spill = true ) } + + test("SPARK-23376: Create UnsafeKVExternalSorter with BytesToByteMap having duplicated keys") { + val memoryManager = new TestMemoryManager(new SparkConf()) + val taskMemoryManager = new TaskMemoryManager(memoryManager, 0) + val map = new BytesToBytesMap(taskMemoryManager, 64, taskMemoryManager.pageSizeBytes()) + + // Key/value are a unsafe rows with a single int column + val schema = new StructType().add("i", IntegerType) + val key = new UnsafeRow(1) + key.pointTo(new Array[Byte](32), 32) + key.setInt(0, 1) + val value = new UnsafeRow(1) + value.pointTo(new Array[Byte](32), 32) + value.setInt(0, 2) + + for (_ <- 1 to 65) { + val loc = map.lookup(key.getBaseObject, key.getBaseOffset, key.getSizeInBytes) + loc.append( + key.getBaseObject, key.getBaseOffset, key.getSizeInBytes, + value.getBaseObject, value.getBaseOffset, value.getSizeInBytes) + } + + // Make sure we can successfully create a UnsafeKVExternalSorter with a `BytesToBytesMap` + // which has duplicated keys and the number of entries exceeds its capacity. 
+ try { + TaskContext.setTaskContext(new TaskContextImpl(0, 0, 0, 0, 0, taskMemoryManager, null, null)) + new UnsafeKVExternalSorter( + schema, + schema, + sparkContext.env.blockManager, + sparkContext.env.serializerManager, + taskMemoryManager.pageSizeBytes(), + Int.MaxValue, + map) + } finally { + TaskContext.unset() + } + } } From 79e8650cccb00c7886efba6dd691d9733084cb81 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Sun, 11 Feb 2018 23:46:23 -0800 Subject: [PATCH 276/566] [SPARK-23390][SQL] Flaky Test Suite: FileBasedDataSourceSuite in Spark 2.3/hadoop 2.7 ## What changes were proposed in this pull request? This test only fails with sbt on Hadoop 2.7, I can't reproduce it locally, but here is my speculation by looking at the code: 1. FileSystem.delete doesn't delete the directory entirely, somehow we can still open the file as a 0-length empty file.(just speculation) 2. ORC intentionally allow empty files, and the reader fails during reading without closing the file stream. This PR improves the test to make sure all files are deleted and can't be opened. ## How was this patch tested? N/A Author: Wenchen Fan Closes #20584 from cloud-fan/flaky-test. (cherry picked from commit 6efd5d117e98074d1b16a5c991fbd38df9aa196e) Signed-off-by: Sameer Agarwal --- .../spark/sql/FileBasedDataSourceSuite.scala | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala index 640d6b1583663..2e332362ea644 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql +import java.io.FileNotFoundException + import org.apache.hadoop.fs.Path import org.apache.spark.SparkException @@ -102,17 +104,27 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { def testIgnoreMissingFiles(): Unit = { withTempDir { dir => val basePath = dir.getCanonicalPath + Seq("0").toDF("a").write.format(format).save(new Path(basePath, "first").toString) Seq("1").toDF("a").write.format(format).save(new Path(basePath, "second").toString) + val thirdPath = new Path(basePath, "third") + val fs = thirdPath.getFileSystem(spark.sparkContext.hadoopConfiguration) Seq("2").toDF("a").write.format(format).save(thirdPath.toString) + val files = fs.listStatus(thirdPath).filter(_.isFile).map(_.getPath) + val df = spark.read.format(format).load( new Path(basePath, "first").toString, new Path(basePath, "second").toString, new Path(basePath, "third").toString) - val fs = thirdPath.getFileSystem(spark.sparkContext.hadoopConfiguration) + // Make sure all data files are deleted and can't be opened. + files.foreach(f => fs.delete(f, false)) assert(fs.delete(thirdPath, true)) + for (f <- files) { + intercept[FileNotFoundException](fs.open(f)) + } + checkAnswer(df, Seq(Row("0"), Row("1"))) } } From 1e3118c2ee0fe7d2c59cb3e2055709bb2809a6db Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Mon, 12 Feb 2018 22:07:59 +0800 Subject: [PATCH 277/566] [SPARK-22977][SQL] fix web UI SQL tab for CTAS ## What changes were proposed in this pull request? This is a regression in Spark 2.3. In Spark 2.2, we have a fragile UI support for SQL data writing commands. We only track the input query plan of `FileFormatWriter` and display its metrics. 
This is not ideal because we don't know who triggered the writing(can be table insertion, CTAS, etc.), but it's still useful to see the metrics of the input query. In Spark 2.3, we introduced a new mechanism: `DataWritigCommand`, to fix the UI issue entirely. Now these writing commands have real children, and we don't need to hack into the `FileFormatWriter` for the UI. This also helps with `explain`, now `explain` can show the physical plan of the input query, while in 2.2 the physical writing plan is simply `ExecutedCommandExec` and it has no child. However there is a regression in CTAS. CTAS commands don't extend `DataWritigCommand`, and we don't have the UI hack in `FileFormatWriter` anymore, so the UI for CTAS is just an empty node. See https://issues.apache.org/jira/browse/SPARK-22977 for more information about this UI issue. To fix it, we should apply the `DataWritigCommand` mechanism to CTAS commands. TODO: In the future, we should refactor this part and create some physical layer code pieces for data writing, and reuse them in different writing commands. We should have different logical nodes for different operators, even some of them share some same logic, e.g. CTAS, CREATE TABLE, INSERT TABLE. Internally we can share the same physical logic. ## How was this patch tested? manually tested. For data source table 1 For hive table 2 Author: Wenchen Fan Closes #20521 from cloud-fan/UI. (cherry picked from commit 0e2c266de7189473177f45aa68ea6a45c7e47ec3) Signed-off-by: Wenchen Fan --- .../command/createDataSourceTables.scala | 21 +++---- .../execution/datasources/DataSource.scala | 44 +++++++++++++-- .../datasources/DataSourceStrategy.scala | 2 +- .../spark/sql/hive/HiveStrategies.scala | 2 +- .../CreateHiveTableAsSelectCommand.scala | 55 ++++++++++--------- .../sql/hive/execution/HiveExplainSuite.scala | 26 --------- 6 files changed, 80 insertions(+), 70 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala index 306f43dc4214a..e9747769dfcfc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala @@ -21,7 +21,9 @@ import java.net.URI import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog._ +import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types.StructType @@ -136,12 +138,11 @@ case class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo case class CreateDataSourceTableAsSelectCommand( table: CatalogTable, mode: SaveMode, - query: LogicalPlan) - extends RunnableCommand { - - override protected def innerChildren: Seq[LogicalPlan] = Seq(query) + query: LogicalPlan, + outputColumns: Seq[Attribute]) + extends DataWritingCommand { - override def run(sparkSession: SparkSession): Seq[Row] = { + override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { assert(table.tableType != CatalogTableType.VIEW) assert(table.provider.isDefined) @@ -163,7 +164,7 @@ case class CreateDataSourceTableAsSelectCommand( } saveDataIntoTable( - sparkSession, table, table.storage.locationUri, query, 
SaveMode.Append, tableExists = true) + sparkSession, table, table.storage.locationUri, child, SaveMode.Append, tableExists = true) } else { assert(table.schema.isEmpty) @@ -173,7 +174,7 @@ case class CreateDataSourceTableAsSelectCommand( table.storage.locationUri } val result = saveDataIntoTable( - sparkSession, table, tableLocation, query, SaveMode.Overwrite, tableExists = false) + sparkSession, table, tableLocation, child, SaveMode.Overwrite, tableExists = false) val newTable = table.copy( storage = table.storage.copy(locationUri = tableLocation), // We will use the schema of resolved.relation as the schema of the table (instead of @@ -198,10 +199,10 @@ case class CreateDataSourceTableAsSelectCommand( session: SparkSession, table: CatalogTable, tableLocation: Option[URI], - data: LogicalPlan, + physicalPlan: SparkPlan, mode: SaveMode, tableExists: Boolean): BaseRelation = { - // Create the relation based on the input logical plan: `data`. + // Create the relation based on the input logical plan: `query`. val pathOption = tableLocation.map("path" -> CatalogUtils.URIToString(_)) val dataSource = DataSource( session, @@ -212,7 +213,7 @@ case class CreateDataSourceTableAsSelectCommand( catalogTable = if (tableExists) Some(table) else None) try { - dataSource.writeAndRead(mode, query) + dataSource.writeAndRead(mode, query, outputColumns, physicalPlan) } catch { case ex: AnalysisException => logError(s"Failed to write to table ${table.identifier.unquotedString}", ex) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 25e1210504273..6e1b5727e3fd5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -31,8 +31,10 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogUtils} +import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat import org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider import org.apache.spark.sql.execution.datasources.json.JsonFileFormat @@ -435,10 +437,11 @@ case class DataSource( } /** - * Writes the given [[LogicalPlan]] out in this [[FileFormat]]. + * Creates a command node to write the given [[LogicalPlan]] out to the given [[FileFormat]]. + * The returned command is unresolved and need to be analyzed. */ private def planForWritingFileFormat( - format: FileFormat, mode: SaveMode, data: LogicalPlan): LogicalPlan = { + format: FileFormat, mode: SaveMode, data: LogicalPlan): InsertIntoHadoopFsRelationCommand = { // Don't glob path for the write path. The contracts here are: // 1. Only one output path can be specified on the write path; // 2. Output path must be a legal HDFS style file system path; @@ -482,9 +485,24 @@ case class DataSource( /** * Writes the given [[LogicalPlan]] out to this [[DataSource]] and returns a [[BaseRelation]] for * the following reading. + * + * @param mode The save mode for this writing. 
+ * @param data The input query plan that produces the data to be written. Note that this plan + * is analyzed and optimized. + * @param outputColumns The original output columns of the input query plan. The optimizer may not + * preserve the output column's names' case, so we need this parameter + * instead of `data.output`. + * @param physicalPlan The physical plan of the input query plan. We should run the writing + * command with this physical plan instead of creating a new physical plan, + * so that the metrics can be correctly linked to the given physical plan and + * shown in the web UI. */ - def writeAndRead(mode: SaveMode, data: LogicalPlan): BaseRelation = { - if (data.schema.map(_.dataType).exists(_.isInstanceOf[CalendarIntervalType])) { + def writeAndRead( + mode: SaveMode, + data: LogicalPlan, + outputColumns: Seq[Attribute], + physicalPlan: SparkPlan): BaseRelation = { + if (outputColumns.map(_.dataType).exists(_.isInstanceOf[CalendarIntervalType])) { throw new AnalysisException("Cannot save interval data type into external storage.") } @@ -493,9 +511,23 @@ case class DataSource( dataSource.createRelation( sparkSession.sqlContext, mode, caseInsensitiveOptions, Dataset.ofRows(sparkSession, data)) case format: FileFormat => - sparkSession.sessionState.executePlan(planForWritingFileFormat(format, mode, data)).toRdd + val cmd = planForWritingFileFormat(format, mode, data) + val resolvedPartCols = cmd.partitionColumns.map { col => + // The partition columns created in `planForWritingFileFormat` should always be + // `UnresolvedAttribute` with a single name part. + assert(col.isInstanceOf[UnresolvedAttribute]) + val unresolved = col.asInstanceOf[UnresolvedAttribute] + assert(unresolved.nameParts.length == 1) + val name = unresolved.nameParts.head + outputColumns.find(a => equality(a.name, name)).getOrElse { + throw new AnalysisException( + s"Unable to resolve $name given [${data.output.map(_.name).mkString(", ")}]") + } + } + val resolved = cmd.copy(partitionColumns = resolvedPartCols, outputColumns = outputColumns) + resolved.run(sparkSession, physicalPlan) // Replace the schema with that of the DataFrame we just wrote out to avoid re-inferring - copy(userSpecifiedSchema = Some(data.schema.asNullable)).resolveRelation() + copy(userSpecifiedSchema = Some(outputColumns.toStructType.asNullable)).resolveRelation() case _ => sys.error(s"${providingClass.getCanonicalName} does not allow create table as select.") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index d94c5bbccdd84..3f41612c08065 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -139,7 +139,7 @@ case class DataSourceAnalysis(conf: SQLConf) extends Rule[LogicalPlan] with Cast case CreateTable(tableDesc, mode, Some(query)) if query.resolved && DDLUtils.isDatasourceTable(tableDesc) => DDLUtils.checkDataColNames(tableDesc.copy(schema = query.schema)) - CreateDataSourceTableAsSelectCommand(tableDesc, mode, query) + CreateDataSourceTableAsSelectCommand(tableDesc, mode, query, query.output) case InsertIntoTable(l @ LogicalRelation(_: InsertableRelation, _, _, _), parts, query, overwrite, false) if parts.isEmpty => diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index ab857b9055720..8df05cbb20361 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -157,7 +157,7 @@ object HiveAnalysis extends Rule[LogicalPlan] { case CreateTable(tableDesc, mode, Some(query)) if DDLUtils.isHiveTable(tableDesc) => DDLUtils.checkDataColNames(tableDesc) - CreateHiveTableAsSelectCommand(tableDesc, query, mode) + CreateHiveTableAsSelectCommand(tableDesc, query, query.output, mode) case InsertIntoDir(isLocal, storage, provider, child, overwrite) if DDLUtils.isHiveTable(provider) => diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala index 65e8b4e3c725c..1e801fe1845c4 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala @@ -20,10 +20,11 @@ package org.apache.spark.sql.hive.execution import scala.util.control.NonFatal import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession} -import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.catalog.CatalogTable -import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan} -import org.apache.spark.sql.execution.command.RunnableCommand +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.command.DataWritingCommand /** @@ -36,15 +37,15 @@ import org.apache.spark.sql.execution.command.RunnableCommand case class CreateHiveTableAsSelectCommand( tableDesc: CatalogTable, query: LogicalPlan, + outputColumns: Seq[Attribute], mode: SaveMode) - extends RunnableCommand { + extends DataWritingCommand { private val tableIdentifier = tableDesc.identifier - override def innerChildren: Seq[LogicalPlan] = Seq(query) - - override def run(sparkSession: SparkSession): Seq[Row] = { - if (sparkSession.sessionState.catalog.tableExists(tableIdentifier)) { + override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { + val catalog = sparkSession.sessionState.catalog + if (catalog.tableExists(tableIdentifier)) { assert(mode != SaveMode.Overwrite, s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite") @@ -56,34 +57,36 @@ case class CreateHiveTableAsSelectCommand( return Seq.empty } - sparkSession.sessionState.executePlan( - InsertIntoTable( - UnresolvedRelation(tableIdentifier), - Map(), - query, - overwrite = false, - ifPartitionNotExists = false)).toRdd + InsertIntoHiveTable( + tableDesc, + Map.empty, + query, + overwrite = false, + ifPartitionNotExists = false, + outputColumns = outputColumns).run(sparkSession, child) } else { // TODO ideally, we should get the output data ready first and then // add the relation into catalog, just in case of failure occurs while data // processing. 
assert(tableDesc.schema.isEmpty) - sparkSession.sessionState.catalog.createTable( - tableDesc.copy(schema = query.schema), ignoreIfExists = false) + catalog.createTable(tableDesc.copy(schema = query.schema), ignoreIfExists = false) try { - sparkSession.sessionState.executePlan( - InsertIntoTable( - UnresolvedRelation(tableIdentifier), - Map(), - query, - overwrite = true, - ifPartitionNotExists = false)).toRdd + // Read back the metadata of the table which was created just now. + val createdTableMeta = catalog.getTableMetadata(tableDesc.identifier) + // For CTAS, there is no static partition values to insert. + val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap + InsertIntoHiveTable( + createdTableMeta, + partition, + query, + overwrite = true, + ifPartitionNotExists = false, + outputColumns = outputColumns).run(sparkSession, child) } catch { case NonFatal(e) => // drop the created table. - sparkSession.sessionState.catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, - purge = false) + catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false) throw e } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala index f84d188075b72..5d56f89c2271c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala @@ -128,32 +128,6 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto "src") } - test("SPARK-17409: The EXPLAIN output of CTAS only shows the analyzed plan") { - withTempView("jt") { - val ds = (1 to 10).map(i => s"""{"a":$i, "b":"str$i"}""").toDS() - spark.read.json(ds).createOrReplaceTempView("jt") - val outputs = sql( - s""" - |EXPLAIN EXTENDED - |CREATE TABLE t1 - |AS - |SELECT * FROM jt - """.stripMargin).collect().map(_.mkString).mkString - - val shouldContain = - "== Parsed Logical Plan ==" :: "== Analyzed Logical Plan ==" :: "Subquery" :: - "== Optimized Logical Plan ==" :: "== Physical Plan ==" :: - "CreateHiveTableAsSelect" :: "InsertIntoHiveTable" :: "jt" :: Nil - for (key <- shouldContain) { - assert(outputs.contains(key), s"$key doesn't exist in result") - } - - val physicalIndex = outputs.indexOf("== Physical Plan ==") - assert(outputs.substring(physicalIndex).contains("Subquery"), - "Physical Plan should contain SubqueryAlias since the query should not be optimized") - } - } - test("explain output of physical plan should contain proper codegen stage ID") { checkKeywordsExist(sql( """ From d31c4ae7ba734356c849347b9a7b448da9a5a9a1 Mon Sep 17 00:00:00 2001 From: liuxian Date: Mon, 12 Feb 2018 08:49:45 -0600 Subject: [PATCH 278/566] [SPARK-23391][CORE] It may lead to overflow for some integer multiplication ## What changes were proposed in this pull request? In the `getBlockData`,`blockId.reduceId` is the `Int` type, when it is greater than 2^28, `blockId.reduceId*8` will overflow In the `decompress0`, `len` and `unitSize` are Int type, so `len * unitSize` may lead to overflow ## How was this patch tested? N/A Author: liuxian Closes #20581 from 10110346/overflow2. 
(cherry picked from commit 4a4dd4f36f65410ef5c87f7b61a960373f044e61) Signed-off-by: Sean Owen --- .../org/apache/spark/shuffle/IndexShuffleBlockResolver.scala | 4 ++-- .../execution/columnar/compression/compressionSchemes.scala | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala index 2414b94dcf379..449f60273b42b 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala @@ -203,13 +203,13 @@ private[spark] class IndexShuffleBlockResolver( // class of issue from re-occurring in the future which is why they are left here even though // SPARK-22982 is fixed. val channel = Files.newByteChannel(indexFile.toPath) - channel.position(blockId.reduceId * 8) + channel.position(blockId.reduceId * 8L) val in = new DataInputStream(Channels.newInputStream(channel)) try { val offset = in.readLong() val nextOffset = in.readLong() val actualPosition = channel.position() - val expectedPosition = blockId.reduceId * 8 + 16 + val expectedPosition = blockId.reduceId * 8L + 16 if (actualPosition != expectedPosition) { throw new Exception(s"SPARK-22982: Incorrect channel position after index file reads: " + s"expected $expectedPosition but actual position was $actualPosition.") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala index 79dcf3a6105ce..00a1d54b41709 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala @@ -116,7 +116,7 @@ private[columnar] case object PassThrough extends CompressionScheme { while (pos < capacity) { if (pos != nextNullIndex) { val len = nextNullIndex - pos - assert(len * unitSize < Int.MaxValue) + assert(len * unitSize.toLong < Int.MaxValue) putFunction(columnVector, pos, bufferPos, len) bufferPos += len * unitSize pos += len From 89f6fcbafcfb0a7aeb897fba6036cb085bd35121 Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Mon, 12 Feb 2018 11:08:28 -0800 Subject: [PATCH 279/566] Preparing Spark release v2.3.0-rc3 --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- 
resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 29a8a0062873f..6d46c31906260 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.1 +Version: 2.3.0 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index 5c5a8e9b8232e..2ca9ab66462a8 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 2a625da682cdc..404c744f98261 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index adb1890439094..3c0b5288862fa 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 4cdcfa2a4da99..fe3bcfd68f988 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 494137f8e9ec1..90ca4016ecc79 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 32bebf5eeca99..8bb2c38f7d097 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 8dc4ddeea3ec0..0d5d4b1661012 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 5613f5e81384c..d6136e6c29a84 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index c34c9df39063b..364aaab74e67b 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index 213579037f51a..aabf3151890f9 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.1-SNAPSHOT -SPARK_VERSION_SHORT: 2.3.1 +SPARK_VERSION: 2.3.0 +SPARK_VERSION_SHORT: 2.3.0 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index e584feed1b337..28de7a180c2e7 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 6c4031b392f65..9b74a4d327136 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 6935ade33546a..1da013353775c 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 60aa5fb1a1031..da8ce5928b8bb 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index d8ed3364c4bd4..65a9f9972d398 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 15d550d3bb445..f5a2dbbef3b6f 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 7ee7156efe22c..e9b8e4e18d6df 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 1f28798fd245a..46a5bd1611d30 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index ad3cd0059e183..72446ecb5cd82 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 53d903fc58aee..e49c8281c6ddb 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 4206a1ac4ad99..7ad92b07094f2 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 7e838812bdc2e..126624ecd8922 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index ee7eb98e3dfc8..a8bcff8cdc14c 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 2ea2e4a7adbdb..6e60fbac9ef5a 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 35ddfac1d6624..7e09e304cb1b7 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 29c1e53f480a3..d8c1792f5b0dd 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 61eb9dbcf6f7a..d9380ed2c6d4d 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index edba0a72ee6a4..9cb3ed2e7ae1d 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/pom.xml b/pom.xml index 9aa531ec65746..e31b5e692e641 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index b09fa9822184c..8065b058f8c0e 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.1.dev0" +__version__ = "2.3.0" diff --git a/repl/pom.xml b/repl/pom.xml index 3b445bfaf773d..c21a1cd108a8a 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index ae1c69f305589..544e08b333aaf 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index ea31b9c121433..3dc6955ce312c 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index d42bfff851572..7e1397ef9ba79 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 865d3b3f6ad4e..936267ce560e6 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index f39bffe417666..77d4cf7ec8459 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 6e04f528a1c2f..7ade122666a83 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index da50c9bd60fa6..da34cb8253272 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 6e8e034fc4de4..adbafac9bc458 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 9523085cf5513..4e005623e2b26 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml From 70be6038df38d5e80af8565120eedd8242c5a7c5 Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Mon, 12 Feb 2018 11:08:34 -0800 Subject: [PATCH 280/566] Preparing development version 2.3.1-SNAPSHOT --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- 
external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 6d46c31906260..29a8a0062873f 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.0 +Version: 2.3.1 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index 2ca9ab66462a8..5c5a8e9b8232e 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 404c744f98261..2a625da682cdc 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 3c0b5288862fa..adb1890439094 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index fe3bcfd68f988..4cdcfa2a4da99 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 90ca4016ecc79..494137f8e9ec1 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 8bb2c38f7d097..32bebf5eeca99 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 0d5d4b1661012..8dc4ddeea3ec0 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index d6136e6c29a84..5613f5e81384c 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 364aaab74e67b..c34c9df39063b 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index aabf3151890f9..213579037f51a 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.0 -SPARK_VERSION_SHORT: 2.3.0 +SPARK_VERSION: 2.3.1-SNAPSHOT +SPARK_VERSION_SHORT: 2.3.1 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 28de7a180c2e7..e584feed1b337 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 9b74a4d327136..6c4031b392f65 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 1da013353775c..6935ade33546a 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index da8ce5928b8bb..60aa5fb1a1031 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 65a9f9972d398..d8ed3364c4bd4 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index f5a2dbbef3b6f..15d550d3bb445 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index e9b8e4e18d6df..7ee7156efe22c 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 46a5bd1611d30..1f28798fd245a 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 72446ecb5cd82..ad3cd0059e183 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index e49c8281c6ddb..53d903fc58aee 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 7ad92b07094f2..4206a1ac4ad99 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 126624ecd8922..7e838812bdc2e 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index a8bcff8cdc14c..ee7eb98e3dfc8 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 6e60fbac9ef5a..2ea2e4a7adbdb 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 7e09e304cb1b7..35ddfac1d6624 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index d8c1792f5b0dd..29c1e53f480a3 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index d9380ed2c6d4d..61eb9dbcf6f7a 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 9cb3ed2e7ae1d..edba0a72ee6a4 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index e31b5e692e641..9aa531ec65746 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 8065b058f8c0e..b09fa9822184c 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.0" +__version__ = "2.3.1.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index c21a1cd108a8a..3b445bfaf773d 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 544e08b333aaf..ae1c69f305589 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 3dc6955ce312c..ea31b9c121433 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 7e1397ef9ba79..d42bfff851572 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 936267ce560e6..865d3b3f6ad4e 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 77d4cf7ec8459..f39bffe417666 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 7ade122666a83..6e04f528a1c2f 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index da34cb8253272..da50c9bd60fa6 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index adbafac9bc458..6e8e034fc4de4 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 4e005623e2b26..9523085cf5513 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml From 4e138207ebb11a08393c15e5e39f46a5dc1e7c66 Mon Sep 17 00:00:00 2001 From: James Thompson Date: Mon, 12 Feb 2018 11:34:56 -0800 Subject: [PATCH 281/566] [SPARK-23388][SQL] Support for Parquet Binary DecimalType in VectorizedColumnReader ## What changes were proposed in this pull request? Re-add support for parquet binary DecimalType in VectorizedColumnReader ## How was this patch tested? Existing test suite Author: James Thompson Closes #20580 from jamesthomp/jt/add-back-binary-decimal. 
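For context on which files are affected: Spark's own Parquet writer stores wide decimals as `FIXED_LEN_BYTE_ARRAY`, so the plain-`BINARY` decimal encoding handled here typically comes from files produced by other writers. A hypothetical repro sketch (the path, the column name, and the active `spark` session are assumptions, not taken from the patch):

```scala
// Assume /data/binary_decimal.parquet has a column `amount` whose Parquet physical type
// is BINARY annotated as DECIMAL(38,18). Before this patch the vectorized reader failed
// on such a column with an unsupported-type error; with the change it reads normally.
val df = spark.read.parquet("/data/binary_decimal.parquet")
df.selectExpr("sum(amount)").show()
```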
(cherry picked from commit 5bb11411aec18b8d623e54caba5397d7cb8e89f0) Signed-off-by: gatorsmile --- .../execution/datasources/parquet/VectorizedColumnReader.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index c120863152a96..47dd625f4b154 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -444,7 +444,8 @@ private void readBinaryBatch(int rowId, int num, WritableColumnVector column) { // This is where we implement support for the valid type conversions. // TODO: implement remaining type conversions VectorizedValuesReader data = (VectorizedValuesReader) dataColumn; - if (column.dataType() == DataTypes.StringType || column.dataType() == DataTypes.BinaryType) { + if (column.dataType() == DataTypes.StringType || column.dataType() == DataTypes.BinaryType + || DecimalType.isByteArrayDecimalType(column.dataType())) { defColumn.readBinarys(num, column, rowId, maxDefLevel, data); } else if (column.dataType() == DataTypes.TimestampType) { if (!shouldConvertTimestamps()) { From 9632c461e6931a1a4d05684d0f62ee36f9e90b77 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Mon, 12 Feb 2018 12:20:29 -0800 Subject: [PATCH 282/566] [SPARK-22002][SQL][FOLLOWUP][TEST] Add a test to check if the original schema doesn't have metadata. ## What changes were proposed in this pull request? This is a follow-up pr of #19231 which modified the behavior to remove metadata from JDBC table schema. This pr adds a test to check if the schema doesn't have metadata. ## How was this patch tested? Added a test and existing tests. Author: Takuya UESHIN Closes #20585 from ueshin/issues/SPARK-22002/fup1. 
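A compact sketch of the invariant the new test pins down (the JDBC URL and table name below are placeholders, and `spark` is an assumed active session): every field of a JDBC-sourced schema should carry empty metadata.

```scala
import org.apache.spark.sql.types.Metadata

val df = spark.read.format("jdbc")
  .option("url", "jdbc:h2:mem:testdb;DB_CLOSE_DELAY=-1")  // placeholder connection string
  .option("dbtable", "TEST.PEOPLE")                       // placeholder table name
  .load()

// After SPARK-22002 the reported schema should not embed JDBC-specific metadata.
assert(df.schema.fields.forall(_.metadata == Metadata.empty))
```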
(cherry picked from commit 0c66fe4f22f8af4932893134bb0fd56f00fabeae) Signed-off-by: gatorsmile --- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index cb2df0ac54f4c..5238adce4a699 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -1168,4 +1168,26 @@ class JDBCSuite extends SparkFunSuite val df3 = sql("SELECT * FROM test_sessionInitStatement") assert(df3.collect() === Array(Row(21519, 1234))) } + + test("jdbc data source shouldn't have unnecessary metadata in its schema") { + val schema = StructType(Seq( + StructField("NAME", StringType, true), StructField("THEID", IntegerType, true))) + + val df = spark.read.format("jdbc") + .option("Url", urlWithUserAndPass) + .option("DbTaBle", "TEST.PEOPLE") + .load() + assert(df.schema === schema) + + withTempView("people_view") { + sql( + s""" + |CREATE TEMPORARY VIEW people_view + |USING org.apache.spark.sql.jdbc + |OPTIONS (uRl '$url', DbTaBlE 'TEST.PEOPLE', User 'testUser', PassWord 'testPass') + """.stripMargin.replaceAll("\n", " ")) + + assert(sql("select * from people_view").schema === schema) + } + } } From 2b80571e215d56d15c59f0fc5db053569a79efae Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 12 Feb 2018 15:26:37 -0800 Subject: [PATCH 283/566] [SPARK-23313][DOC] Add a migration guide for ORC ## What changes were proposed in this pull request? This PR adds a migration guide documentation for ORC. ![orc-guide](https://user-images.githubusercontent.com/9700541/36123859-ec165cae-1002-11e8-90b7-7313be7a81a5.png) ## How was this patch tested? N/A. Author: Dongjoon Hyun Closes #20484 from dongjoon-hyun/SPARK-23313. (cherry picked from commit 6cb59708c70c03696c772fbb5d158eed57fe67d4) Signed-off-by: gatorsmile --- docs/sql-programming-guide.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index eab4030ee25d2..dcef6e58b4ba1 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1776,6 +1776,35 @@ working with timestamps in `pandas_udf`s to get the best performance, see ## Upgrading From Spark SQL 2.2 to 2.3 + - Since Spark 2.3, Spark supports a vectorized ORC reader with a new ORC file format for ORC files. To do that, the following configurations are newly added or change their default values. The vectorized reader is used for the native ORC tables (e.g., the ones created using the clause `USING ORC`) when `spark.sql.orc.impl` is set to `native` and `spark.sql.orc.enableVectorizedReader` is set to `true`. For the Hive ORC serde table (e.g., the ones created using the clause `USING HIVE OPTIONS (fileFormat 'ORC')`), the vectorized reader is used when `spark.sql.hive.convertMetastoreOrc` is set to `true`. + + - New configurations + +
+    <table class="table">
+      <tr><th><b>Property Name</b></th><th><b>Default</b></th><th><b>Meaning</b></th></tr>
+      <tr>
+        <td><code>spark.sql.orc.impl</code></td>
+        <td><code>native</code></td>
+        <td>The name of ORC implementation. It can be one of <code>native</code> and <code>hive</code>. <code>native</code> means the native ORC support that is built on Apache ORC 1.4.1. <code>hive</code> means the ORC library in Hive 1.2.1 which is used prior to Spark 2.3.</td>
+      </tr>
+      <tr>
+        <td><code>spark.sql.orc.enableVectorizedReader</code></td>
+        <td><code>true</code></td>
+        <td>Enables vectorized orc decoding in <code>native</code> implementation. If <code>false</code>, a new non-vectorized ORC reader is used in <code>native</code> implementation. For <code>hive</code> implementation, this is ignored.</td>
+      </tr>
+    </table>
+
+  - Changed configurations
+
+    <table class="table">
+      <tr><th><b>Property Name</b></th><th><b>Default</b></th><th><b>Meaning</b></th></tr>
+      <tr>
+        <td><code>spark.sql.orc.filterPushdown</code></td>
+        <td><code>true</code></td>
+        <td>Enables filter pushdown for ORC files. It is <code>false</code> by default prior to Spark 2.3.</td>
+      </tr>
+    </table>
+ - Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the referenced columns only include the internal corrupt record column (named `_corrupt_record` by default). For example, `spark.read.schema(schema).json(file).filter($"_corrupt_record".isNotNull).count()` and `spark.read.schema(schema).json(file).select("_corrupt_record").show()`. Instead, you can cache or save the parsed results and then send the same query. For example, `val df = spark.read.schema(schema).json(file).cache()` and then `df.filter($"_corrupt_record".isNotNull).count()`. - The `percentile_approx` function previously accepted numeric type input and output double type results. Now it supports date type, timestamp type and numeric types as input types. The result type is also changed to be the same as the input type, which is more reasonable for percentiles. - Since Spark 2.3, the Join/Filter's deterministic predicates that are after the first non-deterministic predicates are also pushed down/through the child operators, if possible. In prior Spark versions, these filters are not eligible for predicate pushdown. From befb22de81aad41673eec9dba7585b80c6cb2564 Mon Sep 17 00:00:00 2001 From: sychen Date: Mon, 12 Feb 2018 16:00:47 -0800 Subject: [PATCH 284/566] [SPARK-23230][SQL] When hive.default.fileformat is other kinds of file types, create textfile table cause a serde error When hive.default.fileformat is other kinds of file types, create textfile table cause a serde error. We should take the default type of textfile and sequencefile both as org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe. ``` set hive.default.fileformat=orc; create table tbl( i string ) stored as textfile; desc formatted tbl; Serde Library org.apache.hadoop.hive.ql.io.orc.OrcSerde InputFormat org.apache.hadoop.mapred.TextInputFormat OutputFormat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat ``` Author: sychen Closes #20406 from cxzl25/default_serde. 
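The same check can be scripted against a Hive-enabled `SparkSession` (assumed here and named `spark`); after this change the `Serde Library` row for the text table should report `LazySimpleSerDe` rather than the ORC serde inherited from `hive.default.fileformat`. A sketch:

```scala
// Only the default *file format* should follow hive.default.fileformat; a table that
// explicitly says STORED AS TEXTFILE must still default to LazySimpleSerDe.
spark.sql("SET hive.default.fileformat=orc")
spark.sql("CREATE TABLE tbl(i string) STORED AS textfile")
// Expect: Serde Library = org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
spark.sql("DESC FORMATTED tbl").show(100, truncate = false)
```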
(cherry picked from commit 4104b68e958cd13975567a96541dac7cccd8195c) Signed-off-by: gatorsmile --- .../apache/spark/sql/internal/HiveSerDe.scala | 6 ++++-- .../sql/hive/execution/HiveSerDeSuite.scala | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala index dac463641cfab..eca612f06f9bb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala @@ -31,7 +31,8 @@ object HiveSerDe { "sequencefile" -> HiveSerDe( inputFormat = Option("org.apache.hadoop.mapred.SequenceFileInputFormat"), - outputFormat = Option("org.apache.hadoop.mapred.SequenceFileOutputFormat")), + outputFormat = Option("org.apache.hadoop.mapred.SequenceFileOutputFormat"), + serde = Option("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")), "rcfile" -> HiveSerDe( @@ -54,7 +55,8 @@ object HiveSerDe { "textfile" -> HiveSerDe( inputFormat = Option("org.apache.hadoop.mapred.TextInputFormat"), - outputFormat = Option("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")), + outputFormat = Option("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"), + serde = Option("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")), "avro" -> HiveSerDe( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala index 1c9f00141ae1d..d7752e987cb4b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala @@ -100,6 +100,25 @@ class HiveSerDeSuite extends HiveComparisonTest with PlanTest with BeforeAndAfte assert(output == Some("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")) assert(serde == Some("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")) } + + withSQLConf("hive.default.fileformat" -> "orc") { + val (desc, exists) = extractTableDesc( + "CREATE TABLE IF NOT EXISTS fileformat_test (id int) STORED AS textfile") + assert(exists) + assert(desc.storage.inputFormat == Some("org.apache.hadoop.mapred.TextInputFormat")) + assert(desc.storage.outputFormat == + Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")) + assert(desc.storage.serde == Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) + } + + withSQLConf("hive.default.fileformat" -> "orc") { + val (desc, exists) = extractTableDesc( + "CREATE TABLE IF NOT EXISTS fileformat_test (id int) STORED AS sequencefile") + assert(exists) + assert(desc.storage.inputFormat == Some("org.apache.hadoop.mapred.SequenceFileInputFormat")) + assert(desc.storage.outputFormat == Some("org.apache.hadoop.mapred.SequenceFileOutputFormat")) + assert(desc.storage.serde == Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) + } } test("create hive serde table with new syntax - basic") { From 43f5e40679f771326b2ee72f14cf1ab0ed2ad692 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 12 Feb 2018 16:47:28 -0800 Subject: [PATCH 285/566] [SPARK-23352][PYTHON][BRANCH-2.3] Explicitly specify supported types in Pandas UDFs ## What changes were proposed in this pull request? This PR backports https://github.com/apache/spark/pull/20531: It explicitly specifies supported types in Pandas UDFs. 
The main change here is to add a deduplicated and explicit type checking in `returnType` ahead with documenting this; however, it happened to fix multiple things. 1. Currently, we don't support `BinaryType` in Pandas UDFs, for example, see: ```python from pyspark.sql.functions import pandas_udf pudf = pandas_udf(lambda x: x, "binary") df = spark.createDataFrame([[bytearray(1)]]) df.select(pudf("_1")).show() ``` ``` ... TypeError: Unsupported type in conversion to Arrow: BinaryType ``` We can document this behaviour for its guide. 2. Since we can check the return type ahead, we can fail fast before actual execution. ```python # we can fail fast at this stage because we know the schema ahead pandas_udf(lambda x: x, BinaryType()) ``` ## How was this patch tested? Manually tested and unit tests for `BinaryType` and `ArrayType(...)` were added. Author: hyukjinkwon Closes #20588 from HyukjinKwon/PR_TOOL_PICK_PR_20531_BRANCH-2.3. --- docs/sql-programming-guide.md | 4 +- python/pyspark/sql/tests.py | 86 +++++++++++-------- python/pyspark/sql/types.py | 4 + python/pyspark/sql/udf.py | 25 ++++-- .../apache/spark/sql/internal/SQLConf.scala | 2 +- 5 files changed, 77 insertions(+), 44 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index dcef6e58b4ba1..0f9f01e18682f 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1676,7 +1676,7 @@ Using the above optimizations with Arrow will produce the same results as when A enabled. Note that even with Arrow, `toPandas()` results in the collection of all records in the DataFrame to the driver program and should be done on a small subset of the data. Not all Spark data types are currently supported and an error can be raised if a column has an unsupported type, -see [Supported Types](#supported-sql-arrow-types). If an error occurs during `createDataFrame()`, +see [Supported SQL Types](#supported-sql-arrow-types). If an error occurs during `createDataFrame()`, Spark will fall back to create the DataFrame without Arrow. ## Pandas UDFs (a.k.a. Vectorized UDFs) @@ -1734,7 +1734,7 @@ For detailed usage, please see [`pyspark.sql.functions.pandas_udf`](api/python/p ### Supported SQL Types -Currently, all Spark SQL data types are supported by Arrow-based conversion except `MapType`, +Currently, all Spark SQL data types are supported by Arrow-based conversion except `BinaryType`, `MapType`, `ArrayType` of `TimestampType`, and nested `StructType`. 
### Setting Arrow Batch Size diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 5480144f33c9e..904fa7abd7d5c 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3736,10 +3736,10 @@ def foo(x): self.assertEqual(foo.returnType, schema) self.assertEqual(foo.evalType, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) - @pandas_udf(returnType='v double', functionType=PandasUDFType.SCALAR) + @pandas_udf(returnType='double', functionType=PandasUDFType.SCALAR) def foo(x): return x - self.assertEqual(foo.returnType, schema) + self.assertEqual(foo.returnType, DoubleType()) self.assertEqual(foo.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) @pandas_udf(returnType=schema, functionType=PandasUDFType.GROUPED_MAP) @@ -3776,7 +3776,7 @@ def zero_with_type(): @pandas_udf(returnType=PandasUDFType.GROUPED_MAP) def foo(df): return df - with self.assertRaisesRegexp(ValueError, 'Invalid returnType'): + with self.assertRaisesRegexp(TypeError, 'Invalid returnType'): @pandas_udf(returnType='double', functionType=PandasUDFType.GROUPED_MAP) def foo(df): return df @@ -3825,7 +3825,7 @@ def random_udf(v): return random_udf def test_vectorized_udf_basic(self): - from pyspark.sql.functions import pandas_udf, col + from pyspark.sql.functions import pandas_udf, col, array df = self.spark.range(10).select( col('id').cast('string').alias('str'), col('id').cast('int').alias('int'), @@ -3833,7 +3833,8 @@ def test_vectorized_udf_basic(self): col('id').cast('float').alias('float'), col('id').cast('double').alias('double'), col('id').cast('decimal').alias('decimal'), - col('id').cast('boolean').alias('bool')) + col('id').cast('boolean').alias('bool'), + array(col('id')).alias('array_long')) f = lambda x: x str_f = pandas_udf(f, StringType()) int_f = pandas_udf(f, IntegerType()) @@ -3842,10 +3843,11 @@ def test_vectorized_udf_basic(self): double_f = pandas_udf(f, DoubleType()) decimal_f = pandas_udf(f, DecimalType()) bool_f = pandas_udf(f, BooleanType()) + array_long_f = pandas_udf(f, ArrayType(LongType())) res = df.select(str_f(col('str')), int_f(col('int')), long_f(col('long')), float_f(col('float')), double_f(col('double')), decimal_f('decimal'), - bool_f(col('bool'))) + bool_f(col('bool')), array_long_f('array_long')) self.assertEquals(df.collect(), res.collect()) def test_register_nondeterministic_vectorized_udf_basic(self): @@ -4050,10 +4052,11 @@ def test_vectorized_udf_chained(self): def test_vectorized_udf_wrong_return_type(self): from pyspark.sql.functions import pandas_udf, col df = self.spark.range(10) - f = pandas_udf(lambda x: x * 1.0, MapType(LongType(), LongType())) with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 'Unsupported.*type.*conversion'): - df.select(f(col('id'))).collect() + with self.assertRaisesRegexp( + NotImplementedError, + 'Invalid returnType.*scalar Pandas UDF.*MapType'): + pandas_udf(lambda x: x * 1.0, MapType(LongType(), LongType())) def test_vectorized_udf_return_scalar(self): from pyspark.sql.functions import pandas_udf, col @@ -4088,13 +4091,18 @@ def test_vectorized_udf_varargs(self): self.assertEquals(df.collect(), res.collect()) def test_vectorized_udf_unsupported_types(self): - from pyspark.sql.functions import pandas_udf, col - schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)]) - df = self.spark.createDataFrame([(None,)], schema=schema) - f = pandas_udf(lambda x: x, MapType(StringType(), IntegerType())) + from pyspark.sql.functions import pandas_udf with QuietTest(self.sc): - with 
self.assertRaisesRegexp(Exception, 'Unsupported data type'): - df.select(f(col('map'))).collect() + with self.assertRaisesRegexp( + NotImplementedError, + 'Invalid returnType.*scalar Pandas UDF.*MapType'): + pandas_udf(lambda x: x, MapType(StringType(), IntegerType())) + + with QuietTest(self.sc): + with self.assertRaisesRegexp( + NotImplementedError, + 'Invalid returnType.*scalar Pandas UDF.*BinaryType'): + pandas_udf(lambda x: x, BinaryType()) def test_vectorized_udf_dates(self): from pyspark.sql.functions import pandas_udf, col @@ -4325,15 +4333,16 @@ def data(self): .withColumn("vs", array([lit(i) for i in range(20, 30)])) \ .withColumn("v", explode(col('vs'))).drop('vs') - def test_simple(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType - df = self.data + def test_supported_types(self): + from pyspark.sql.functions import pandas_udf, PandasUDFType, array, col + df = self.data.withColumn("arr", array(col("id"))) foo_udf = pandas_udf( lambda pdf: pdf.assign(v1=pdf.v * pdf.id * 1.0, v2=pdf.v + pdf.id), StructType( [StructField('id', LongType()), StructField('v', IntegerType()), + StructField('arr', ArrayType(LongType())), StructField('v1', DoubleType()), StructField('v2', LongType())]), PandasUDFType.GROUPED_MAP @@ -4436,17 +4445,15 @@ def test_datatype_string(self): def test_wrong_return_type(self): from pyspark.sql.functions import pandas_udf, PandasUDFType - df = self.data - - foo = pandas_udf( - lambda pdf: pdf, - 'id long, v map', - PandasUDFType.GROUPED_MAP - ) with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 'Unsupported.*type.*conversion'): - df.groupby('id').apply(foo).sort('id').toPandas() + with self.assertRaisesRegexp( + NotImplementedError, + 'Invalid returnType.*grouped map Pandas UDF.*MapType'): + pandas_udf( + lambda pdf: pdf, + 'id long, v map', + PandasUDFType.GROUPED_MAP) def test_wrong_args(self): from pyspark.sql.functions import udf, pandas_udf, sum, PandasUDFType @@ -4465,23 +4472,30 @@ def test_wrong_args(self): df.groupby('id').apply( pandas_udf(lambda: 1, StructType([StructField("d", DoubleType())]))) with self.assertRaisesRegexp(ValueError, 'Invalid udf'): - df.groupby('id').apply( - pandas_udf(lambda x, y: x, StructType([StructField("d", DoubleType())]))) + df.groupby('id').apply(pandas_udf(lambda x, y: x, DoubleType())) with self.assertRaisesRegexp(ValueError, 'Invalid udf.*GROUPED_MAP'): df.groupby('id').apply( - pandas_udf(lambda x, y: x, StructType([StructField("d", DoubleType())]), - PandasUDFType.SCALAR)) + pandas_udf(lambda x, y: x, DoubleType(), PandasUDFType.SCALAR)) def test_unsupported_types(self): - from pyspark.sql.functions import pandas_udf, col, PandasUDFType + from pyspark.sql.functions import pandas_udf, PandasUDFType schema = StructType( [StructField("id", LongType(), True), StructField("map", MapType(StringType(), IntegerType()), True)]) - df = self.spark.createDataFrame([(1, None,)], schema=schema) - f = pandas_udf(lambda x: x, df.schema, PandasUDFType.GROUPED_MAP) with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 'Unsupported data type'): - df.groupby('id').apply(f).collect() + with self.assertRaisesRegexp( + NotImplementedError, + 'Invalid returnType.*grouped map Pandas UDF.*MapType'): + pandas_udf(lambda x: x, schema, PandasUDFType.GROUPED_MAP) + + schema = StructType( + [StructField("id", LongType(), True), + StructField("arr_ts", ArrayType(TimestampType()), True)]) + with QuietTest(self.sc): + with self.assertRaisesRegexp( + NotImplementedError, + 'Invalid returnType.*grouped map 
Pandas UDF.*ArrayType.*TimestampType'): + pandas_udf(lambda x: x, schema, PandasUDFType.GROUPED_MAP) # Regression test for SPARK-23314 def test_timestamp_dst(self): diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index f7141b4549e4e..e25941cd37595 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1638,6 +1638,8 @@ def to_arrow_type(dt): # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read arrow_type = pa.timestamp('us', tz='UTC') elif type(dt) == ArrayType: + if type(dt.elementType) == TimestampType: + raise TypeError("Unsupported type in conversion to Arrow: " + str(dt)) arrow_type = pa.list_(to_arrow_type(dt.elementType)) else: raise TypeError("Unsupported type in conversion to Arrow: " + str(dt)) @@ -1680,6 +1682,8 @@ def from_arrow_type(at): elif types.is_timestamp(at): spark_type = TimestampType() elif types.is_list(at): + if types.is_timestamp(at.value_type): + raise TypeError("Unsupported type in conversion from Arrow: " + str(at)) spark_type = ArrayType(from_arrow_type(at.value_type)) else: raise TypeError("Unsupported type in conversion from Arrow: " + str(at)) diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 5a848c26b5fff..2b16b7148bfae 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -22,7 +22,8 @@ from pyspark import SparkContext, since from pyspark.rdd import _prepare_for_python_RDD, PythonEvalType, ignore_unicode_prefix from pyspark.sql.column import Column, _to_java_column, _to_seq -from pyspark.sql.types import StringType, DataType, StructType, _parse_datatype_string +from pyspark.sql.types import StringType, DataType, StructType, _parse_datatype_string, \ + to_arrow_type, to_arrow_schema __all__ = ["UDFRegistration"] @@ -109,10 +110,24 @@ def returnType(self): else: self._returnType_placeholder = _parse_datatype_string(self._returnType) - if self.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF \ - and not isinstance(self._returnType_placeholder, StructType): - raise ValueError("Invalid returnType: returnType must be a StructType for " - "pandas_udf with function type GROUPED_MAP") + if self.evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF: + try: + to_arrow_type(self._returnType_placeholder) + except TypeError: + raise NotImplementedError( + "Invalid returnType with scalar Pandas UDFs: %s is " + "not supported" % str(self._returnType_placeholder)) + elif self.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: + if isinstance(self._returnType_placeholder, StructType): + try: + to_arrow_schema(self._returnType_placeholder) + except TypeError: + raise NotImplementedError( + "Invalid returnType with grouped map Pandas UDFs: " + "%s is not supported" % str(self._returnType_placeholder)) + else: + raise TypeError("Invalid returnType for grouped map Pandas " + "UDFs: returnType must be a StructType.") return self._returnType_placeholder diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index e498f5520f6b4..54d4f63472e27 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1052,7 +1052,7 @@ object SQLConf { "for use with pyspark.sql.DataFrame.toPandas, and " + "pyspark.sql.SparkSession.createDataFrame when its input is a Pandas DataFrame. 
" + "The following data types are unsupported: " + - "MapType, ArrayType of TimestampType, and nested StructType.") + "BinaryType, MapType, ArrayType of TimestampType, and nested StructType.") .booleanConf .createWithDefault(false) From 3737c3d32bb92e73cadaf3b1b9759d9be00b288d Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 13 Feb 2018 15:05:13 +0900 Subject: [PATCH 286/566] [SPARK-20090][FOLLOW-UP] Revert the deprecation of `names` in PySpark ## What changes were proposed in this pull request? Deprecating the field `name` in PySpark is not expected. This PR is to revert the change. ## How was this patch tested? N/A Author: gatorsmile Closes #20595 from gatorsmile/removeDeprecate. (cherry picked from commit 407f67249639709c40c46917700ed6dd736daa7d) Signed-off-by: hyukjinkwon --- python/pyspark/sql/types.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index e25941cd37595..cd857402db8f7 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -455,9 +455,6 @@ class StructType(DataType): Iterating a :class:`StructType` will iterate its :class:`StructField`\\s. A contained :class:`StructField` can be accessed by name or position. - .. note:: `names` attribute is deprecated in 2.3. Use `fieldNames` method instead - to get a list of field names. - >>> struct1 = StructType([StructField("f1", StringType(), True)]) >>> struct1["f1"] StructField(f1,StringType,true) From 1c81c0c626f115fbfe121ad6f6367b695e9f3b5f Mon Sep 17 00:00:00 2001 From: guoxiaolong Date: Tue, 13 Feb 2018 06:23:10 -0600 Subject: [PATCH 287/566] [SPARK-23384][WEB-UI] When it has no incomplete(completed) applications found, the last updated time is not formatted and client local time zone is not show in history server web ui. ## What changes were proposed in this pull request? When it has no incomplete(completed) applications found, the last updated time is not formatted and client local time zone is not show in history server web ui. It is a bug. fix before: ![1](https://user-images.githubusercontent.com/26266482/36070635-264d7cf0-0f3a-11e8-8426-14135ffedb16.png) fix after: ![2](https://user-images.githubusercontent.com/26266482/36070651-8ec3800e-0f3a-11e8-991c-6122cc9539fe.png) ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Please review http://spark.apache.org/contributing.html before opening a pull request. Author: guoxiaolong Closes #20573 from guoxiaolongzte/SPARK-23384. (cherry picked from commit 300c40f50ab4258d697f06a814d1491dc875c847) Signed-off-by: Sean Owen --- .../scala/org/apache/spark/deploy/history/HistoryPage.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala index 5d62a7d8bebb4..6fc12d721e6f1 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala @@ -37,7 +37,8 @@ private[history] class HistoryPage(parent: HistoryServer) extends WebUIPage("") val lastUpdatedTime = parent.getLastUpdatedTime() val providerConfig = parent.getProviderConfig() val content = - + ++ +
    @@ -65,7 +66,6 @@ private[history] class HistoryPage(parent: HistoryServer) extends WebUIPage("") if (allAppsSize > 0) { ++
    ++ - ++ ++ } else if (requestedIncomplete) { From dbb1b399b6cf8372a3659c472f380142146b1248 Mon Sep 17 00:00:00 2001 From: huangtengfei Date: Tue, 13 Feb 2018 09:59:21 -0600 Subject: [PATCH 288/566] [SPARK-23053][CORE] taskBinarySerialization and task partitions calculate in DagScheduler.submitMissingTasks should keep the same RDD checkpoint status ## What changes were proposed in this pull request? When we run concurrent jobs using the same rdd which is marked to do checkpoint. If one job has finished running the job, and start the process of RDD.doCheckpoint, while another job is submitted, then submitStage and submitMissingTasks will be called. In [submitMissingTasks](https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala#L961), will serialize taskBinaryBytes and calculate task partitions which are both affected by the status of checkpoint, if the former is calculated before doCheckpoint finished, while the latter is calculated after doCheckpoint finished, when run task, rdd.compute will be called, for some rdds with particular partition type such as [UnionRDD](https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala) who will do partition type cast, will get a ClassCastException because the part params is actually a CheckpointRDDPartition. This error occurs because rdd.doCheckpoint occurs in the same thread that called sc.runJob, while the task serialization occurs in the DAGSchedulers event loop. ## How was this patch tested? the exist uts and also add a test case in DAGScheduerSuite to show the exception case. Author: huangtengfei Closes #20244 from ivoson/branch-taskpart-mistype. (cherry picked from commit 091a000d27f324de8c5c527880854ecfcf5de9a4) Signed-off-by: Imran Rashid --- .../apache/spark/scheduler/DAGScheduler.scala | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 199937b8c27af..8c46a84323392 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -39,7 +39,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.internal.config import org.apache.spark.network.util.JavaUtils import org.apache.spark.partial.{ApproximateActionListener, ApproximateEvaluator, PartialResult} -import org.apache.spark.rdd.RDD +import org.apache.spark.rdd.{RDD, RDDCheckpointData} import org.apache.spark.rpc.RpcTimeout import org.apache.spark.storage._ import org.apache.spark.storage.BlockManagerMessages.BlockManagerHeartbeat @@ -1016,15 +1016,24 @@ class DAGScheduler( // might modify state of objects referenced in their closures. This is necessary in Hadoop // where the JobConf/Configuration object is not thread-safe. var taskBinary: Broadcast[Array[Byte]] = null + var partitions: Array[Partition] = null try { // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep). // For ResultTask, serialize and broadcast (rdd, func). 
- val taskBinaryBytes: Array[Byte] = stage match { - case stage: ShuffleMapStage => - JavaUtils.bufferToArray( - closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef)) - case stage: ResultStage => - JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef)) + var taskBinaryBytes: Array[Byte] = null + // taskBinaryBytes and partitions are both effected by the checkpoint status. We need + // this synchronization in case another concurrent job is checkpointing this RDD, so we get a + // consistent view of both variables. + RDDCheckpointData.synchronized { + taskBinaryBytes = stage match { + case stage: ShuffleMapStage => + JavaUtils.bufferToArray( + closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef)) + case stage: ResultStage => + JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef)) + } + + partitions = stage.rdd.partitions } taskBinary = sc.broadcast(taskBinaryBytes) @@ -1049,7 +1058,7 @@ class DAGScheduler( stage.pendingPartitions.clear() partitionsToCompute.map { id => val locs = taskIdToLocations(id) - val part = stage.rdd.partitions(id) + val part = partitions(id) stage.pendingPartitions += id new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber, taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId), @@ -1059,7 +1068,7 @@ class DAGScheduler( case stage: ResultStage => partitionsToCompute.map { id => val p: Int = stage.partitions(id) - val part = stage.rdd.partitions(p) + val part = partitions(p) val locs = taskIdToLocations(id) new ResultTask(stage.id, stage.latestInfo.attemptNumber, taskBinary, part, locs, id, properties, serializedTaskMetrics, From ab01ba718c7752b564e801a1ea546aedc2055dc0 Mon Sep 17 00:00:00 2001 From: Bogdan Raducanu Date: Tue, 13 Feb 2018 09:49:52 -0800 Subject: [PATCH 289/566] [SPARK-23316][SQL] AnalysisException after max iteration reached for IN query ## What changes were proposed in this pull request? Added flag ignoreNullability to DataType.equalsStructurally. The previous semantic is for ignoreNullability=false. When ignoreNullability=true equalsStructurally ignores nullability of contained types (map key types, value types, array element types, structure field types). In.checkInputTypes calls equalsStructurally to check if the children types match. They should match regardless of nullability (which is just a hint), so it is now called with ignoreNullability=true. ## How was this patch tested? New test in SubquerySuite Author: Bogdan Raducanu Closes #20548 from bogdanrdc/SPARK-23316. 
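The regression test added in the diff below shows the query shape that triggered the failure: the `null` literal on the subquery side makes its struct type nullable while the left-hand side's is not, so the IN type check must compare types with nullability ignored. A sketch against an assumed `spark` session:

```scala
// Before this fix, this query failed with an AnalysisException once the analyzer reached
// its maximum number of iterations (the value and subquery types differed only in
// nullability); with ignoreNullability = true the IN predicate resolves normally.
spark.range(10)
  .where("(id, id) in (select id, null from range(3))")
  .count()
```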
(cherry picked from commit 05d051293fe46938e9cb012342fea6e8a3715cd4) Signed-off-by: gatorsmile --- .../sql/catalyst/expressions/predicates.scala | 3 ++- .../org/apache/spark/sql/types/DataType.scala | 18 ++++++++++++------ .../org/apache/spark/sql/SubquerySuite.scala | 5 +++++ 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index b469f5cb7586a..a6d41ea7d00d4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -157,7 +157,8 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { require(list != null, "list should not be null") override def checkInputDataTypes(): TypeCheckResult = { - val mismatchOpt = list.find(l => !DataType.equalsStructurally(l.dataType, value.dataType)) + val mismatchOpt = list.find(l => !DataType.equalsStructurally(l.dataType, value.dataType, + ignoreNullability = true)) if (mismatchOpt.isDefined) { list match { case ListQuery(_, _, _, childOutputs) :: Nil => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala index d6e0df12218ad..0bef11659fc9e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala @@ -295,25 +295,31 @@ object DataType { } /** - * Returns true if the two data types share the same "shape", i.e. the types (including - * nullability) are the same, but the field names don't need to be the same. + * Returns true if the two data types share the same "shape", i.e. the types + * are the same, but the field names don't need to be the same. 
+ * + * @param ignoreNullability whether to ignore nullability when comparing the types */ - def equalsStructurally(from: DataType, to: DataType): Boolean = { + def equalsStructurally( + from: DataType, + to: DataType, + ignoreNullability: Boolean = false): Boolean = { (from, to) match { case (left: ArrayType, right: ArrayType) => equalsStructurally(left.elementType, right.elementType) && - left.containsNull == right.containsNull + (ignoreNullability || left.containsNull == right.containsNull) case (left: MapType, right: MapType) => equalsStructurally(left.keyType, right.keyType) && equalsStructurally(left.valueType, right.valueType) && - left.valueContainsNull == right.valueContainsNull + (ignoreNullability || left.valueContainsNull == right.valueContainsNull) case (StructType(fromFields), StructType(toFields)) => fromFields.length == toFields.length && fromFields.zip(toFields) .forall { case (l, r) => - equalsStructurally(l.dataType, r.dataType) && l.nullable == r.nullable + equalsStructurally(l.dataType, r.dataType) && + (ignoreNullability || l.nullable == r.nullable) } case (fromDataType, toDataType) => fromDataType == toDataType diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index 8673dc14f7597..31e8b0e8dede0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -950,4 +950,9 @@ class SubquerySuite extends QueryTest with SharedSQLContext { assert(join.duplicateResolved) assert(optimizedPlan.resolved) } + + test("SPARK-23316: AnalysisException after max iteration reached for IN query") { + // before the fix this would throw AnalysisException + spark.range(10).where("(id,id) in (select id, null from range(3))").count + } } From 320ffb1309571faedb271f2c769b4ab1ee1cd267 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Tue, 13 Feb 2018 11:18:45 -0800 Subject: [PATCH 290/566] [SPARK-23154][ML][DOC] Document backwards compatibility guarantees for ML persistence ## What changes were proposed in this pull request? Added documentation about what MLlib guarantees in terms of loading ML models and Pipelines from old Spark versions. Discussed & confirmed on linked JIRA. Author: Joseph K. Bradley Closes #20592 from jkbradley/SPARK-23154-backwards-compat-doc. (cherry picked from commit d58fe28836639e68e262812d911f167cb071007b) Signed-off-by: Joseph K. Bradley --- docs/ml-pipeline.md | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/docs/ml-pipeline.md b/docs/ml-pipeline.md index aa92c0a37c0f4..e22e9003c30f6 100644 --- a/docs/ml-pipeline.md +++ b/docs/ml-pipeline.md @@ -188,9 +188,36 @@ Parameters belong to specific instances of `Estimator`s and `Transformer`s. For example, if we have two `LogisticRegression` instances `lr1` and `lr2`, then we can build a `ParamMap` with both `maxIter` parameters specified: `ParamMap(lr1.maxIter -> 10, lr2.maxIter -> 20)`. This is useful if there are two algorithms with the `maxIter` parameter in a `Pipeline`. -## Saving and Loading Pipelines +## ML persistence: Saving and Loading Pipelines -Often times it is worth it to save a model or a pipeline to disk for later use. In Spark 1.6, a model import/export functionality was added to the Pipeline API. Most basic transformers are supported as well as some of the more basic ML models. 
Please refer to the algorithm's API documentation to see if saving and loading is supported. +Often times it is worth it to save a model or a pipeline to disk for later use. In Spark 1.6, a model import/export functionality was added to the Pipeline API. +As of Spark 2.3, the DataFrame-based API in `spark.ml` and `pyspark.ml` has complete coverage. + +ML persistence works across Scala, Java and Python. However, R currently uses a modified format, +so models saved in R can only be loaded back in R; this should be fixed in the future and is +tracked in [SPARK-15572](https://issues.apache.org/jira/browse/SPARK-15572). + +### Backwards compatibility for ML persistence + +In general, MLlib maintains backwards compatibility for ML persistence. I.e., if you save an ML +model or Pipeline in one version of Spark, then you should be able to load it back and use it in a +future version of Spark. However, there are rare exceptions, described below. + +Model persistence: Is a model or Pipeline saved using Apache Spark ML persistence in Spark +version X loadable by Spark version Y? + +* Major versions: No guarantees, but best-effort. +* Minor and patch versions: Yes; these are backwards compatible. +* Note about the format: There are no guarantees for a stable persistence format, but model loading itself is designed to be backwards compatible. + +Model behavior: Does a model or Pipeline in Spark version X behave identically in Spark version Y? + +* Major versions: No guarantees, but best-effort. +* Minor and patch versions: Identical behavior, except for bug fixes. + +For both model persistence and model behavior, any breaking changes across a minor version or patch +version are reported in the Spark version release notes. If a breakage is not reported in release +notes, then it should be treated as a bug to be fixed. # Code examples From 4f6a457d464096d791e13e57c55bcf23c01c418f Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 13 Feb 2018 11:56:49 -0800 Subject: [PATCH 291/566] [SPARK-23400][SQL] Add a constructors for ScalaUDF ## What changes were proposed in this pull request? In this upcoming 2.3 release, we changed the interface of `ScalaUDF`. Unfortunately, some Spark packages (e.g., spark-deep-learning) are using our internal class `ScalaUDF`. In the release 2.3, we added new parameters into this class. The users hit the binary compatibility issues and got the exception: ``` > java.lang.NoSuchMethodError: org.apache.spark.sql.catalyst.expressions.ScalaUDF.<init>(Ljava/lang/Object;Lorg/apache/spark/sql/types/DataType;Lscala/collection/Seq;Lscala/collection/Seq;Lscala/Option;)V ``` This PR is to improve the backward compatibility. However, we definitely should not encourage the external packages to use our internal classes. This might make us hard to maintain/develop the codes in Spark. ## How was this patch tested? N/A Author: gatorsmile Closes #20591 from gatorsmile/scalaUDF. 
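As an illustration of the compatibility being restored, a hedged sketch of the 2.1/2.2-era call site that external packages compiled against; `plusOne` and the literal child are made-up values, and only the five-argument constructor below comes from the patch:

```scala
import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, ScalaUDF}
import org.apache.spark.sql.types.IntegerType

val plusOne = (i: Int) => i + 1
val children: Seq[Expression] = Seq(Literal(41))

// Resolves to the re-added secondary constructor, which fills in the new
// parameters as nullable = true and udfDeterministic = true.
val udf = new ScalaUDF(plusOne, IntegerType, children, Seq(IntegerType), Some("plusOne"))
```

As the commit message notes, external packages should still avoid depending on internal classes such as `ScalaUDF`.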
(cherry picked from commit 2ee76c22b6e48e643694c9475e5f0d37124215e7) Signed-off-by: Shixiong Zhu --- .../spark/sql/catalyst/expressions/ScalaUDF.scala | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala index 388ef42883ad3..989c02305620a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala @@ -49,6 +49,17 @@ case class ScalaUDF( udfDeterministic: Boolean = true) extends Expression with ImplicitCastInputTypes with NonSQLExpression with UserDefinedExpression { + // The constructor for SPARK 2.1 and 2.2 + def this( + function: AnyRef, + dataType: DataType, + children: Seq[Expression], + inputTypes: Seq[DataType], + udfName: Option[String]) = { + this( + function, dataType, children, inputTypes, udfName, nullable = true, udfDeterministic = true) + } + override lazy val deterministic: Boolean = udfDeterministic && children.forall(_.deterministic) override def toString: String = From bb26bdb55fdf84c4e36fd66af9a15e325a3982d6 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Wed, 14 Feb 2018 10:55:24 +0800 Subject: [PATCH 292/566] [SPARK-23399][SQL] Register a task completion listener first for OrcColumnarBatchReader This PR aims to resolve an open file leakage issue reported at [SPARK-23390](https://issues.apache.org/jira/browse/SPARK-23390) by moving the listener registration position. Currently, the sequence is like the following. 1. Create `batchReader` 2. `batchReader.initialize` opens a ORC file. 3. `batchReader.initBatch` may take a long time to alloc memory in some environment and cause errors. 4. `Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => iter.close()))` This PR moves 4 before 2 and 3. To sum up, the new sequence is 1 -> 4 -> 2 -> 3. Manual. The following test case makes OOM intentionally to cause leaked filesystem connection in the current code base. With this patch, leakage doesn't occurs. ```scala // This should be tested manually because it raises OOM intentionally // in order to cause `Leaked filesystem connection`. test("SPARK-23399 Register a task completion listener first for OrcColumnarBatchReader") { withSQLConf(SQLConf.ORC_VECTORIZED_READER_BATCH_SIZE.key -> s"${Int.MaxValue}") { withTempDir { dir => val basePath = dir.getCanonicalPath Seq(0).toDF("a").write.format("orc").save(new Path(basePath, "first").toString) Seq(1).toDF("a").write.format("orc").save(new Path(basePath, "second").toString) val df = spark.read.orc( new Path(basePath, "first").toString, new Path(basePath, "second").toString) val e = intercept[SparkException] { df.collect() } assert(e.getCause.isInstanceOf[OutOfMemoryError]) } } } ``` Author: Dongjoon Hyun Closes #20590 from dongjoon-hyun/SPARK-23399. 
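To make the ordering explicit, a small sketch of the pattern the patch applies (the `LeakyReader` class is hypothetical, not Spark code): register the cleanup listener before any initialization that can fail, so the underlying file is still closed when the task ends.

```scala
import org.apache.spark.TaskContext

// Hypothetical resource that opens a file on construction and may fail to initialize.
class LeakyReader extends AutoCloseable {
  def initialize(): Unit = { /* may throw, e.g. OOM while allocating column batches */ }
  override def close(): Unit = { /* release the underlying file handle */ }
}

def openForTask(): LeakyReader = {
  val reader = new LeakyReader
  // Register cleanup first (step 4 moved before steps 2 and 3 in the description above),
  // so close() runs at task completion even if initialize() throws.
  Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => reader.close()))
  reader.initialize()
  reader
}
```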
(cherry picked from commit 357babde5a8eb9710de7016d7ae82dee21fa4ef3) Signed-off-by: Wenchen Fan --- .../sql/execution/datasources/orc/OrcFileFormat.scala | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index 2dd314d165348..94403c3be8c7b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -187,6 +187,12 @@ class OrcFileFormat if (enableVectorizedReader) { val batchReader = new OrcColumnarBatchReader( enableOffHeapColumnVector && taskContext.isDefined, copyToSpark) + // SPARK-23399 Register a task completion listener first to call `close()` in all cases. + // There is a possibility that `initialize` and `initBatch` hit some errors (like OOM) + // after opening a file. + val iter = new RecordReaderIterator(batchReader) + Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => iter.close())) + batchReader.initialize(fileSplit, taskAttemptContext) batchReader.initBatch( reader.getSchema, @@ -195,8 +201,6 @@ class OrcFileFormat partitionSchema, file.partitionValues) - val iter = new RecordReaderIterator(batchReader) - Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => iter.close())) iter.asInstanceOf[Iterator[InternalRow]] } else { val orcRecordReader = new OrcInputFormat[OrcStruct] From fd66a3b7b151514a9f626444ef8710f64dab6813 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cattilapiros=E2=80=9D?= Date: Wed, 14 Feb 2018 06:45:54 -0800 Subject: [PATCH 293/566] [SPARK-23394][UI] In RDD storage page show the executor addresses instead of the IDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? Extending RDD storage page to show executor addresses in the block table. ## How was this patch tested? Manually: ![screen shot 2018-02-13 at 10 30 59](https://user-images.githubusercontent.com/2017933/36142668-0b3578f8-10a9-11e8-95ea-2f57703ee4af.png) Author: “attilapiros” Closes #20589 from attilapiros/SPARK-23394. 
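A small sketch of the lookup the block table now performs (sample executor data, not from the patch): resolve each executor ID to its host and port, and fall back to the raw ID when the executor is not known to the store.

```scala
// executorIdToAddress is built from the executor summaries as (id -> hostPort).
val executorIdToAddress = Map("0" -> "host-a.example.com:4040", "1" -> "host-b.example.com:4040")
val blockExecutors = Seq("1", "0", "2")   // "2" has no known address

val rendered = blockExecutors
  .map { id => executorIdToAddress.getOrElse(id, id) }   // fall back to the ID
  .sorted
  .mkString(" ")
// "2 host-a.example.com:4040 host-b.example.com:4040"
```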
(cherry picked from commit 140f87533a468b1046504fc3ff01fbe1637e41cd) Signed-off-by: Marcelo Vanzin --- .../org/apache/spark/ui/storage/RDDPage.scala | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala b/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala index 02cee7f8c5b33..2674b9291203a 100644 --- a/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala @@ -23,7 +23,7 @@ import javax.servlet.http.HttpServletRequest import scala.xml.{Node, Unparsed} import org.apache.spark.status.AppStatusStore -import org.apache.spark.status.api.v1.{RDDDataDistribution, RDDPartitionInfo} +import org.apache.spark.status.api.v1.{ExecutorSummary, RDDDataDistribution, RDDPartitionInfo} import org.apache.spark.ui._ import org.apache.spark.util.Utils @@ -76,7 +76,8 @@ private[ui] class RDDPage(parent: SparkUITab, store: AppStatusStore) extends Web rddStorageInfo.partitions.get, blockPageSize, blockSortColumn, - blockSortDesc) + blockSortDesc, + store.executorList(true)) _blockTable.table(page) } catch { case e @ (_ : IllegalArgumentException | _ : IndexOutOfBoundsException) => @@ -182,7 +183,8 @@ private[ui] class BlockDataSource( rddPartitions: Seq[RDDPartitionInfo], pageSize: Int, sortColumn: String, - desc: Boolean) extends PagedDataSource[BlockTableRowData](pageSize) { + desc: Boolean, + executorIdToAddress: Map[String, String]) extends PagedDataSource[BlockTableRowData](pageSize) { private val data = rddPartitions.map(blockRow).sorted(ordering(sortColumn, desc)) @@ -198,7 +200,10 @@ private[ui] class BlockDataSource( rddPartition.storageLevel, rddPartition.memoryUsed, rddPartition.diskUsed, - rddPartition.executors.mkString(" ")) + rddPartition.executors + .map { id => executorIdToAddress.get(id).getOrElse(id) } + .sorted + .mkString(" ")) } /** @@ -226,7 +231,8 @@ private[ui] class BlockPagedTable( rddPartitions: Seq[RDDPartitionInfo], pageSize: Int, sortColumn: String, - desc: Boolean) extends PagedTable[BlockTableRowData] { + desc: Boolean, + executorSummaries: Seq[ExecutorSummary]) extends PagedTable[BlockTableRowData] { override def tableId: String = "rdd-storage-by-block-table" @@ -243,7 +249,8 @@ private[ui] class BlockPagedTable( rddPartitions, pageSize, sortColumn, - desc) + desc, + executorSummaries.map { ex => (ex.id, ex.hostPort) }.toMap) override def pageLink(page: Int): String = { val encodedSortColumn = URLEncoder.encode(sortColumn, "UTF-8") From a5a8a86e213c34d6fb32f0ae52db24d8f1ef0905 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 14 Feb 2018 10:59:36 -0800 Subject: [PATCH 294/566] Revert "[SPARK-23249][SQL] Improved block merging logic for partitions" This reverts commit f5f21e8c4261c0dfe8e3e788a30b38b188a18f67. 
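The code being restored implements "Next Fit Decreasing": files are taken in decreasing size order and appended to the current partition until the next file would push it past `maxSplitBytes`, at which point the partition is closed. A standalone sketch with made-up sizes, independent of Spark's `FilePartition` classes:

```scala
// Sizes in bytes; openCostInBytes models the fixed cost of opening one more file.
case class FileInfo(name: String, length: Long)

def packFiles(
    sortedFiles: Seq[FileInfo],      // assumed pre-sorted by length, descending
    maxSplitBytes: Long,
    openCostInBytes: Long): Seq[Seq[FileInfo]] = {
  val partitions = Seq.newBuilder[Seq[FileInfo]]
  var current = Vector.empty[FileInfo]
  var currentSize = 0L

  def closePartition(): Unit = {
    if (current.nonEmpty) partitions += current
    current = Vector.empty
    currentSize = 0L
  }

  sortedFiles.foreach { file =>
    if (currentSize + file.length > maxSplitBytes) closePartition()
    currentSize += file.length + openCostInBytes
    current = current :+ file
  }
  closePartition()
  partitions.result()
}

// packFiles(Seq(FileInfo("a", 4), FileInfo("b", 2), FileInfo("c", 2), FileInfo("d", 1)), 4, 1)
// returns Seq(Seq(a), Seq(b), Seq(c, d)): greedy, with no back-filling of earlier partitions,
// which is the behaviour the reverted SPARK-23249 change had tried to improve.
```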
--- .../sql/execution/DataSourceScanExec.scala | 29 +++++-------------- .../datasources/FileSourceStrategySuite.scala | 15 ++++++---- 2 files changed, 17 insertions(+), 27 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index f7732e2098c29..aa66ee7e948ea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -445,29 +445,16 @@ case class FileSourceScanExec( currentSize = 0 } - def addFile(file: PartitionedFile): Unit = { - currentFiles += file - currentSize += file.length + openCostInBytes - } - - var frontIndex = 0 - var backIndex = splitFiles.length - 1 - - while (frontIndex <= backIndex) { - addFile(splitFiles(frontIndex)) - frontIndex += 1 - while (frontIndex <= backIndex && - currentSize + splitFiles(frontIndex).length <= maxSplitBytes) { - addFile(splitFiles(frontIndex)) - frontIndex += 1 - } - while (backIndex > frontIndex && - currentSize + splitFiles(backIndex).length <= maxSplitBytes) { - addFile(splitFiles(backIndex)) - backIndex -= 1 + // Assign files to partitions using "Next Fit Decreasing" + splitFiles.foreach { file => + if (currentSize + file.length > maxSplitBytes) { + closePartition() } - closePartition() + // Add the given file to the current partition. + currentSize += file.length + openCostInBytes + currentFiles += file } + closePartition() new FileScanRDD(fsRelation.sparkSession, readFile, partitions) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala index bfccc9335b361..c1d61b843d899 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala @@ -141,17 +141,16 @@ class FileSourceStrategySuite extends QueryTest with SharedSQLContext with Predi withSQLConf(SQLConf.FILES_MAX_PARTITION_BYTES.key -> "4", SQLConf.FILES_OPEN_COST_IN_BYTES.key -> "1") { checkScan(table.select('c1)) { partitions => - // Files should be laid out [(file1, file6), (file2, file3), (file4, file5)] - assert(partitions.size == 3, "when checking partitions") - assert(partitions(0).files.size == 2, "when checking partition 1") + // Files should be laid out [(file1), (file2, file3), (file4, file5), (file6)] + assert(partitions.size == 4, "when checking partitions") + assert(partitions(0).files.size == 1, "when checking partition 1") assert(partitions(1).files.size == 2, "when checking partition 2") assert(partitions(2).files.size == 2, "when checking partition 3") + assert(partitions(3).files.size == 1, "when checking partition 4") - // First partition reads (file1, file6) + // First partition reads (file1) assert(partitions(0).files(0).start == 0) assert(partitions(0).files(0).length == 2) - assert(partitions(0).files(1).start == 0) - assert(partitions(0).files(1).length == 1) // Second partition reads (file2, file3) assert(partitions(1).files(0).start == 0) @@ -164,6 +163,10 @@ class FileSourceStrategySuite extends QueryTest with SharedSQLContext with Predi assert(partitions(2).files(0).length == 1) assert(partitions(2).files(1).start == 0) assert(partitions(2).files(1).length == 1) + + // Final partition reads (file6) + 
assert(partitions(3).files(0).start == 0) + assert(partitions(3).files(0).length == 1) } checkPartitionSchema(StructType(Nil)) From bd83f7ba097d9bca9a0e8c072f7566a645887a96 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 14 Feb 2018 23:52:59 -0800 Subject: [PATCH 295/566] [SPARK-23421][SPARK-22356][SQL] Document the behavior change in ## What changes were proposed in this pull request? https://github.com/apache/spark/pull/19579 introduces a behavior change. We need to document it in the migration guide. ## How was this patch tested? Also update the HiveExternalCatalogVersionsSuite to verify it. Author: gatorsmile Closes #20606 from gatorsmile/addMigrationGuide. (cherry picked from commit a77ebb0921e390cf4fc6279a8c0a92868ad7e69b) Signed-off-by: gatorsmile --- docs/sql-programming-guide.md | 2 ++ .../spark/sql/hive/HiveExternalCatalogVersionsSuite.scala | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 0f9f01e18682f..cf9529a79f4f9 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1963,6 +1963,8 @@ working with timestamps in `pandas_udf`s to get the best performance, see ## Upgrading From Spark SQL 2.1 to 2.2 - Spark 2.1.1 introduced a new configuration key: `spark.sql.hive.caseSensitiveInferenceMode`. It had a default setting of `NEVER_INFER`, which kept behavior identical to 2.1.0. However, Spark 2.2.0 changes this setting's default value to `INFER_AND_SAVE` to restore compatibility with reading Hive metastore tables whose underlying file schema have mixed-case column names. With the `INFER_AND_SAVE` configuration value, on first access Spark will perform schema inference on any Hive metastore table for which it has not already saved an inferred schema. Note that schema inference can be a very time consuming operation for tables with thousands of partitions. If compatibility with mixed-case column names is not a concern, you can safely set `spark.sql.hive.caseSensitiveInferenceMode` to `NEVER_INFER` to avoid the initial overhead of schema inference. Note that with the new default `INFER_AND_SAVE` setting, the results of the schema inference are saved as a metastore key for future use. Therefore, the initial schema inference occurs only at a table's first access. + + - Since Spark 2.2.1 and 2.3.0, the schema is always inferred at runtime when the data source tables have the columns that exist in both partition schema and data schema. The inferred schema does not have the partitioned columns. When reading the table, Spark respects the partition values of these overlapping columns instead of the values stored in the data source files. In 2.2.0 and 2.1.x release, the inferred schema is partitioned but the data of the table is invisible to users (i.e., the result set is empty). ## Upgrading From Spark SQL 2.0 to 2.1 diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index ae4aeb7b4ce4a..c13a750dbb270 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -195,7 +195,7 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { object PROCESS_TABLES extends QueryTest with SQLTestUtils { // Tests the latest version of every release line. 
- val testingVersions = Seq("2.0.2", "2.1.2", "2.2.0") + val testingVersions = Seq("2.0.2", "2.1.2", "2.2.0", "2.2.1") protected var spark: SparkSession = _ @@ -249,7 +249,7 @@ object PROCESS_TABLES extends QueryTest with SQLTestUtils { // SPARK-22356: overlapped columns between data and partition schema in data source tables val tbl_with_col_overlap = s"tbl_with_col_overlap_$index" - // For Spark 2.2.0 and 2.1.x, the behavior is different from Spark 2.0. + // For Spark 2.2.0 and 2.1.x, the behavior is different from Spark 2.0, 2.2.1, 2.3+ if (testingVersions(index).startsWith("2.1") || testingVersions(index) == "2.2.0") { spark.sql("msck repair table " + tbl_with_col_overlap) assert(spark.table(tbl_with_col_overlap).columns === Array("i", "j", "p")) From 129fd45efb418c6afa95aa26e5b96f03a39dcdd0 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 14 Feb 2018 23:56:02 -0800 Subject: [PATCH 296/566] [SPARK-23094] Revert [] Fix invalid character handling in JsonDataSource ## What changes were proposed in this pull request? This PR is to revert the PR https://github.com/apache/spark/pull/20302, because it causes a regression. ## How was this patch tested? N/A Author: gatorsmile Closes #20614 from gatorsmile/revertJsonFix. (cherry picked from commit 95e4b4916065e66a4f8dba57e98e725796f75e04) Signed-off-by: gatorsmile --- .../catalyst/json/CreateJacksonParser.scala | 5 ++- .../sources/JsonHadoopFsRelationSuite.scala | 34 ------------------- 2 files changed, 2 insertions(+), 37 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala index b1672e7e2fca2..025a388aacaa5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala @@ -40,11 +40,10 @@ private[sql] object CreateJacksonParser extends Serializable { } def text(jsonFactory: JsonFactory, record: Text): JsonParser = { - val bain = new ByteArrayInputStream(record.getBytes, 0, record.getLength) - jsonFactory.createParser(new InputStreamReader(bain, "UTF-8")) + jsonFactory.createParser(record.getBytes, 0, record.getLength) } def inputStream(jsonFactory: JsonFactory, record: InputStream): JsonParser = { - jsonFactory.createParser(new InputStreamReader(record, "UTF-8")) + jsonFactory.createParser(record) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala index 27f398ebf301a..49be30435ad2f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala @@ -28,8 +28,6 @@ import org.apache.spark.sql.types._ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = "json" - private val badJson = "\u0000\u0000\u0000A\u0001AAA" - // JSON does not write data of NullType and does not play well with BinaryType. 
override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false @@ -107,36 +105,4 @@ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { ) } } - - test("invalid json with leading nulls - from file (multiLine=true)") { - import testImplicits._ - withTempDir { tempDir => - val path = tempDir.getAbsolutePath - Seq(badJson, """{"a":1}""").toDS().write.mode("overwrite").text(path) - val expected = s"""$badJson\n{"a":1}\n""" - val schema = new StructType().add("a", IntegerType).add("_corrupt_record", StringType) - val df = - spark.read.format(dataSourceName).option("multiLine", true).schema(schema).load(path) - checkAnswer(df, Row(null, expected)) - } - } - - test("invalid json with leading nulls - from file (multiLine=false)") { - import testImplicits._ - withTempDir { tempDir => - val path = tempDir.getAbsolutePath - Seq(badJson, """{"a":1}""").toDS().write.mode("overwrite").text(path) - val schema = new StructType().add("a", IntegerType).add("_corrupt_record", StringType) - val df = - spark.read.format(dataSourceName).option("multiLine", false).schema(schema).load(path) - checkAnswer(df, Seq(Row(1, null), Row(null, badJson))) - } - } - - test("invalid json with leading nulls - from dataset") { - import testImplicits._ - checkAnswer( - spark.read.json(Seq(badJson).toDS()), - Row(badJson)) - } } From f2c0585652a8262801e92a02b56c56f16b8926e5 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 15 Feb 2018 16:59:44 +0800 Subject: [PATCH 297/566] [SPARK-23419][SPARK-23416][SS] data source v2 write path should re-throw interruption exceptions directly ## What changes were proposed in this pull request? Streaming execution has a list of exceptions that means interruption, and handle them specially. `WriteToDataSourceV2Exec` should also respect this list and not wrap them with `SparkException`. ## How was this patch tested? existing test. Author: Wenchen Fan Closes #20605 from cloud-fan/write. 
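A condensed sketch of the error-handling shape this gives the abort path; the interruption check is passed in here so the sketch stays self-contained, while in the patch it is the `StreamExecution.isInterruptionException` helper factored out below:

```scala
import scala.util.control.NonFatal
import org.apache.spark.SparkException

// `cause` is whatever the failed write job threw.
def rethrowAbortCause(cause: Throwable, isInterruption: Throwable => Boolean): Nothing =
  cause match {
    // Interruption-style exceptions must surface unchanged so the streaming
    // engine's stop handling still recognises them and does not fail the query.
    case _ if isInterruption(cause) => throw cause
    // Only non-fatal errors are wrapped for a clearer error message.
    case NonFatal(e) => throw new SparkException("Writing job aborted.", e)
    // Fatal errors (OutOfMemoryError, etc.) propagate as-is.
    case _ => throw cause
  }
```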
(cherry picked from commit f38c760638063f1fb45e9ee2c772090fb203a4a0) Signed-off-by: Wenchen Fan --- .../datasources/v2/WriteToDataSourceV2.scala | 11 ++++- .../execution/streaming/StreamExecution.scala | 40 ++++++++++--------- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala index 6592bd72fa338..d02faacd9c19a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.datasources.v2 +import scala.util.control.NonFatal + import org.apache.spark.{SparkEnv, SparkException, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD @@ -26,6 +28,7 @@ import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.streaming.StreamExecution import org.apache.spark.sql.execution.streaming.continuous.{CommitPartitionEpoch, ContinuousExecution, EpochCoordinatorRef, SetWriterPartitions} import org.apache.spark.sql.sources.v2.writer._ import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter @@ -102,7 +105,13 @@ case class WriteToDataSourceV2Exec(writer: DataSourceWriter, query: SparkPlan) e throw new SparkException("Writing job failed.", cause) } logError(s"Data source writer $writer aborted.") - throw new SparkException("Writing job aborted.", cause) + cause match { + // Do not wrap interruption exceptions that will be handled by streaming specially. + case _ if StreamExecution.isInterruptionException(cause) => throw cause + // Only wrap non fatal exceptions. 
+ case NonFatal(e) => throw new SparkException("Writing job aborted.", e) + case _ => throw cause + } } sparkContext.emptyRDD diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index e7982d7880ceb..3fc8c7887896a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -356,25 +356,7 @@ abstract class StreamExecution( private def isInterruptedByStop(e: Throwable): Boolean = { if (state.get == TERMINATED) { - e match { - // InterruptedIOException - thrown when an I/O operation is interrupted - // ClosedByInterruptException - thrown when an I/O operation upon a channel is interrupted - case _: InterruptedException | _: InterruptedIOException | _: ClosedByInterruptException => - true - // The cause of the following exceptions may be one of the above exceptions: - // - // UncheckedIOException - thrown by codes that cannot throw a checked IOException, such as - // BiFunction.apply - // ExecutionException - thrown by codes running in a thread pool and these codes throw an - // exception - // UncheckedExecutionException - thrown by codes that cannot throw a checked - // ExecutionException, such as BiFunction.apply - case e2 @ (_: UncheckedIOException | _: ExecutionException | _: UncheckedExecutionException) - if e2.getCause != null => - isInterruptedByStop(e2.getCause) - case _ => - false - } + StreamExecution.isInterruptionException(e) } else { false } @@ -565,6 +547,26 @@ abstract class StreamExecution( object StreamExecution { val QUERY_ID_KEY = "sql.streaming.queryId" + + def isInterruptionException(e: Throwable): Boolean = e match { + // InterruptedIOException - thrown when an I/O operation is interrupted + // ClosedByInterruptException - thrown when an I/O operation upon a channel is interrupted + case _: InterruptedException | _: InterruptedIOException | _: ClosedByInterruptException => + true + // The cause of the following exceptions may be one of the above exceptions: + // + // UncheckedIOException - thrown by codes that cannot throw a checked IOException, such as + // BiFunction.apply + // ExecutionException - thrown by codes running in a thread pool and these codes throw an + // exception + // UncheckedExecutionException - thrown by codes that cannot throw a checked + // ExecutionException, such as BiFunction.apply + case e2 @ (_: UncheckedIOException | _: ExecutionException | _: UncheckedExecutionException) + if e2.getCause != null => + isInterruptionException(e2.getCause) + case _ => + false + } } /** From d24d13179f0e9d125eaaebfcc225e1ec30c5cb83 Mon Sep 17 00:00:00 2001 From: Gabor Somogyi Date: Thu, 15 Feb 2018 03:52:40 -0800 Subject: [PATCH 298/566] =?UTF-8?q?[SPARK-23422][CORE]=20YarnShuffleIntegr?= =?UTF-8?q?ationSuite=20fix=20when=20SPARK=5FPREPEN=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …D_CLASSES set to 1 ## What changes were proposed in this pull request? YarnShuffleIntegrationSuite fails when SPARK_PREPEND_CLASSES set to 1. Normally mllib built before yarn module. When SPARK_PREPEND_CLASSES used mllib classes are on yarn test classpath. Before 2.3 that did not cause issues. But 2.3 has SPARK-22450, which registered some mllib classes with the kryo serializer. 
Now it dies with the following error: ` 18/02/13 07:33:29 INFO SparkContext: Starting job: collect at YarnShuffleIntegrationSuite.scala:143 Exception in thread "dag-scheduler-event-loop" java.lang.NoClassDefFoundError: breeze/linalg/DenseMatrix ` In this PR NoClassDefFoundError caught only in case of testing and then do nothing. ## How was this patch tested? Automated: Pass the Jenkins. Author: Gabor Somogyi Closes #20608 from gaborgsomogyi/SPARK-23422. (cherry picked from commit 44e20c42254bc6591b594f54cd94ced5fcfadae3) Signed-off-by: Marcelo Vanzin --- .../main/scala/org/apache/spark/serializer/KryoSerializer.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 538ae05e4eea1..72427dd6ce4d4 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -206,6 +206,7 @@ class KryoSerializer(conf: SparkConf) kryo.register(clazz) } catch { case NonFatal(_) => // do nothing + case _: NoClassDefFoundError if Utils.isTesting => // See SPARK-23422. } } From bae4449ad836a64db853e297d33bfd1a725faa0b Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Thu, 15 Feb 2018 08:55:39 -0800 Subject: [PATCH 299/566] [SPARK-23426][SQL] Use `hive` ORC impl and disable PPD for Spark 2.3.0 ## What changes were proposed in this pull request? To prevent any regressions, this PR changes ORC implementation to `hive` by default like Spark 2.2.X. Users can enable `native` ORC. Also, ORC PPD is also restored to `false` like Spark 2.2.X. ![orc_section](https://user-images.githubusercontent.com/9700541/36221575-57a1d702-1173-11e8-89fe-dca5842f4ca7.png) ## How was this patch tested? Pass all test cases. Author: Dongjoon Hyun Closes #20610 from dongjoon-hyun/SPARK-ORC-DISABLE. (cherry picked from commit 2f0498d1e85a53b60da6a47d20bbdf56b42b7dcb) Signed-off-by: gatorsmile --- docs/sql-programming-guide.md | 52 ++++++++----------- .../apache/spark/sql/internal/SQLConf.scala | 6 +-- .../spark/sql/FileBasedDataSourceSuite.scala | 17 +++++- .../sql/streaming/FileStreamSinkSuite.scala | 13 +++++ .../sql/streaming/FileStreamSourceSuite.scala | 13 +++++ 5 files changed, 68 insertions(+), 33 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index cf9529a79f4f9..91e43678481d6 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1004,6 +1004,29 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession +## ORC Files + +Since Spark 2.3, Spark supports a vectorized ORC reader with a new ORC file format for ORC files. +To do that, the following configurations are newly added. The vectorized reader is used for the +native ORC tables (e.g., the ones created using the clause `USING ORC`) when `spark.sql.orc.impl` +is set to `native` and `spark.sql.orc.enableVectorizedReader` is set to `true`. For the Hive ORC +serde tables (e.g., the ones created using the clause `USING HIVE OPTIONS (fileFormat 'ORC')`), +the vectorized reader is used when `spark.sql.hive.convertMetastoreOrc` is also set to `true`. + + + + + + + + + + + + + +
+   <tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+   <tr><td>spark.sql.orc.impl</td><td>hive</td><td>The name of ORC implementation. It can be one of native and hive. native means the native ORC support that is built on Apache ORC 1.4.1. `hive` means the ORC library in Hive 1.2.1.</td></tr>
+   <tr><td>spark.sql.orc.enableVectorizedReader</td><td>true</td><td>Enables vectorized orc decoding in native implementation. If false, a new non-vectorized ORC reader is used in native implementation. For hive implementation, this is ignored.</td></tr>
    + ## JSON Datasets
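As a usage sketch (not part of the patch; the path is a placeholder), opting back into the native, vectorized ORC reader and predicate pushdown for a session looks like:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("orc-native").getOrCreate()

// In 2.3.0 the defaults are spark.sql.orc.impl=hive and spark.sql.orc.filterPushdown=false,
// so the native vectorized reader and ORC predicate pushdown must be enabled explicitly.
spark.conf.set("spark.sql.orc.impl", "native")
spark.conf.set("spark.sql.orc.enableVectorizedReader", "true")
spark.conf.set("spark.sql.orc.filterPushdown", "true")

val df = spark.read.orc("/path/to/data.orc")
df.show()
```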
    @@ -1776,35 +1799,6 @@ working with timestamps in `pandas_udf`s to get the best performance, see ## Upgrading From Spark SQL 2.2 to 2.3 - - Since Spark 2.3, Spark supports a vectorized ORC reader with a new ORC file format for ORC files. To do that, the following configurations are newly added or change their default values. The vectorized reader is used for the native ORC tables (e.g., the ones created using the clause `USING ORC`) when `spark.sql.orc.impl` is set to `native` and `spark.sql.orc.enableVectorizedReader` is set to `true`. For the Hive ORC serde table (e.g., the ones created using the clause `USING HIVE OPTIONS (fileFormat 'ORC')`), the vectorized reader is used when `spark.sql.hive.convertMetastoreOrc` is set to `true`. - - - New configurations - - - - - - - - - - - - - -
-   <tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
-   <tr><td>spark.sql.orc.impl</td><td>native</td><td>The name of ORC implementation. It can be one of native and hive. native means the native ORC support that is built on Apache ORC 1.4.1. `hive` means the ORC library in Hive 1.2.1 which is used prior to Spark 2.3.</td></tr>
-   <tr><td>spark.sql.orc.enableVectorizedReader</td><td>true</td><td>Enables vectorized orc decoding in native implementation. If false, a new non-vectorized ORC reader is used in native implementation. For hive implementation, this is ignored.</td></tr>
-
-   Changed configurations
-
-   <tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
-   <tr><td>spark.sql.orc.filterPushdown</td><td>true</td><td>Enables filter pushdown for ORC files. It is false by default prior to Spark 2.3.</td></tr>
    - - Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the referenced columns only include the internal corrupt record column (named `_corrupt_record` by default). For example, `spark.read.schema(schema).json(file).filter($"_corrupt_record".isNotNull).count()` and `spark.read.schema(schema).json(file).select("_corrupt_record").show()`. Instead, you can cache or save the parsed results and then send the same query. For example, `val df = spark.read.schema(schema).json(file).cache()` and then `df.filter($"_corrupt_record".isNotNull).count()`. - The `percentile_approx` function previously accepted numeric type input and output double type results. Now it supports date type, timestamp type and numeric types as input types. The result type is also changed to be the same as the input type, which is more reasonable for percentiles. - Since Spark 2.3, the Join/Filter's deterministic predicates that are after the first non-deterministic predicates are also pushed down/through the child operators, if possible. In prior Spark versions, these filters are not eligible for predicate pushdown. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 54d4f63472e27..47fcf34b2cd3d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -393,11 +393,11 @@ object SQLConf { val ORC_IMPLEMENTATION = buildConf("spark.sql.orc.impl") .doc("When native, use the native version of ORC support instead of the ORC library in Hive " + - "1.2.1. It is 'hive' by default prior to Spark 2.3.") + "1.2.1. It is 'hive' by default.") .internal() .stringConf .checkValues(Set("hive", "native")) - .createWithDefault("native") + .createWithDefault("hive") val ORC_VECTORIZED_READER_ENABLED = buildConf("spark.sql.orc.enableVectorizedReader") .doc("Enables vectorized orc decoding.") @@ -414,7 +414,7 @@ object SQLConf { val ORC_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.orc.filterPushdown") .doc("When true, enable filter pushdown for ORC files.") .booleanConf - .createWithDefault(true) + .createWithDefault(false) val HIVE_VERIFY_PARTITION_PATH = buildConf("spark.sql.hive.verifyPartitionPath") .doc("When true, check all the partition paths under the table\'s root directory " + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala index 2e332362ea644..b5d4c558f0d3e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala @@ -20,14 +20,29 @@ package org.apache.spark.sql import java.io.FileNotFoundException import org.apache.hadoop.fs.Path +import org.scalatest.BeforeAndAfterAll import org.apache.spark.SparkException import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext -class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { + +class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext with BeforeAndAfterAll { import testImplicits._ + override def beforeAll(): Unit = { + super.beforeAll() + spark.sessionState.conf.setConf(SQLConf.ORC_IMPLEMENTATION, "native") + } + + override def afterAll(): Unit = { + try { + spark.sessionState.conf.unsetConf(SQLConf.ORC_IMPLEMENTATION) + } finally { + super.afterAll() + 
} + } + private val allFileBasedDataSources = Seq("orc", "parquet", "csv", "json", "text") private val nameWithSpecialChars = "sp&cial%c hars" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala index 8c4e1fd00b0a2..ba48bc1ce0c4d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala @@ -33,6 +33,19 @@ import org.apache.spark.util.Utils class FileStreamSinkSuite extends StreamTest { import testImplicits._ + override def beforeAll(): Unit = { + super.beforeAll() + spark.sessionState.conf.setConf(SQLConf.ORC_IMPLEMENTATION, "native") + } + + override def afterAll(): Unit = { + try { + spark.sessionState.conf.unsetConf(SQLConf.ORC_IMPLEMENTATION) + } finally { + super.afterAll() + } + } + test("unpartitioned writing and batch reading") { val inputData = MemoryStream[Int] val df = inputData.toDF() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index 5bb0f4d643bbe..d4bd9c7987f2d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -207,6 +207,19 @@ class FileStreamSourceSuite extends FileStreamSourceTest { .collect { case s @ StreamingRelation(dataSource, _, _) => s.schema }.head } + override def beforeAll(): Unit = { + super.beforeAll() + spark.sessionState.conf.setConf(SQLConf.ORC_IMPLEMENTATION, "native") + } + + override def afterAll(): Unit = { + try { + spark.sessionState.conf.unsetConf(SQLConf.ORC_IMPLEMENTATION) + } finally { + super.afterAll() + } + } + // ============= Basic parameter exists tests ================ test("FileStreamSource schema: no path") { From 03960faa621710388c0de91d16a71dda749d173f Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Thu, 15 Feb 2018 09:40:08 -0800 Subject: [PATCH 300/566] [MINOR][SQL] Fix an error message about inserting into bucketed tables ## What changes were proposed in this pull request? This replaces `Sparkcurrently` to `Spark currently` in the following error message. ```scala scala> sql("insert into t2 select * from v1") org.apache.spark.sql.AnalysisException: Output Hive table `default`.`t2` is bucketed but Sparkcurrently does NOT populate bucketed ... ``` ## How was this patch tested? Manual. Author: Dongjoon Hyun Closes #20617 from dongjoon-hyun/SPARK-ERROR-MSG. 
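The underlying mistake is a common one with multi-line string concatenation: without a trailing space on the first fragment, the two words run together. A tiny illustration (the table name is made up):

```scala
val table = "`default`.`t2`"

// Missing trailing space: renders "... bucketed but Sparkcurrently does NOT ..."
val broken = s"Output Hive table $table is bucketed but Spark" +
  "currently does NOT populate bucketed output which is compatible with Hive."

// Fixed: note the space after "Spark".
val fixed = s"Output Hive table $table is bucketed but Spark " +
  "currently does NOT populate bucketed output which is compatible with Hive."
```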
(cherry picked from commit 6968c3cfd70961c4e86daffd6a156d0a9c1d7a2a) Signed-off-by: gatorsmile --- .../apache/spark/sql/hive/execution/InsertIntoHiveTable.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 3ce5b8469d6fc..02a60f16b3b3a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -172,7 +172,7 @@ case class InsertIntoHiveTable( val enforceBucketingConfig = "hive.enforce.bucketing" val enforceSortingConfig = "hive.enforce.sorting" - val message = s"Output Hive table ${table.identifier} is bucketed but Spark" + + val message = s"Output Hive table ${table.identifier} is bucketed but Spark " + "currently does NOT populate bucketed output which is compatible with Hive." if (hadoopConf.get(enforceBucketingConfig, "true").toBoolean || From 0bd7765cd9832bee348af87663f3d424b61e92fc Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 15 Feb 2018 09:54:39 -0800 Subject: [PATCH 301/566] [SPARK-23377][ML] Fixes Bucketizer with multiple columns persistence bug ## What changes were proposed in this pull request? #### Problem: Since 2.3, `Bucketizer` supports multiple input/output columns. We will check if exclusive params are set during transformation. E.g., if `inputCols` and `outputCol` are both set, an error will be thrown. However, when we write `Bucketizer`, looks like the default params and user-supplied params are merged during writing. All saved params are loaded back and set to created model instance. So the default `outputCol` param in `HasOutputCol` trait will be set in `paramMap` and become an user-supplied param. That makes the check of exclusive params failed. #### Fix: This changes the saving logic of Bucketizer to handle this case. This is a quick fix to catch the time of 2.3. We should consider modify the persistence mechanism later. Please see the discussion in the JIRA. Note: The multi-column `QuantileDiscretizer` also has the same issue. ## How was this patch tested? Modified tests. Author: Liang-Chi Hsieh Closes #20594 from viirya/SPARK-23377-2. (cherry picked from commit db45daab90ede4c03c1abc9096f4eac584e9db17) Signed-off-by: Joseph K. 
Bradley --- .../apache/spark/ml/feature/Bucketizer.scala | 28 +++++++++++++++++++ .../ml/feature/QuantileDiscretizer.scala | 28 +++++++++++++++++++ .../spark/ml/feature/BucketizerSuite.scala | 12 ++++++-- .../ml/feature/QuantileDiscretizerSuite.scala | 14 ++++++++-- 4 files changed, 78 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala index c13bf47eacb94..f49c410cbcfe2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala @@ -19,6 +19,10 @@ package org.apache.spark.ml.feature import java.{util => ju} +import org.json4s.JsonDSL._ +import org.json4s.JValue +import org.json4s.jackson.JsonMethods._ + import org.apache.spark.SparkException import org.apache.spark.annotation.Since import org.apache.spark.ml.Model @@ -213,6 +217,8 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String override def copy(extra: ParamMap): Bucketizer = { defaultCopy[Bucketizer](extra).setParent(parent) } + + override def write: MLWriter = new Bucketizer.BucketizerWriter(this) } @Since("1.6.0") @@ -290,6 +296,28 @@ object Bucketizer extends DefaultParamsReadable[Bucketizer] { } } + + private[Bucketizer] class BucketizerWriter(instance: Bucketizer) extends MLWriter { + + override protected def saveImpl(path: String): Unit = { + // SPARK-23377: The default params will be saved and loaded as user-supplied params. + // Once `inputCols` is set, the default value of `outputCol` param causes the error + // when checking exclusive params. As a temporary to fix it, we skip the default value + // of `outputCol` if `inputCols` is set when saving the metadata. + // TODO: If we modify the persistence mechanism later to better handle default params, + // we can get rid of this. 
+ var paramWithoutOutputCol: Option[JValue] = None + if (instance.isSet(instance.inputCols)) { + val params = instance.extractParamMap().toSeq + val jsonParams = params.filter(_.param != instance.outputCol).map { case ParamPair(p, v) => + p.name -> parse(p.jsonEncode(v)) + }.toList + paramWithoutOutputCol = Some(render(jsonParams)) + } + DefaultParamsWriter.saveMetadata(instance, path, sc, paramMap = paramWithoutOutputCol) + } + } + @Since("1.6.0") override def load(path: String): Bucketizer = super.load(path) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala index 1ec5f8cb6139b..3b4c25478fb1d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala @@ -17,6 +17,10 @@ package org.apache.spark.ml.feature +import org.json4s.JsonDSL._ +import org.json4s.JValue +import org.json4s.jackson.JsonMethods._ + import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging import org.apache.spark.ml._ @@ -249,11 +253,35 @@ final class QuantileDiscretizer @Since("1.6.0") (@Since("1.6.0") override val ui @Since("1.6.0") override def copy(extra: ParamMap): QuantileDiscretizer = defaultCopy(extra) + + override def write: MLWriter = new QuantileDiscretizer.QuantileDiscretizerWriter(this) } @Since("1.6.0") object QuantileDiscretizer extends DefaultParamsReadable[QuantileDiscretizer] with Logging { + private[QuantileDiscretizer] + class QuantileDiscretizerWriter(instance: QuantileDiscretizer) extends MLWriter { + + override protected def saveImpl(path: String): Unit = { + // SPARK-23377: The default params will be saved and loaded as user-supplied params. + // Once `inputCols` is set, the default value of `outputCol` param causes the error + // when checking exclusive params. As a temporary to fix it, we skip the default value + // of `outputCol` if `inputCols` is set when saving the metadata. + // TODO: If we modify the persistence mechanism later to better handle default params, + // we can get rid of this. 
+ var paramWithoutOutputCol: Option[JValue] = None + if (instance.isSet(instance.inputCols)) { + val params = instance.extractParamMap().toSeq + val jsonParams = params.filter(_.param != instance.outputCol).map { case ParamPair(p, v) => + p.name -> parse(p.jsonEncode(v)) + }.toList + paramWithoutOutputCol = Some(render(jsonParams)) + } + DefaultParamsWriter.saveMetadata(instance, path, sc, paramMap = paramWithoutOutputCol) + } + } + @Since("1.6.0") override def load(path: String): QuantileDiscretizer = super.load(path) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala index 7403680ae3fdc..41cf72fe3470a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala @@ -172,7 +172,10 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setSplits(Array(0.1, 0.8, 0.9)) - testDefaultReadWrite(t) + + val bucketizer = testDefaultReadWrite(t) + val data = Seq((1.0, 2.0), (10.0, 100.0), (101.0, -1.0)).toDF("myInputCol", "myInputCol2") + bucketizer.transform(data) } test("Bucket numeric features") { @@ -327,7 +330,12 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa .setInputCols(Array("myInputCol")) .setOutputCols(Array("myOutputCol")) .setSplitsArray(Array(Array(0.1, 0.8, 0.9))) - testDefaultReadWrite(t) + + val bucketizer = testDefaultReadWrite(t) + val data = Seq((1.0, 2.0), (10.0, 100.0), (101.0, -1.0)).toDF("myInputCol", "myInputCol2") + bucketizer.transform(data) + assert(t.hasDefault(t.outputCol)) + assert(bucketizer.hasDefault(bucketizer.outputCol)) } test("Bucketizer in a pipeline") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala index e9a75e931e6a8..6c363799dd300 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala @@ -27,6 +27,8 @@ import org.apache.spark.sql.functions.udf class QuantileDiscretizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + import testImplicits._ + test("Test observed number of buckets and their sizes match expected values") { val spark = this.spark import spark.implicits._ @@ -132,7 +134,10 @@ class QuantileDiscretizerSuite .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setNumBuckets(6) - testDefaultReadWrite(t) + + val readDiscretizer = testDefaultReadWrite(t) + val data = sc.parallelize(1 to 100).map(Tuple1.apply).toDF("myInputCol") + readDiscretizer.fit(data) } test("Verify resulting model has parent") { @@ -379,7 +384,12 @@ class QuantileDiscretizerSuite .setInputCols(Array("input1", "input2")) .setOutputCols(Array("result1", "result2")) .setNumBucketsArray(Array(5, 10)) - testDefaultReadWrite(discretizer) + + val readDiscretizer = testDefaultReadWrite(discretizer) + val data = Seq((1.0, 2.0), (2.0, 3.0), (3.0, 4.0)).toDF("input1", "input2") + readDiscretizer.fit(data) + assert(discretizer.hasDefault(discretizer.outputCol)) + assert(readDiscretizer.hasDefault(readDiscretizer.outputCol)) } test("Multiple Columns: Both inputCol and inputCols are set") { From 75bb19a018f9260eab3ea0ba3ea46e84b87eabf2 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=E2=80=9Cattilapiros=E2=80=9D?= Date: Thu, 15 Feb 2018 14:03:41 -0600 Subject: [PATCH 302/566] =?UTF-8?q?[SPARK-23413][UI]=20Fix=20sorting=20tas?= =?UTF-8?q?ks=20by=20Host=20/=20Executor=20ID=20at=20the=20Stag=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …e page ## What changes were proposed in this pull request? Fixing exception got at sorting tasks by Host / Executor ID: ``` java.lang.IllegalArgumentException: Invalid sort column: Host at org.apache.spark.ui.jobs.ApiHelper$.indexName(StagePage.scala:1017) at org.apache.spark.ui.jobs.TaskDataSource.sliceData(StagePage.scala:694) at org.apache.spark.ui.PagedDataSource.pageData(PagedTable.scala:61) at org.apache.spark.ui.PagedTable$class.table(PagedTable.scala:96) at org.apache.spark.ui.jobs.TaskPagedTable.table(StagePage.scala:708) at org.apache.spark.ui.jobs.StagePage.liftedTree1$1(StagePage.scala:293) at org.apache.spark.ui.jobs.StagePage.render(StagePage.scala:282) at org.apache.spark.ui.WebUI$$anonfun$2.apply(WebUI.scala:82) at org.apache.spark.ui.WebUI$$anonfun$2.apply(WebUI.scala:82) at org.apache.spark.ui.JettyUtils$$anon$3.doGet(JettyUtils.scala:90) at javax.servlet.http.HttpServlet.service(HttpServlet.java:687) at javax.servlet.http.HttpServlet.service(HttpServlet.java:790) at org.spark_project.jetty.servlet.ServletHolder.handle(ServletHolder.java:848) at org.spark_project.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:584) ``` Moreover some refactoring to avoid similar problems by introducing constants for each header name and reusing them at the identification of the corresponding sorting index. ## How was this patch tested? Manually: ![screen shot 2018-02-13 at 18 57 10](https://user-images.githubusercontent.com/2017933/36166532-1cfdf3b8-10f3-11e8-8d32-5fcaad2af214.png) (cherry picked from commit 1dc2c1d5e85c5f404f470aeb44c1f3c22786bdea) Author: “attilapiros” Closes #20623 from squito/fix_backport. 
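A reduced sketch of the fix (standalone names mirroring the constants in the diff below; the index strings "exe" and "hst" come from `TaskIndexNames`): the table headers and the sort-index lookup now share the same constants, so sorting by "Host" or "Executor ID" resolves instead of falling through to the error branch.

```scala
// Header names shared between the rendered table and the sort-index lookup.
val HEADER_EXECUTOR = "Executor ID"
val HEADER_HOST = "Host"

// Before the fix the map only contained a combined "Executor ID / Host" key,
// so neither real column name could be resolved.
val columnToIndex: Map[String, String] = Map(
  HEADER_EXECUTOR -> "exe",   // TaskIndexNames.EXECUTOR
  HEADER_HOST -> "hst"        // TaskIndexNames.HOST, newly indexed on TaskDataWrapper
)

def indexName(sortColumn: String): Option[String] =
  columnToIndex.get(sortColumn) match {
    case Some(v) => Option(v)
    case None => throw new IllegalArgumentException(s"Invalid sort column: $sortColumn")
  }

// indexName("Host") now returns Some("hst") instead of throwing.
```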
--- .../org/apache/spark/status/storeTypes.scala | 2 + .../org/apache/spark/ui/jobs/StagePage.scala | 121 +++++++++++------- .../org/apache/spark/ui/StagePageSuite.scala | 63 ++++++++- 3 files changed, 139 insertions(+), 47 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/status/storeTypes.scala b/core/src/main/scala/org/apache/spark/status/storeTypes.scala index 412644d3657b5..646cf25880e37 100644 --- a/core/src/main/scala/org/apache/spark/status/storeTypes.scala +++ b/core/src/main/scala/org/apache/spark/status/storeTypes.scala @@ -109,6 +109,7 @@ private[spark] object TaskIndexNames { final val DURATION = "dur" final val ERROR = "err" final val EXECUTOR = "exe" + final val HOST = "hst" final val EXEC_CPU_TIME = "ect" final val EXEC_RUN_TIME = "ert" final val GC_TIME = "gc" @@ -165,6 +166,7 @@ private[spark] class TaskDataWrapper( val duration: Long, @KVIndexParam(value = TaskIndexNames.EXECUTOR, parent = TaskIndexNames.STAGE) val executorId: String, + @KVIndexParam(value = TaskIndexNames.HOST, parent = TaskIndexNames.STAGE) val host: String, @KVIndexParam(value = TaskIndexNames.STATUS, parent = TaskIndexNames.STAGE) val status: String, diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index 8af2537aba7be..a4710f6eac06b 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -742,37 +742,39 @@ private[ui] class TaskPagedTable( } def headers: Seq[Node] = { + import ApiHelper._ + val taskHeadersAndCssClasses: Seq[(String, String)] = Seq( - ("Index", ""), ("ID", ""), ("Attempt", ""), ("Status", ""), ("Locality Level", ""), - ("Executor ID", ""), ("Host", ""), ("Launch Time", ""), ("Duration", ""), - ("Scheduler Delay", TaskDetailsClassNames.SCHEDULER_DELAY), - ("Task Deserialization Time", TaskDetailsClassNames.TASK_DESERIALIZATION_TIME), - ("GC Time", ""), - ("Result Serialization Time", TaskDetailsClassNames.RESULT_SERIALIZATION_TIME), - ("Getting Result Time", TaskDetailsClassNames.GETTING_RESULT_TIME), - ("Peak Execution Memory", TaskDetailsClassNames.PEAK_EXECUTION_MEMORY)) ++ - {if (hasAccumulators(stage)) Seq(("Accumulators", "")) else Nil} ++ - {if (hasInput(stage)) Seq(("Input Size / Records", "")) else Nil} ++ - {if (hasOutput(stage)) Seq(("Output Size / Records", "")) else Nil} ++ + (HEADER_TASK_INDEX, ""), (HEADER_ID, ""), (HEADER_ATTEMPT, ""), (HEADER_STATUS, ""), + (HEADER_LOCALITY, ""), (HEADER_EXECUTOR, ""), (HEADER_HOST, ""), (HEADER_LAUNCH_TIME, ""), + (HEADER_DURATION, ""), (HEADER_SCHEDULER_DELAY, TaskDetailsClassNames.SCHEDULER_DELAY), + (HEADER_DESER_TIME, TaskDetailsClassNames.TASK_DESERIALIZATION_TIME), + (HEADER_GC_TIME, ""), + (HEADER_SER_TIME, TaskDetailsClassNames.RESULT_SERIALIZATION_TIME), + (HEADER_GETTING_RESULT_TIME, TaskDetailsClassNames.GETTING_RESULT_TIME), + (HEADER_PEAK_MEM, TaskDetailsClassNames.PEAK_EXECUTION_MEMORY)) ++ + {if (hasAccumulators(stage)) Seq((HEADER_ACCUMULATORS, "")) else Nil} ++ + {if (hasInput(stage)) Seq((HEADER_INPUT_SIZE, "")) else Nil} ++ + {if (hasOutput(stage)) Seq((HEADER_OUTPUT_SIZE, "")) else Nil} ++ {if (hasShuffleRead(stage)) { - Seq(("Shuffle Read Blocked Time", TaskDetailsClassNames.SHUFFLE_READ_BLOCKED_TIME), - ("Shuffle Read Size / Records", ""), - ("Shuffle Remote Reads", TaskDetailsClassNames.SHUFFLE_READ_REMOTE_SIZE)) + Seq((HEADER_SHUFFLE_READ_TIME, TaskDetailsClassNames.SHUFFLE_READ_BLOCKED_TIME), + (HEADER_SHUFFLE_TOTAL_READS, 
""), + (HEADER_SHUFFLE_REMOTE_READS, TaskDetailsClassNames.SHUFFLE_READ_REMOTE_SIZE)) } else { Nil }} ++ {if (hasShuffleWrite(stage)) { - Seq(("Write Time", ""), ("Shuffle Write Size / Records", "")) + Seq((HEADER_SHUFFLE_WRITE_TIME, ""), (HEADER_SHUFFLE_WRITE_SIZE, "")) } else { Nil }} ++ {if (hasBytesSpilled(stage)) { - Seq(("Shuffle Spill (Memory)", ""), ("Shuffle Spill (Disk)", "")) + Seq((HEADER_MEM_SPILL, ""), (HEADER_DISK_SPILL, "")) } else { Nil }} ++ - Seq(("Errors", "")) + Seq((HEADER_ERROR, "")) if (!taskHeadersAndCssClasses.map(_._1).contains(sortColumn)) { throw new IllegalArgumentException(s"Unknown column: $sortColumn") @@ -953,35 +955,62 @@ private[ui] class TaskPagedTable( } } -private object ApiHelper { - - - private val COLUMN_TO_INDEX = Map( - "ID" -> null.asInstanceOf[String], - "Index" -> TaskIndexNames.TASK_INDEX, - "Attempt" -> TaskIndexNames.ATTEMPT, - "Status" -> TaskIndexNames.STATUS, - "Locality Level" -> TaskIndexNames.LOCALITY, - "Executor ID / Host" -> TaskIndexNames.EXECUTOR, - "Launch Time" -> TaskIndexNames.LAUNCH_TIME, - "Duration" -> TaskIndexNames.DURATION, - "Scheduler Delay" -> TaskIndexNames.SCHEDULER_DELAY, - "Task Deserialization Time" -> TaskIndexNames.DESER_TIME, - "GC Time" -> TaskIndexNames.GC_TIME, - "Result Serialization Time" -> TaskIndexNames.SER_TIME, - "Getting Result Time" -> TaskIndexNames.GETTING_RESULT_TIME, - "Peak Execution Memory" -> TaskIndexNames.PEAK_MEM, - "Accumulators" -> TaskIndexNames.ACCUMULATORS, - "Input Size / Records" -> TaskIndexNames.INPUT_SIZE, - "Output Size / Records" -> TaskIndexNames.OUTPUT_SIZE, - "Shuffle Read Blocked Time" -> TaskIndexNames.SHUFFLE_READ_TIME, - "Shuffle Read Size / Records" -> TaskIndexNames.SHUFFLE_TOTAL_READS, - "Shuffle Remote Reads" -> TaskIndexNames.SHUFFLE_REMOTE_READS, - "Write Time" -> TaskIndexNames.SHUFFLE_WRITE_TIME, - "Shuffle Write Size / Records" -> TaskIndexNames.SHUFFLE_WRITE_SIZE, - "Shuffle Spill (Memory)" -> TaskIndexNames.MEM_SPILL, - "Shuffle Spill (Disk)" -> TaskIndexNames.DISK_SPILL, - "Errors" -> TaskIndexNames.ERROR) +private[ui] object ApiHelper { + + val HEADER_ID = "ID" + val HEADER_TASK_INDEX = "Index" + val HEADER_ATTEMPT = "Attempt" + val HEADER_STATUS = "Status" + val HEADER_LOCALITY = "Locality Level" + val HEADER_EXECUTOR = "Executor ID" + val HEADER_HOST = "Host" + val HEADER_LAUNCH_TIME = "Launch Time" + val HEADER_DURATION = "Duration" + val HEADER_SCHEDULER_DELAY = "Scheduler Delay" + val HEADER_DESER_TIME = "Task Deserialization Time" + val HEADER_GC_TIME = "GC Time" + val HEADER_SER_TIME = "Result Serialization Time" + val HEADER_GETTING_RESULT_TIME = "Getting Result Time" + val HEADER_PEAK_MEM = "Peak Execution Memory" + val HEADER_ACCUMULATORS = "Accumulators" + val HEADER_INPUT_SIZE = "Input Size / Records" + val HEADER_OUTPUT_SIZE = "Output Size / Records" + val HEADER_SHUFFLE_READ_TIME = "Shuffle Read Blocked Time" + val HEADER_SHUFFLE_TOTAL_READS = "Shuffle Read Size / Records" + val HEADER_SHUFFLE_REMOTE_READS = "Shuffle Remote Reads" + val HEADER_SHUFFLE_WRITE_TIME = "Write Time" + val HEADER_SHUFFLE_WRITE_SIZE = "Shuffle Write Size / Records" + val HEADER_MEM_SPILL = "Shuffle Spill (Memory)" + val HEADER_DISK_SPILL = "Shuffle Spill (Disk)" + val HEADER_ERROR = "Errors" + + private[ui] val COLUMN_TO_INDEX = Map( + HEADER_ID -> null.asInstanceOf[String], + HEADER_TASK_INDEX -> TaskIndexNames.TASK_INDEX, + HEADER_ATTEMPT -> TaskIndexNames.ATTEMPT, + HEADER_STATUS -> TaskIndexNames.STATUS, + HEADER_LOCALITY -> TaskIndexNames.LOCALITY, + 
HEADER_EXECUTOR -> TaskIndexNames.EXECUTOR, + HEADER_HOST -> TaskIndexNames.HOST, + HEADER_LAUNCH_TIME -> TaskIndexNames.LAUNCH_TIME, + HEADER_DURATION -> TaskIndexNames.DURATION, + HEADER_SCHEDULER_DELAY -> TaskIndexNames.SCHEDULER_DELAY, + HEADER_DESER_TIME -> TaskIndexNames.DESER_TIME, + HEADER_GC_TIME -> TaskIndexNames.GC_TIME, + HEADER_SER_TIME -> TaskIndexNames.SER_TIME, + HEADER_GETTING_RESULT_TIME -> TaskIndexNames.GETTING_RESULT_TIME, + HEADER_PEAK_MEM -> TaskIndexNames.PEAK_MEM, + HEADER_ACCUMULATORS -> TaskIndexNames.ACCUMULATORS, + HEADER_INPUT_SIZE -> TaskIndexNames.INPUT_SIZE, + HEADER_OUTPUT_SIZE -> TaskIndexNames.OUTPUT_SIZE, + HEADER_SHUFFLE_READ_TIME -> TaskIndexNames.SHUFFLE_READ_TIME, + HEADER_SHUFFLE_TOTAL_READS -> TaskIndexNames.SHUFFLE_TOTAL_READS, + HEADER_SHUFFLE_REMOTE_READS -> TaskIndexNames.SHUFFLE_REMOTE_READS, + HEADER_SHUFFLE_WRITE_TIME -> TaskIndexNames.SHUFFLE_WRITE_TIME, + HEADER_SHUFFLE_WRITE_SIZE -> TaskIndexNames.SHUFFLE_WRITE_SIZE, + HEADER_MEM_SPILL -> TaskIndexNames.MEM_SPILL, + HEADER_DISK_SPILL -> TaskIndexNames.DISK_SPILL, + HEADER_ERROR -> TaskIndexNames.ERROR) def hasAccumulators(stageData: StageData): Boolean = { stageData.accumulatorUpdates.exists { acc => acc.name != null && acc.value != null } diff --git a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala index 0aeddf730cd35..6044563f7dde7 100644 --- a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala @@ -28,13 +28,74 @@ import org.apache.spark._ import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler._ import org.apache.spark.status.AppStatusStore +import org.apache.spark.status.api.v1.{AccumulableInfo => UIAccumulableInfo, StageData, StageStatus} import org.apache.spark.status.config._ -import org.apache.spark.ui.jobs.{StagePage, StagesTab} +import org.apache.spark.ui.jobs.{ApiHelper, StagePage, StagesTab, TaskPagedTable} class StagePageSuite extends SparkFunSuite with LocalSparkContext { private val peakExecutionMemory = 10 + test("ApiHelper.COLUMN_TO_INDEX should match headers of the task table") { + val conf = new SparkConf(false).set(LIVE_ENTITY_UPDATE_PERIOD, 0L) + val statusStore = AppStatusStore.createLiveStore(conf) + try { + val stageData = new StageData( + status = StageStatus.ACTIVE, + stageId = 1, + attemptId = 1, + numTasks = 1, + numActiveTasks = 1, + numCompleteTasks = 1, + numFailedTasks = 1, + numKilledTasks = 1, + numCompletedIndices = 1, + + executorRunTime = 1L, + executorCpuTime = 1L, + submissionTime = None, + firstTaskLaunchedTime = None, + completionTime = None, + failureReason = None, + + inputBytes = 1L, + inputRecords = 1L, + outputBytes = 1L, + outputRecords = 1L, + shuffleReadBytes = 1L, + shuffleReadRecords = 1L, + shuffleWriteBytes = 1L, + shuffleWriteRecords = 1L, + memoryBytesSpilled = 1L, + diskBytesSpilled = 1L, + + name = "stage1", + description = Some("description"), + details = "detail", + schedulingPool = "pool1", + + rddIds = Seq(1), + accumulatorUpdates = Seq(new UIAccumulableInfo(0L, "acc", None, "value")), + tasks = None, + executorSummary = None, + killedTasksSummary = Map.empty + ) + val taskTable = new TaskPagedTable( + stageData, + basePath = "/a/b/c", + currentTime = 0, + pageSize = 10, + sortColumn = "Index", + desc = false, + store = statusStore + ) + val columnNames = (taskTable.headers \ "th" \ "a").map(_.child(1).text).toSet + assert(columnNames === 
ApiHelper.COLUMN_TO_INDEX.keySet) + } finally { + statusStore.close() + } + } + test("peak execution memory should displayed") { val html = renderStagePage().toString().toLowerCase(Locale.ROOT) val targetString = "peak execution memory" From ccb0a59d7383db451b86aee67423eb6e28f1f982 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Fri, 16 Feb 2018 09:41:17 -0800 Subject: [PATCH 303/566] [SPARK-23446][PYTHON] Explicitly check supported types in toPandas ## What changes were proposed in this pull request? This PR explicitly specifies and checks the types we supported in `toPandas`. This was a hole. For example, we haven't finished the binary type support in Python side yet but now it allows as below: ```python spark.conf.set("spark.sql.execution.arrow.enabled", "false") df = spark.createDataFrame([[bytearray("a")]]) df.toPandas() spark.conf.set("spark.sql.execution.arrow.enabled", "true") df.toPandas() ``` ``` _1 0 [97] _1 0 a ``` This should be disallowed. I think the same things also apply to nested timestamps too. I also added some nicer message about `spark.sql.execution.arrow.enabled` in the error message. ## How was this patch tested? Manually tested and tests added in `python/pyspark/sql/tests.py`. Author: hyukjinkwon Closes #20625 from HyukjinKwon/pandas_convertion_supported_type. (cherry picked from commit c5857e496ff0d170ed0339f14afc7d36b192da6d) Signed-off-by: gatorsmile --- python/pyspark/sql/dataframe.py | 15 +++++++++------ python/pyspark/sql/tests.py | 9 ++++++++- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index faee870a2d2e2..930d177fa5b8f 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1943,10 +1943,11 @@ def toPandas(self): if self.sql_ctx.getConf("spark.sql.execution.arrow.enabled", "false").lower() == "true": try: from pyspark.sql.types import _check_dataframe_convert_date, \ - _check_dataframe_localize_timestamps + _check_dataframe_localize_timestamps, to_arrow_schema from pyspark.sql.utils import require_minimum_pyarrow_version - import pyarrow require_minimum_pyarrow_version() + import pyarrow + to_arrow_schema(self.schema) tables = self._collectAsArrow() if tables: table = pyarrow.concat_tables(tables) @@ -1955,10 +1956,12 @@ def toPandas(self): return _check_dataframe_localize_timestamps(pdf, timezone) else: return pd.DataFrame.from_records([], columns=self.columns) - except ImportError as e: - msg = "note: pyarrow must be installed and available on calling Python process " \ - "if using spark.sql.execution.arrow.enabled=true" - raise ImportError("%s\n%s" % (_exception_message(e), msg)) + except Exception as e: + msg = ( + "Note: toPandas attempted Arrow optimization because " + "'spark.sql.execution.arrow.enabled' is set to true. 
Please set it to false " + "to disable this.") + raise RuntimeError("%s\n%s" % (_exception_message(e), msg)) else: pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 904fa7abd7d5c..da50b4d9ffa0f 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3443,7 +3443,14 @@ def test_unsupported_datatype(self): schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)]) df = self.spark.createDataFrame([(None,)], schema=schema) with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 'Unsupported data type'): + with self.assertRaisesRegexp(Exception, 'Unsupported type'): + df.toPandas() + + df = self.spark.createDataFrame([(None,)], schema="a binary") + with QuietTest(self.sc): + with self.assertRaisesRegexp( + Exception, + 'Unsupported type.*\nNote: toPandas attempted Arrow optimization because'): df.toPandas() def test_null_conversion(self): From 8360da07110d847a01b243e6d786922a5057ad9f Mon Sep 17 00:00:00 2001 From: Shintaro Murakami Date: Fri, 16 Feb 2018 17:17:55 -0800 Subject: [PATCH 304/566] [SPARK-23381][CORE] Murmur3 hash generates a different value from other implementations ## What changes were proposed in this pull request? Murmur3 hash generates a different value from the original and other implementations (like Scala standard library and Guava or so) when the length of a bytes array is not multiple of 4. ## How was this patch tested? Added a unit test. **Note: When we merge this PR, please give all the credits to Shintaro Murakami.** Author: Shintaro Murakami Author: gatorsmile Author: Shintaro Murakami Closes #20630 from gatorsmile/pr-20568. (cherry picked from commit d5ed2108d32e1d95b26ee7fed39e8a733e935e2c) Signed-off-by: gatorsmile --- .../spark/util/sketch/Murmur3_x86_32.java | 16 +++++++++ .../spark/unsafe/hash/Murmur3_x86_32.java | 16 +++++++++ .../unsafe/hash/Murmur3_x86_32Suite.java | 19 +++++++++++ .../spark/ml/feature/FeatureHasher.scala | 33 ++++++++++++++++++- .../spark/mllib/feature/HashingTF.scala | 2 +- .../spark/ml/feature/FeatureHasherSuite.scala | 11 ++++++- python/pyspark/ml/feature.py | 4 +-- 7 files changed, 96 insertions(+), 5 deletions(-) diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java index a61ce4fb7241d..e83b331391e39 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java @@ -60,6 +60,8 @@ public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, i } public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) { + // This is not compatible with original and another implementations. + // But remain it for backward compatibility for the components existing before 2.3. assert (lengthInBytes >= 0): "lengthInBytes cannot be negative"; int lengthAligned = lengthInBytes - lengthInBytes % 4; int h1 = hashBytesByInt(base, offset, lengthAligned, seed); @@ -71,6 +73,20 @@ public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, i return fmix(h1, lengthInBytes); } + public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed) { + // This is compatible with original and another implementations. + // Use this method for new components after Spark 2.3. 
+ assert (lengthInBytes >= 0): "lengthInBytes cannot be negative"; + int lengthAligned = lengthInBytes - lengthInBytes % 4; + int h1 = hashBytesByInt(base, offset, lengthAligned, seed); + int k1 = 0; + for (int i = lengthAligned, shift = 0; i < lengthInBytes; i++, shift += 8) { + k1 ^= (Platform.getByte(base, offset + i) & 0xFF) << shift; + } + h1 ^= mixK1(k1); + return fmix(h1, lengthInBytes); + } + private static int hashBytesByInt(Object base, long offset, int lengthInBytes, int seed) { assert (lengthInBytes % 4 == 0); int h1 = seed; diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java index 5e7ee480cafd1..d239de6083ad0 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java @@ -60,6 +60,8 @@ public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, i } public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) { + // This is not compatible with original and another implementations. + // But remain it for backward compatibility for the components existing before 2.3. assert (lengthInBytes >= 0): "lengthInBytes cannot be negative"; int lengthAligned = lengthInBytes - lengthInBytes % 4; int h1 = hashBytesByInt(base, offset, lengthAligned, seed); @@ -71,6 +73,20 @@ public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, i return fmix(h1, lengthInBytes); } + public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed) { + // This is compatible with original and another implementations. + // Use this method for new components after Spark 2.3. 
+ assert (lengthInBytes >= 0): "lengthInBytes cannot be negative"; + int lengthAligned = lengthInBytes - lengthInBytes % 4; + int h1 = hashBytesByInt(base, offset, lengthAligned, seed); + int k1 = 0; + for (int i = lengthAligned, shift = 0; i < lengthInBytes; i++, shift += 8) { + k1 ^= (Platform.getByte(base, offset + i) & 0xFF) << shift; + } + h1 ^= mixK1(k1); + return fmix(h1, lengthInBytes); + } + private static int hashBytesByInt(Object base, long offset, int lengthInBytes, int seed) { assert (lengthInBytes % 4 == 0); int h1 = seed; diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/hash/Murmur3_x86_32Suite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/hash/Murmur3_x86_32Suite.java index e759cb33b3e6a..6348a73bf3895 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/hash/Murmur3_x86_32Suite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/hash/Murmur3_x86_32Suite.java @@ -22,6 +22,8 @@ import java.util.Random; import java.util.Set; +import scala.util.hashing.MurmurHash3$; + import org.apache.spark.unsafe.Platform; import org.junit.Assert; import org.junit.Test; @@ -51,6 +53,23 @@ public void testKnownLongInputs() { Assert.assertEquals(-2106506049, hasher.hashLong(Long.MAX_VALUE)); } + // SPARK-23381 Check whether the hash of the byte array is the same as another implementations + @Test + public void testKnownBytesInputs() { + byte[] test = "test".getBytes(StandardCharsets.UTF_8); + Assert.assertEquals(MurmurHash3$.MODULE$.bytesHash(test, 0), + Murmur3_x86_32.hashUnsafeBytes2(test, Platform.BYTE_ARRAY_OFFSET, test.length, 0)); + byte[] test1 = "test1".getBytes(StandardCharsets.UTF_8); + Assert.assertEquals(MurmurHash3$.MODULE$.bytesHash(test1, 0), + Murmur3_x86_32.hashUnsafeBytes2(test1, Platform.BYTE_ARRAY_OFFSET, test1.length, 0)); + byte[] te = "te".getBytes(StandardCharsets.UTF_8); + Assert.assertEquals(MurmurHash3$.MODULE$.bytesHash(te, 0), + Murmur3_x86_32.hashUnsafeBytes2(te, Platform.BYTE_ARRAY_OFFSET, te.length, 0)); + byte[] tes = "tes".getBytes(StandardCharsets.UTF_8); + Assert.assertEquals(MurmurHash3$.MODULE$.bytesHash(tes, 0), + Murmur3_x86_32.hashUnsafeBytes2(tes, Platform.BYTE_ARRAY_OFFSET, tes.length, 0)); + } + @Test public void randomizedStressTest() { int size = 65536; diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala index a918dd4c075da..c78f61ac3ef71 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala @@ -17,6 +17,7 @@ package org.apache.spark.ml.feature +import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup @@ -28,6 +29,8 @@ import org.apache.spark.mllib.feature.{HashingTF => OldHashingTF} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.hash.Murmur3_x86_32.{hashInt, hashLong, hashUnsafeBytes2} +import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils import org.apache.spark.util.collection.OpenHashMap @@ -138,7 +141,7 @@ class FeatureHasher(@Since("2.3.0") override val uid: String) extends Transforme @Since("2.3.0") override def transform(dataset: Dataset[_]): DataFrame = { - val hashFunc: Any => Int = OldHashingTF.murmur3Hash + val 
hashFunc: Any => Int = FeatureHasher.murmur3Hash val n = $(numFeatures) val localInputCols = $(inputCols) val catCols = if (isSet(categoricalCols)) { @@ -218,4 +221,32 @@ object FeatureHasher extends DefaultParamsReadable[FeatureHasher] { @Since("2.3.0") override def load(path: String): FeatureHasher = super.load(path) + + private val seed = OldHashingTF.seed + + /** + * Calculate a hash code value for the term object using + * Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32). + * This is the default hash algorithm used from Spark 2.0 onwards. + * Use hashUnsafeBytes2 to match the original algorithm with the value. + * See SPARK-23381. + */ + @Since("2.3.0") + private[feature] def murmur3Hash(term: Any): Int = { + term match { + case null => seed + case b: Boolean => hashInt(if (b) 1 else 0, seed) + case b: Byte => hashInt(b, seed) + case s: Short => hashInt(s, seed) + case i: Int => hashInt(i, seed) + case l: Long => hashLong(l, seed) + case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed) + case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed) + case s: String => + val utf8 = UTF8String.fromString(s) + hashUnsafeBytes2(utf8.getBaseObject, utf8.getBaseOffset, utf8.numBytes(), seed) + case _ => throw new SparkException("FeatureHasher with murmur3 algorithm does not " + + s"support type ${term.getClass.getCanonicalName} of input data.") + } + } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala index 9abdd44a635d1..8935c8496cdbb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala @@ -135,7 +135,7 @@ object HashingTF { private[HashingTF] val Murmur3: String = "murmur3" - private val seed = 42 + private[spark] val seed = 42 /** * Calculate a hash code value for the term object using the native Scala implementation. 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala index 3fc3cbb62d5b5..7bc1825b69c43 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala @@ -27,6 +27,7 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.functions.col import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils class FeatureHasherSuite extends SparkFunSuite with MLlibTestSparkContext @@ -34,7 +35,7 @@ class FeatureHasherSuite extends SparkFunSuite import testImplicits._ - import HashingTFSuite.murmur3FeatureIdx + import FeatureHasherSuite.murmur3FeatureIdx implicit private val vectorEncoder = ExpressionEncoder[Vector]() @@ -216,3 +217,11 @@ class FeatureHasherSuite extends SparkFunSuite testDefaultReadWrite(t) } } + +object FeatureHasherSuite { + + private[feature] def murmur3FeatureIdx(numFeatures: Int)(term: Any): Int = { + Utils.nonNegativeMod(FeatureHasher.murmur3Hash(term), numFeatures) + } + +} diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index da85ba761a145..04b07e6a05481 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -741,9 +741,9 @@ class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, >>> df = spark.createDataFrame(data, cols) >>> hasher = FeatureHasher(inputCols=cols, outputCol="features") >>> hasher.transform(df).head().features - SparseVector(262144, {51871: 1.0, 63643: 1.0, 174475: 2.0, 253195: 1.0}) + SparseVector(262144, {174475: 2.0, 247670: 1.0, 257907: 1.0, 262126: 1.0}) >>> hasher.setCategoricalCols(["real"]).transform(df).head().features - SparseVector(262144, {51871: 1.0, 63643: 1.0, 171257: 1.0, 253195: 1.0}) + SparseVector(262144, {171257: 1.0, 247670: 1.0, 257907: 1.0, 262126: 1.0}) >>> hasherPath = temp_path + "/hasher" >>> hasher.save(hasherPath) >>> loadedHasher = FeatureHasher.load(hasherPath) From 44095cb65500739695b0324c177c19dfa1471472 Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Fri, 16 Feb 2018 17:29:46 -0800 Subject: [PATCH 305/566] Preparing Spark release v2.3.0-rc4 --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 
2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 29a8a0062873f..6d46c31906260 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.1 +Version: 2.3.0 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index 5c5a8e9b8232e..2ca9ab66462a8 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 2a625da682cdc..404c744f98261 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index adb1890439094..3c0b5288862fa 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 4cdcfa2a4da99..fe3bcfd68f988 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 494137f8e9ec1..90ca4016ecc79 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 32bebf5eeca99..8bb2c38f7d097 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 8dc4ddeea3ec0..0d5d4b1661012 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 5613f5e81384c..d6136e6c29a84 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index c34c9df39063b..364aaab74e67b 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index 213579037f51a..aabf3151890f9 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.1-SNAPSHOT -SPARK_VERSION_SHORT: 2.3.1 +SPARK_VERSION: 2.3.0 +SPARK_VERSION_SHORT: 2.3.0 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index e584feed1b337..28de7a180c2e7 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 6c4031b392f65..9b74a4d327136 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 6935ade33546a..1da013353775c 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 60aa5fb1a1031..da8ce5928b8bb 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index d8ed3364c4bd4..65a9f9972d398 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 15d550d3bb445..f5a2dbbef3b6f 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 7ee7156efe22c..e9b8e4e18d6df 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 1f28798fd245a..46a5bd1611d30 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index ad3cd0059e183..72446ecb5cd82 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 53d903fc58aee..e49c8281c6ddb 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 4206a1ac4ad99..7ad92b07094f2 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 7e838812bdc2e..126624ecd8922 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index ee7eb98e3dfc8..a8bcff8cdc14c 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 2ea2e4a7adbdb..6e60fbac9ef5a 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 35ddfac1d6624..7e09e304cb1b7 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 29c1e53f480a3..d8c1792f5b0dd 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 61eb9dbcf6f7a..d9380ed2c6d4d 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index edba0a72ee6a4..9cb3ed2e7ae1d 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/pom.xml b/pom.xml index 9aa531ec65746..e31b5e692e641 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index b09fa9822184c..8065b058f8c0e 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.1.dev0" +__version__ = "2.3.0" diff --git a/repl/pom.xml b/repl/pom.xml index 3b445bfaf773d..c21a1cd108a8a 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index ae1c69f305589..544e08b333aaf 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index ea31b9c121433..3dc6955ce312c 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index d42bfff851572..7e1397ef9ba79 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 865d3b3f6ad4e..936267ce560e6 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index f39bffe417666..77d4cf7ec8459 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 6e04f528a1c2f..7ade122666a83 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index da50c9bd60fa6..da34cb8253272 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 6e8e034fc4de4..adbafac9bc458 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 9523085cf5513..4e005623e2b26 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml From c7a0dea46a251a27b304ac2ec9f07f97aca4b1d0 Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Fri, 16 Feb 2018 17:29:51 -0800 Subject: [PATCH 306/566] Preparing development version 2.3.1-SNAPSHOT --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- 
external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 6d46c31906260..29a8a0062873f 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.0 +Version: 2.3.1 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index 2ca9ab66462a8..5c5a8e9b8232e 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 404c744f98261..2a625da682cdc 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 3c0b5288862fa..adb1890439094 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index fe3bcfd68f988..4cdcfa2a4da99 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 90ca4016ecc79..494137f8e9ec1 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 8bb2c38f7d097..32bebf5eeca99 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 0d5d4b1661012..8dc4ddeea3ec0 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index d6136e6c29a84..5613f5e81384c 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 364aaab74e67b..c34c9df39063b 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index aabf3151890f9..213579037f51a 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.0 -SPARK_VERSION_SHORT: 2.3.0 +SPARK_VERSION: 2.3.1-SNAPSHOT +SPARK_VERSION_SHORT: 2.3.1 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 28de7a180c2e7..e584feed1b337 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 9b74a4d327136..6c4031b392f65 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 1da013353775c..6935ade33546a 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index da8ce5928b8bb..60aa5fb1a1031 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 65a9f9972d398..d8ed3364c4bd4 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index f5a2dbbef3b6f..15d550d3bb445 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index e9b8e4e18d6df..7ee7156efe22c 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 46a5bd1611d30..1f28798fd245a 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 72446ecb5cd82..ad3cd0059e183 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index e49c8281c6ddb..53d903fc58aee 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 7ad92b07094f2..4206a1ac4ad99 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 126624ecd8922..7e838812bdc2e 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index a8bcff8cdc14c..ee7eb98e3dfc8 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 6e60fbac9ef5a..2ea2e4a7adbdb 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 7e09e304cb1b7..35ddfac1d6624 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index d8c1792f5b0dd..29c1e53f480a3 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index d9380ed2c6d4d..61eb9dbcf6f7a 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 9cb3ed2e7ae1d..edba0a72ee6a4 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index e31b5e692e641..9aa531ec65746 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 8065b058f8c0e..b09fa9822184c 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.0" +__version__ = "2.3.1.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index c21a1cd108a8a..3b445bfaf773d 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 544e08b333aaf..ae1c69f305589 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 3dc6955ce312c..ea31b9c121433 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 7e1397ef9ba79..d42bfff851572 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 936267ce560e6..865d3b3f6ad4e 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 77d4cf7ec8459..f39bffe417666 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 7ade122666a83..6e04f528a1c2f 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index da34cb8253272..da50c9bd60fa6 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index adbafac9bc458..6e8e034fc4de4 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 4e005623e2b26..9523085cf5513 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml From a1ee6f1fc543120763f1b373bb31bc6d84004318 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 20 Feb 2018 17:54:06 -0800 Subject: [PATCH 307/566] [SPARK-23470][UI] Use first attempt of last stage to define job description. This is much faster than finding out what the last attempt is, and the data should be the same. There's room for improvement in this page (like only loading data for the jobs being shown, instead of loading all available jobs and sorting them), but this should bring performance on par with the 2.2 version. Author: Marcelo Vanzin Closes #20644 from vanzin/SPARK-23470. 
(cherry picked from commit 2ba77ed9e51922303e3c3533e368b95788bd7de5) Signed-off-by: Sameer Agarwal --- core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index a4710f6eac06b..08a927a8b4885 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -1040,7 +1040,7 @@ private[ui] object ApiHelper { } def lastStageNameAndDescription(store: AppStatusStore, job: JobData): (String, String) = { - val stage = store.asOption(store.lastStageAttempt(job.stageIds.max)) + val stage = store.asOption(store.stageAttempt(job.stageIds.max, 0)) (stage.map(_.name).getOrElse(""), stage.flatMap(_.description).getOrElse(job.name)) } From 1d78f03ae3037f1ddbe6533a6733b7805a6705bf Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 20 Feb 2018 18:06:21 -0800 Subject: [PATCH 308/566] [SPARK-23468][CORE] Stringify auth secret before storing it in credentials. The secret is used as a string in many parts of the code, so it has to be turned into a hex string to avoid issues such as the random byte sequence not containing a valid UTF8 sequence. Author: Marcelo Vanzin Closes #20643 from vanzin/SPARK-23468. (cherry picked from commit 6d398c05cbad69aa9093429e04ae44c73b81cd5a) Signed-off-by: Marcelo Vanzin --- core/src/main/scala/org/apache/spark/SecurityManager.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SecurityManager.scala b/core/src/main/scala/org/apache/spark/SecurityManager.scala index 4c1dbe3ffb4ad..5b15a1c57779d 100644 --- a/core/src/main/scala/org/apache/spark/SecurityManager.scala +++ b/core/src/main/scala/org/apache/spark/SecurityManager.scala @@ -541,7 +541,8 @@ private[spark] class SecurityManager( rnd.nextBytes(secretBytes) val creds = new Credentials() - creds.addSecretKey(SECRET_LOOKUP_KEY, secretBytes) + val secretStr = HashCodes.fromBytes(secretBytes).toString() + creds.addSecretKey(SECRET_LOOKUP_KEY, secretStr.getBytes(UTF_8)) UserGroupInformation.getCurrentUser().addCredentials(creds) } From 3e7269eb904b591883300d7433e5c99be0b3b5b3 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Tue, 20 Feb 2018 18:16:10 -0800 Subject: [PATCH 309/566] [SPARK-23454][SS][DOCS] Added trigger information to the Structured Streaming programming guide ## What changes were proposed in this pull request? - Added clear information about triggers - Made the semantics guarantees of watermarks more clear for streaming aggregations and stream-stream joins. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Tathagata Das Closes #20631 from tdas/SPARK-23454. 
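To make the watermark guarantee concrete, here is a minimal sketch (illustrative only; it assumes a streaming DataFrame `events` with columns `eventTime: Timestamp` and `word: String`, and is not text taken from the guide itself):

```
import org.apache.spark.sql.functions.{col, window}

// With a 2-hour watermark, any row that arrives less than 2 hours behind the
// latest event time seen so far is guaranteed to be aggregated; rows later
// than that may or may not be dropped.
val windowedCounts = events
  .withWatermark("eventTime", "2 hours")
  .groupBy(window(col("eventTime"), "10 minutes", "5 minutes"), col("word"))
  .count()
```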
(cherry picked from commit 601d653bff9160db8477f86d961e609fc2190237) Signed-off-by: Tathagata Das --- .../structured-streaming-programming-guide.md | 214 +++++++++++++++++- 1 file changed, 207 insertions(+), 7 deletions(-) diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 48d6d0b542cc0..9a83f157452ad 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -904,7 +904,7 @@ windowedCounts <- count(
    -### Handling Late Data and Watermarking +#### Handling Late Data and Watermarking Now consider what happens if one of the events arrives late to the application. For example, say, a word generated at 12:04 (i.e. event time) could be received by the application at 12:11. The application should use the time 12:04 instead of 12:11 @@ -925,7 +925,9 @@ specifying the event time column and the threshold on how late the data is expec event time. For a specific window starting at time `T`, the engine will maintain state and allow late data to update the state until `(max event time seen by the engine - late threshold > T)`. In other words, late data within the threshold will be aggregated, -but data later than the threshold will be dropped. Let's understand this with an example. We can +but data later than the threshold will start getting dropped +(see [later]((#semantic-guarantees-of-aggregation-with-watermarking)) +in the section for the exact guarantees). Let's understand this with an example. We can easily define watermarking on the previous example using `withWatermark()` as shown below.
    @@ -1031,7 +1033,9 @@ then drops intermediate state of a window < watermark, and appends the final counts to the Result Table/sink. For example, the final counts of window `12:00 - 12:10` is appended to the Result Table only after the watermark is updated to `12:11`. -**Conditions for watermarking to clean aggregation state** +##### Conditions for watermarking to clean aggregation state +{:.no_toc} + It is important to note that the following conditions must be satisfied for the watermarking to clean the state in aggregation queries *(as of Spark 2.1.1, subject to change in the future)*. @@ -1051,6 +1055,16 @@ from the aggregation column. For example, `df.groupBy("time").count().withWatermark("time", "1 min")` is invalid in Append output mode. +##### Semantic Guarantees of Aggregation with Watermarking +{:.no_toc} + +- A watermark delay (set with `withWatermark`) of "2 hours" guarantees that the engine will never +drop any data that is less than 2 hours delayed. In other words, any data less than 2 hours behind +(in terms of event-time) the latest data processed till then is guaranteed to be aggregated. + +- However, the guarantee is strict only in one direction. Data delayed by more than 2 hours is +not guaranteed to be dropped; it may or may not get aggregated. More delayed is the data, less +likely is the engine going to process it. ### Join Operations Structured Streaming supports joining a streaming Dataset/DataFrame with a static Dataset/DataFrame @@ -1062,7 +1076,7 @@ Dataset/DataFrame will be the exactly the same as if it was with a static Datase containing the same data in the stream. -#### Stream-static joins +#### Stream-static Joins Since the introduction in Spark 2.0, Structured Streaming has supported joins (inner join and some type of outer joins) between a streaming and a static DataFrame/Dataset. Here is a simple example. @@ -1269,6 +1283,12 @@ joined <- join(
+###### Semantic Guarantees of Stream-stream Inner Joins with Watermarking +{:.no_toc} +This is similar to the [guarantees provided by watermarking on aggregations](#semantic-guarantees-of-aggregation-with-watermarking). +A watermark delay of "2 hours" guarantees that the engine will never drop any data that is less than + 2 hours delayed. But data delayed by more than 2 hours may or may not get processed. + ##### Outer Joins with Watermarking While the watermark + event-time constraints is optional for inner joins, for left and right outer joins they must be specified. This is because for generating the NULL results in outer join, the @@ -1347,7 +1367,14 @@ joined <- join(
-There are a few points to note regarding outer joins. +###### Semantic Guarantees of Stream-stream Outer Joins with Watermarking +{:.no_toc} +Outer joins have the same guarantees as [inner joins](#semantic-guarantees-of-stream-stream-inner-joins-with-watermarking) +regarding watermark delays and whether data will be dropped or not. + +###### Caveats +{:.no_toc} +There are a few important characteristics to note regarding how the outer results are generated. - *The outer NULL results will be generated with a delay that depends on the specified watermark delay and the time range condition.* This is because the engine has to wait for that long to ensure @@ -1962,7 +1989,7 @@ head(sql("select * from aggregates")) -#### Using Foreach +##### Using Foreach The `foreach` operation allows arbitrary operations to be computed on the output data. As of Spark 2.1, this is available only for Scala and Java. To use this, you will have to implement the interface `ForeachWriter` ([Scala](api/scala/index.html#org.apache.spark.sql.ForeachWriter)/[Java](api/java/org/apache/spark/sql/ForeachWriter.html) docs), which has methods that get called whenever there is a sequence of rows generated as output after a trigger. Note the following important points. @@ -1979,6 +2006,172 @@ which has methods that get called whenever there is a sequence of rows generated - Whenever `open` is called, `close` will also be called (unless the JVM exits due to some error). This is true even if `open` returns false. If there is any error in processing and writing the data, `close` will be called with the error. It is your responsibility to clean up state (e.g. connections, transactions, etc.) that have been created in `open` such that there are no resource leaks. +#### Triggers +The trigger settings of a streaming query defines the timing of streaming data processing, whether +the query is going to executed as micro-batch query with a fixed batch interval or as a continuous processing query. +Here are the different kinds of triggers that are supported. + + + + + + + + + + + + + + + + + + + + + + +
Trigger TypeDescription
unspecified (default) + If no trigger setting is explicitly specified, then by default, the query will be + executed in micro-batch mode, where micro-batches will be generated as soon as + the previous micro-batch has completed processing. +
Fixed interval micro-batches + The query will be executed in micro-batch mode, where micro-batches will be kicked off + at the user-specified intervals. +
    +
  • If the previous micro-batch completes within the interval, then the engine will wait until + the interval is over before kicking off the next micro-batch.
  • If the previous micro-batch takes longer than the interval to complete (i.e. if an + interval boundary is missed), then the next micro-batch will start as soon as the + previous one completes (i.e., it will not wait for the next interval boundary).
  • If no new data is available, then no micro-batch will be kicked off.
+
One-time micro-batch + The query will execute *only one* micro-batch to process all the available data and then + stop on its own. This is useful in scenarios where you want to periodically spin up a cluster, + process everything that is available since the last period, and then shut down the + cluster. In some cases, this may lead to significant cost savings. +
Continuous with fixed checkpoint interval
(experimental)
+ The query will be executed in the new low-latency, continuous processing mode. Read more + about this in the Continuous Processing section below. +
+ +Here are a few code examples. + +
+
+ +{% highlight scala %} +import org.apache.spark.sql.streaming.Trigger + +// Default trigger (runs micro-batch as soon as it can) +df.writeStream + .format("console") + .start() + +// ProcessingTime trigger with two-seconds micro-batch interval +df.writeStream + .format("console") + .trigger(Trigger.ProcessingTime("2 seconds")) + .start() + +// One-time trigger +df.writeStream + .format("console") + .trigger(Trigger.Once()) + .start() + +// Continuous trigger with one-second checkpointing interval +df.writeStream + .format("console") + .trigger(Trigger.Continuous("1 second")) + .start() + +{% endhighlight %} + + +
+
+ +{% highlight java %} +import org.apache.spark.sql.streaming.Trigger; + +// Default trigger (runs micro-batch as soon as it can) +df.writeStream() + .format("console") + .start(); + +// ProcessingTime trigger with two-seconds micro-batch interval +df.writeStream() + .format("console") + .trigger(Trigger.ProcessingTime("2 seconds")) + .start(); + +// One-time trigger +df.writeStream() + .format("console") + .trigger(Trigger.Once()) + .start(); + +// Continuous trigger with one-second checkpointing interval +df.writeStream() + .format("console") + .trigger(Trigger.Continuous("1 second")) + .start(); + +{% endhighlight %} + +
+
+ +{% highlight python %} + +# Default trigger (runs micro-batch as soon as it can) +df.writeStream \ + .format("console") \ + .start() + +# ProcessingTime trigger with two-seconds micro-batch interval +df.writeStream \ + .format("console") \ + .trigger(processingTime='2 seconds') \ + .start() + +# One-time trigger +df.writeStream \ + .format("console") \ + .trigger(once=True) \ + .start() + +# Continuous trigger with one-second checkpointing interval +df.writeStream \ + .format("console") \ + .trigger(continuous='1 second') \ + .start() + +{% endhighlight %} +
+
+ +{% highlight r %} +# Default trigger (runs micro-batch as soon as it can) +write.stream(df, "console") + +# ProcessingTime trigger with two-seconds micro-batch interval +write.stream(df, "console", trigger.processingTime = "2 seconds") + +# One-time trigger +write.stream(df, "console", trigger.once = TRUE) + +# Continuous trigger is not yet supported +{% endhighlight %} +
+
+ + ## Managing Streaming Queries The `StreamingQuery` object created when a query is started can be used to monitor and manage the query. @@ -2516,7 +2709,10 @@ write.stream(aggDF, "memory", outputMode = "complete", checkpointLocation = "pat -# Continuous Processing [Experimental] +# Continuous Processing +## [Experimental] +{:.no_toc} + **Continuous processing** is a new, experimental streaming execution mode introduced in Spark 2.3 that enables low (~1 ms) end-to-end latency with at-least-once fault-tolerance guarantees. Compare this with the default *micro-batch processing* engine which can achieve exactly-once guarantees but achieve latencies of ~100ms at best. For some types of queries (discussed below), you can choose which mode to execute them in without modifying the application logic (i.e. without changing the DataFrame/Dataset operations). To run a supported query in continuous processing mode, all you need to do is specify a **continuous trigger** with the desired checkpoint interval as a parameter. For example, @@ -2589,6 +2785,8 @@ spark \ A checkpoint interval of 1 second means that the continuous processing engine will records the progress of the query every second. The resulting checkpoints are in a format compatible with the micro-batch engine, hence any query can be restarted with any trigger. For example, a supported query started with the micro-batch mode can be restarted in continuous mode, and vice versa. Note that any time you switch to continuous mode, you will get at-least-once fault-tolerance guarantees. ## Supported Queries +{:.no_toc} + As of Spark 2.3, only the following type of queries are supported in the continuous processing mode. - *Operations*: Only map-like Dataset/DataFrame operations are supported in continuous mode, that is, only projections (`select`, `map`, `flatMap`, `mapPartitions`, etc.) and selections (`where`, `filter`, etc.). @@ -2606,6 +2804,8 @@ As of Spark 2.3, only the following type of queries are supported in the continu See [Input Sources](#input-sources) and [Output Sinks](#output-sinks) sections for more details on them. While the console sink is good for testing, the end-to-end low-latency processing can be best observed with Kafka as the source and sink, as this allows the engine to process the data and make the results available in the output topic within milliseconds of the input data being available in the input topic. ## Caveats +{:.no_toc} + - Continuous processing engine launches multiple long-running tasks that continuously read data from sources, process it and continuously write to sinks. The number of tasks required by the query depends on how many partitions the query can read from the sources in parallel. Therefore, before starting a continuous processing query, you must ensure there are enough cores in the cluster to all the tasks in parallel. For example, if you are reading from a Kafka topic that has 10 partitions, then the cluster must have at least 10 cores for the query to make progress. - Stopping a continuous processing stream may produce spurious task termination warnings. These can be safely ignored. - There are currently no automatic retries of failed tasks. Any failure will lead to the query being stopped and it needs to be manually restarted from the checkpoint. 
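As a concrete end-to-end illustration of the above, a continuous-mode query wired to Kafka on both sides might look like the following Scala sketch (the usual `spark` session is assumed; the broker address, topic names and checkpoint path are placeholders):

{% highlight scala %}
import org.apache.spark.sql.streaming.Trigger

val query = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("subscribe", "input-topic")
  .load()
  .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")  // map-like operations only
  .writeStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("topic", "output-topic")
  .option("checkpointLocation", "/tmp/continuous-demo-checkpoint")
  .trigger(Trigger.Continuous("1 second"))  // record progress every second
  .start()
{% endhighlight %}

As noted under the supported queries above, only map-like operations such as the `selectExpr` here can run in this mode.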
From 373ac642fc145527bdfccae046d0e98f105ee7b3 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Wed, 21 Feb 2018 14:56:13 -0800 Subject: [PATCH 310/566] [SPARK-23484][SS] Fix possible race condition in KafkaContinuousReader ## What changes were proposed in this pull request? var `KafkaContinuousReader.knownPartitions` should be threadsafe as it is accessed from multiple threads - the query thread at the time of reader factory creation, and the epoch tracking thread at the time of `needsReconfiguration`. ## How was this patch tested? Existing tests. Author: Tathagata Das Closes #20655 from tdas/SPARK-23484. (cherry picked from commit 3fd0ccb13fea44727d970479af1682ef00592147) Signed-off-by: Tathagata Das --- .../org/apache/spark/sql/kafka010/KafkaContinuousReader.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala index b049a054cb40e..a269a50d81c38 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala @@ -66,7 +66,7 @@ class KafkaContinuousReader( // Initialized when creating reader factories. If this diverges from the partitions at the latest // offsets, we need to reconfigure. // Exposed outside this object only for unit tests. - private[sql] var knownPartitions: Set[TopicPartition] = _ + @volatile private[sql] var knownPartitions: Set[TopicPartition] = _ override def readSchema: StructType = KafkaOffsetReader.kafkaSchema From 23ba4416e1bbbaa818876d7a837f7a5e260aa048 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Wed, 21 Feb 2018 15:37:28 -0800 Subject: [PATCH 311/566] [SPARK-23481][WEBUI] lastStageAttempt should fail when a stage doesn't exist ## What changes were proposed in this pull request? The issue here is `AppStatusStore.lastStageAttempt` will return the next available stage in the store when a stage doesn't exist. This PR adds `last(stageId)` to ensure it returns a correct `StageData` ## How was this patch tested? The new unit test. Author: Shixiong Zhu Closes #20654 from zsxwing/SPARK-23481. 
(cherry picked from commit 744d5af652ee8cece361cbca31e5201134e0fb42) Signed-off-by: Shixiong Zhu --- .../apache/spark/status/AppStatusStore.scala | 6 +++- .../spark/status/AppStatusListenerSuite.scala | 33 +++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala index efc28538a33db..688f25a9fdea1 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala @@ -95,7 +95,11 @@ private[spark] class AppStatusStore( } def lastStageAttempt(stageId: Int): v1.StageData = { - val it = store.view(classOf[StageDataWrapper]).index("stageId").reverse().first(stageId) + val it = store.view(classOf[StageDataWrapper]) + .index("stageId") + .reverse() + .first(stageId) + .last(stageId) .closeableIterator() try { if (it.hasNext()) { diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index 01d76a2cfb2a8..f3fa4c9619e44 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -1057,6 +1057,39 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } } + test("lastStageAttempt should fail when the stage doesn't exist") { + val testConf = conf.clone().set(MAX_RETAINED_STAGES, 1) + val listener = new AppStatusListener(store, testConf, true) + val appStore = new AppStatusStore(store) + + val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1") + val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2") + val stage3 = new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3") + + time += 1 + stage1.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(stage1, new Properties())) + stage1.completionTime = Some(time) + listener.onStageCompleted(SparkListenerStageCompleted(stage1)) + + // Make stage 3 complete before stage 2 so that stage 3 will be evicted + time += 1 + stage3.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(stage3, new Properties())) + stage3.completionTime = Some(time) + listener.onStageCompleted(SparkListenerStageCompleted(stage3)) + + time += 1 + stage2.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(stage2, new Properties())) + stage2.completionTime = Some(time) + listener.onStageCompleted(SparkListenerStageCompleted(stage2)) + + assert(appStore.asOption(appStore.lastStageAttempt(1)) === None) + assert(appStore.asOption(appStore.lastStageAttempt(2)).map(_.stageId) === Some(2)) + assert(appStore.asOption(appStore.lastStageAttempt(3)) === None) + } + test("driver logs") { val listener = new AppStatusListener(store, conf, true) From a0d7949896e70f427e7f3942ff340c9484ff0aab Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Wed, 21 Feb 2018 19:43:11 -0800 Subject: [PATCH 312/566] [SPARK-23475][WEBUI] Skipped stages should be evicted before completed stages ## What changes were proposed in this pull request? The root cause of missing completed stages is because `cleanupStages` will never remove skipped stages. This PR changes the logic to always remove skipped stage first. This is safe since the job itself contains enough information to render skipped stages in the UI. ## How was this patch tested? 
The new unit tests. Author: Shixiong Zhu Closes #20656 from zsxwing/SPARK-23475. (cherry picked from commit 45cf714ee6d4eead2fe00794a0d754fa6d33d4a6) Signed-off-by: gatorsmile --- .../spark/status/AppStatusListener.scala | 5 ++- .../spark/status/AppStatusListenerSuite.scala | 36 +++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index 8ac029191b1eb..496165c9df0d9 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -863,7 +863,10 @@ private[spark] class AppStatusListener( return } - val view = kvstore.view(classOf[StageDataWrapper]).index("completionTime").first(0L) + // As the completion time of a skipped stage is always -1, we will remove skipped stages first. + // This is safe since the job itself contains enough information to render skipped stages in the + // UI. + val view = kvstore.view(classOf[StageDataWrapper]).index("completionTime") val stages = KVUtils.viewToSeq(view, countToDelete.toInt) { s => s.info.status != v1.StageStatus.ACTIVE && s.info.status != v1.StageStatus.PENDING } diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index f3fa4c9619e44..eb03ef3b3b5e3 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -1025,6 +1025,42 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } } + test("skipped stages should be evicted before completed stages") { + val testConf = conf.clone().set(MAX_RETAINED_STAGES, 2) + val listener = new AppStatusListener(store, testConf, true) + + val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1") + val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2") + + // Sart job 1 + time += 1 + listener.onJobStart(SparkListenerJobStart(1, time, Seq(stage1, stage2), null)) + + // Start and stop stage 1 + time += 1 + stage1.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(stage1, new Properties())) + + time += 1 + stage1.completionTime = Some(time) + listener.onStageCompleted(SparkListenerStageCompleted(stage1)) + + // Stop job 1 and stage 2 will become SKIPPED + time += 1 + listener.onJobEnd(SparkListenerJobEnd(1, time, JobSucceeded)) + + // Submit stage 3 and verify stage 2 is evicted + val stage3 = new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3") + time += 1 + stage3.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(stage3, new Properties())) + + assert(store.count(classOf[StageDataWrapper]) === 2) + intercept[NoSuchElementException] { + store.read(classOf[StageDataWrapper], Array(2, 0)) + } + } + test("eviction should respect task completion time") { val testConf = conf.clone().set(MAX_RETAINED_TASKS_PER_STAGE, 2) val listener = new AppStatusListener(store, testConf, true) From 992447fb30ee9ebb3cf794f2d06f4d63a2d792db Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Thu, 22 Feb 2018 09:56:57 -0800 Subject: [PATCH 313/566] Preparing Spark release v2.3.0-rc5 --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml 
| 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 29a8a0062873f..6d46c31906260 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.1 +Version: 2.3.0 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index 5c5a8e9b8232e..2ca9ab66462a8 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 2a625da682cdc..404c744f98261 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index adb1890439094..3c0b5288862fa 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 4cdcfa2a4da99..fe3bcfd68f988 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 494137f8e9ec1..90ca4016ecc79 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 32bebf5eeca99..8bb2c38f7d097 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 8dc4ddeea3ec0..0d5d4b1661012 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 5613f5e81384c..d6136e6c29a84 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark 
spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index c34c9df39063b..364aaab74e67b 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index 213579037f51a..aabf3151890f9 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 2.3.1-SNAPSHOT -SPARK_VERSION_SHORT: 2.3.1 +SPARK_VERSION: 2.3.0 +SPARK_VERSION_SHORT: 2.3.0 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index e584feed1b337..28de7a180c2e7 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 6c4031b392f65..9b74a4d327136 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 6935ade33546a..1da013353775c 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 60aa5fb1a1031..da8ce5928b8bb 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index d8ed3364c4bd4..65a9f9972d398 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 15d550d3bb445..f5a2dbbef3b6f 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 7ee7156efe22c..e9b8e4e18d6df 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 1f28798fd245a..46a5bd1611d30 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index ad3cd0059e183..72446ecb5cd82 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 53d903fc58aee..e49c8281c6ddb 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git 
a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 4206a1ac4ad99..7ad92b07094f2 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 7e838812bdc2e..126624ecd8922 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index ee7eb98e3dfc8..a8bcff8cdc14c 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 2ea2e4a7adbdb..6e60fbac9ef5a 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 35ddfac1d6624..7e09e304cb1b7 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 29c1e53f480a3..d8c1792f5b0dd 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 61eb9dbcf6f7a..d9380ed2c6d4d 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index edba0a72ee6a4..9cb3ed2e7ae1d 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/pom.xml b/pom.xml index 9aa531ec65746..e31b5e692e641 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index b09fa9822184c..8065b058f8c0e 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.1.dev0" +__version__ = "2.3.0" diff --git a/repl/pom.xml b/repl/pom.xml index 3b445bfaf773d..c21a1cd108a8a 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index ae1c69f305589..544e08b333aaf 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index ea31b9c121433..3dc6955ce312c 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index d42bfff851572..7e1397ef9ba79 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 865d3b3f6ad4e..936267ce560e6 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index f39bffe417666..77d4cf7ec8459 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 6e04f528a1c2f..7ade122666a83 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index da50c9bd60fa6..da34cb8253272 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 6e8e034fc4de4..adbafac9bc458 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 9523085cf5513..4e005623e2b26 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.0 ../pom.xml From 285b841ffbfb21c0af3f83800f7815fb0bfe3627 Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Thu, 22 Feb 2018 09:57:03 -0800 Subject: [PATCH 314/566] Preparing development version 2.3.1-SNAPSHOT --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- 
external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 6d46c31906260..29a8a0062873f 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.0 +Version: 2.3.1 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index 2ca9ab66462a8..5c5a8e9b8232e 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 404c744f98261..2a625da682cdc 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 3c0b5288862fa..adb1890439094 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index fe3bcfd68f988..4cdcfa2a4da99 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 90ca4016ecc79..494137f8e9ec1 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 8bb2c38f7d097..32bebf5eeca99 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 0d5d4b1661012..8dc4ddeea3ec0 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index d6136e6c29a84..5613f5e81384c 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 364aaab74e67b..c34c9df39063b 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index aabf3151890f9..213579037f51a 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.0 -SPARK_VERSION_SHORT: 2.3.0 +SPARK_VERSION: 2.3.1-SNAPSHOT +SPARK_VERSION_SHORT: 2.3.1 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 28de7a180c2e7..e584feed1b337 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 9b74a4d327136..6c4031b392f65 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 1da013353775c..6935ade33546a 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index da8ce5928b8bb..60aa5fb1a1031 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 65a9f9972d398..d8ed3364c4bd4 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index f5a2dbbef3b6f..15d550d3bb445 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index e9b8e4e18d6df..7ee7156efe22c 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 46a5bd1611d30..1f28798fd245a 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 72446ecb5cd82..ad3cd0059e183 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index e49c8281c6ddb..53d903fc58aee 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 7ad92b07094f2..4206a1ac4ad99 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 126624ecd8922..7e838812bdc2e 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index a8bcff8cdc14c..ee7eb98e3dfc8 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 6e60fbac9ef5a..2ea2e4a7adbdb 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 7e09e304cb1b7..35ddfac1d6624 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index d8c1792f5b0dd..29c1e53f480a3 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index d9380ed2c6d4d..61eb9dbcf6f7a 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 9cb3ed2e7ae1d..edba0a72ee6a4 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index e31b5e692e641..9aa531ec65746 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 8065b058f8c0e..b09fa9822184c 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.0" +__version__ = "2.3.1.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index c21a1cd108a8a..3b445bfaf773d 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 544e08b333aaf..ae1c69f305589 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 3dc6955ce312c..ea31b9c121433 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 7e1397ef9ba79..d42bfff851572 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 936267ce560e6..865d3b3f6ad4e 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 77d4cf7ec8459..f39bffe417666 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 7ade122666a83..6e04f528a1c2f 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index da34cb8253272..da50c9bd60fa6 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index adbafac9bc458..6e8e034fc4de4 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 4e005623e2b26..9523085cf5513 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0 + 2.3.1-SNAPSHOT ../pom.xml From 578607b30a65d101380acbe9c9740dc267a5d55c Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 23 Feb 2018 18:27:33 -0800 Subject: [PATCH 315/566] [SPARK-23475][UI][BACKPORT-2.3] Show also skipped stages ## What changes were proposed in this pull request? SPARK-20648 introduced the status `SKIPPED` for the stages. On the UI, previously, skipped stages were shown as `PENDING`; after this change, they are not shown on the UI. The PR introduce a new section in order to show also `SKIPPED` stages in a proper table. Manual backport from to branch-2.3. ## How was this patch tested? added UT Author: Marco Gaido Closes #20662 from mgaido91/SPARK-23475_2.3. 
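For context, the SKIPPED status is easy to reproduce: re-running an action over a cached, shuffled RDD lets the second job reuse the existing shuffle output, so its map stage is reported as skipped. This is the same scenario the new UI test drives; a rough local sketch:

    import org.apache.spark.{SparkConf, SparkContext}

    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("skipped-stage-demo"))
    val rdd = sc.parallelize(0 to 100, 100).repartition(10).cache()
    rdd.count()  // first job runs the shuffle map stage and the result stage
    rdd.count()  // second job reuses the shuffle output; its map stage shows up as SKIPPED
    sc.stop()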
--- .../apache/spark/ui/jobs/AllStagesPage.scala | 27 +++++++++++++++++++ .../org/apache/spark/ui/UISeleniumSuite.scala | 17 ++++++++++++ 2 files changed, 44 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala index b1e343451e28e..9325b903b33a1 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala @@ -36,6 +36,7 @@ private[ui] class AllStagesPage(parent: StagesTab) extends WebUIPage("") { val activeStages = allStages.filter(_.status == StageStatus.ACTIVE) val pendingStages = allStages.filter(_.status == StageStatus.PENDING) + val skippedStages = allStages.filter(_.status == StageStatus.SKIPPED) val completedStages = allStages.filter(_.status == StageStatus.COMPLETE) val failedStages = allStages.filter(_.status == StageStatus.FAILED).reverse @@ -51,6 +52,9 @@ private[ui] class AllStagesPage(parent: StagesTab) extends WebUIPage("") { val completedStagesTable = new StageTableBase(parent.store, request, completedStages, "completed", "completedStage", parent.basePath, subPath, parent.isFairScheduler, false, false) + val skippedStagesTable = + new StageTableBase(parent.store, request, skippedStages, "skipped", "skippedStage", + parent.basePath, subPath, parent.isFairScheduler, false, false) val failedStagesTable = new StageTableBase(parent.store, request, failedStages, "failed", "failedStage", parent.basePath, subPath, parent.isFairScheduler, false, true) @@ -66,6 +70,7 @@ private[ui] class AllStagesPage(parent: StagesTab) extends WebUIPage("") { val shouldShowActiveStages = activeStages.nonEmpty val shouldShowPendingStages = pendingStages.nonEmpty val shouldShowCompletedStages = completedStages.nonEmpty + val shouldShowSkippedStages = skippedStages.nonEmpty val shouldShowFailedStages = failedStages.nonEmpty val appSummary = parent.store.appSummary() @@ -102,6 +107,14 @@ private[ui] class AllStagesPage(parent: StagesTab) extends WebUIPage("") { } } + { + if (shouldShowSkippedStages) { +
  • + Skipped Stages: + {skippedStages.size} +
  • + } + } { if (shouldShowFailedStages) {
  • @@ -133,6 +146,20 @@ private[ui] class AllStagesPage(parent: StagesTab) extends WebUIPage("") { content ++=

    Completed Stages ({completedStageNumStr})

    ++ completedStagesTable.toNodeSeq } + if (shouldShowSkippedStages) { + content ++= + +

    + + Skipped Stages ({skippedStages.size}) +

    +
    ++ +
    + {skippedStagesTable.toNodeSeq} +
    + } if (shouldShowFailedStages) { content ++=

    Failed Stages ({numFailedStages})

    ++ failedStagesTable.toNodeSeq diff --git a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala index 326546787ab6c..0f20eea735044 100644 --- a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala @@ -706,6 +706,23 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B } } + test("stages page should show skipped stages") { + withSpark(newSparkContext()) { sc => + val rdd = sc.parallelize(0 to 100, 100).repartition(10).cache() + rdd.count() + rdd.count() + + eventually(timeout(5 seconds), interval(50 milliseconds)) { + goToUi(sc, "/stages") + find(id("skipped")).get.text should be("Skipped Stages (1)") + } + val stagesJson = getJson(sc.ui.get, "stages") + stagesJson.children.size should be (4) + val stagesStatus = stagesJson.children.map(_ \ "status") + stagesStatus.count(_ == JString(StageStatus.SKIPPED.name())) should be (1) + } + } + def goToUi(sc: SparkContext, path: String): Unit = { goToUi(sc.ui.get, path) } From 1f180cd121b13ecd455bee55ed2224936a2f3b2a Mon Sep 17 00:00:00 2001 From: Gabor Somogyi Date: Mon, 26 Feb 2018 08:39:44 -0800 Subject: [PATCH 316/566] [SPARK-23438][DSTREAMS] Fix DStreams data loss with WAL when driver crashes There is a race condition introduced in SPARK-11141 which could cause data loss. The problem is that ReceivedBlockTracker.insertAllocatedBatch function assumes that all blocks from streamIdToUnallocatedBlockQueues allocated to the batch and clears the queue. In this PR only the allocated blocks will be removed from the queue which will prevent data loss. Additional unit test + manually. Author: Gabor Somogyi Closes #20620 from gaborgsomogyi/SPARK-23438. (cherry picked from commit b308182f233b8840dfe0e6b5736d2f2746f40757) Signed-off-by: Marcelo Vanzin --- .../scheduler/ReceivedBlockTracker.scala | 11 +++++---- .../streaming/ReceivedBlockTrackerSuite.scala | 23 ++++++++++++++++++- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala index 5d9a8ac0d9297..dacff69d55dd2 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala @@ -193,12 +193,15 @@ private[streaming] class ReceivedBlockTracker( getReceivedBlockQueue(receivedBlockInfo.streamId) += receivedBlockInfo } - // Insert the recovered block-to-batch allocations and clear the queue of received blocks - // (when the blocks were originally allocated to the batch, the queue must have been cleared). + // Insert the recovered block-to-batch allocations and removes them from queue of + // received blocks. 
def insertAllocatedBatch(batchTime: Time, allocatedBlocks: AllocatedBlocks) { logTrace(s"Recovery: Inserting allocated batch for time $batchTime to " + s"${allocatedBlocks.streamIdToAllocatedBlocks}") - streamIdToUnallocatedBlockQueues.values.foreach { _.clear() } + allocatedBlocks.streamIdToAllocatedBlocks.foreach { + case (streamId, allocatedBlocksInStream) => + getReceivedBlockQueue(streamId).dequeueAll(allocatedBlocksInStream.toSet) + } timeToAllocatedBlocks.put(batchTime, allocatedBlocks) lastAllocatedBatchTime = batchTime } @@ -227,7 +230,7 @@ private[streaming] class ReceivedBlockTracker( } /** Write an update to the tracker to the write ahead log */ - private def writeToLog(record: ReceivedBlockTrackerLogEvent): Boolean = { + private[streaming] def writeToLog(record: ReceivedBlockTrackerLogEvent): Boolean = { if (isWriteAheadLogEnabled) { logTrace(s"Writing record: $record") try { diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala index 107c3f5dcc08d..4fa236bd39663 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala @@ -33,7 +33,7 @@ import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} import org.apache.spark.internal.Logging import org.apache.spark.storage.StreamBlockId import org.apache.spark.streaming.receiver.BlockManagerBasedStoreResult -import org.apache.spark.streaming.scheduler._ +import org.apache.spark.streaming.scheduler.{AllocatedBlocks, _} import org.apache.spark.streaming.util._ import org.apache.spark.streaming.util.WriteAheadLogSuite._ import org.apache.spark.util.{Clock, ManualClock, SystemClock, Utils} @@ -94,6 +94,27 @@ class ReceivedBlockTrackerSuite receivedBlockTracker.getUnallocatedBlocks(streamId) shouldEqual blockInfos } + test("recovery with write ahead logs should remove only allocated blocks from received queue") { + val manualClock = new ManualClock + val batchTime = manualClock.getTimeMillis() + + val tracker1 = createTracker(clock = manualClock) + tracker1.isWriteAheadLogEnabled should be (true) + + val allocatedBlockInfos = generateBlockInfos() + val unallocatedBlockInfos = generateBlockInfos() + val receivedBlockInfos = allocatedBlockInfos ++ unallocatedBlockInfos + receivedBlockInfos.foreach { b => tracker1.writeToLog(BlockAdditionEvent(b)) } + val allocatedBlocks = AllocatedBlocks(Map(streamId -> allocatedBlockInfos)) + tracker1.writeToLog(BatchAllocationEvent(batchTime, allocatedBlocks)) + tracker1.stop() + + val tracker2 = createTracker(clock = manualClock, recoverFromWriteAheadLog = true) + tracker2.getBlocksOfBatch(batchTime) shouldEqual allocatedBlocks.streamIdToAllocatedBlocks + tracker2.getUnallocatedBlocks(streamId) shouldEqual unallocatedBlockInfos + tracker2.stop() + } + test("recovery and cleanup with write ahead logs") { val manualClock = new ManualClock // Set the time increment level to twice the rotation interval so that every increment creates From 6eee545f9ad4552057bb51daa866d68b08f27c0f Mon Sep 17 00:00:00 2001 From: Andrew Korzhuev Date: Mon, 26 Feb 2018 10:28:45 -0800 Subject: [PATCH 317/566] [SPARK-23449][K8S] Preserve extraJavaOptions ordering For some JVM options, like `-XX:+UnlockExperimentalVMOptions` ordering is necessary. ## What changes were proposed in this pull request? 
Keep original `extraJavaOptions` ordering, when passing them through environment variables inside the Docker container. ## How was this patch tested? Ran base branch a couple of times and checked startup command in logs. Ordering differed every time. Added sorting, ordering was consistent to what user had in `extraJavaOptions`. Author: Andrew Korzhuev Closes #20628 from andrusha/patch-2. (cherry picked from commit 185f5bc7dd52cebe8fac9393ecb2bd0968bc5867) Signed-off-by: Marcelo Vanzin --- .../kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh index b9090dc2852a5..3d67b0a702dd4 100755 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh @@ -41,7 +41,7 @@ fi shift 1 SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" -env | grep SPARK_JAVA_OPT_ | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt +env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt readarray -t SPARK_JAVA_OPTS < /tmp/java_opts.txt if [ -n "$SPARK_MOUNTED_CLASSPATH" ]; then SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_MOUNTED_CLASSPATH" From 30242b664a05120c01b54ba9ac0ebed114f0d54e Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Tue, 27 Feb 2018 11:12:32 -0800 Subject: [PATCH 318/566] [SPARK-23365][CORE] Do not adjust num executors when killing idle executors. The ExecutorAllocationManager should not adjust the target number of executors when killing idle executors, as it has already adjusted the target number down based on the task backlog. The name `replace` was misleading with DynamicAllocation on, as the target number of executors is changed outside of the call to `killExecutors`, so I adjusted that name. Also separated out the logic of `countFailures` as you don't always want that tied to `replace`. While I was there I made two changes that weren't directly related to this: 1) Fixed `countFailures` in a couple cases where it was getting an incorrect value since it used to be tied to `replace`, eg. when killing executors on a blacklisted node. 2) hard error if you call `sc.killExecutors` with dynamic allocation on, since that's another way the ExecutorAllocationManager and the CoarseGrainedSchedulerBackend would get out of sync. Added a unit test case which verifies that the calls to ExecutorAllocationClient do not adjust the number of executors. Author: Imran Rashid Closes #20604 from squito/SPARK-23365. 
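Sketched against the new `killExecutors` signature, the idle-timeout path now reads roughly as below. `removeIdleExecutors` is a hypothetical helper for illustration, and `ExecutorAllocationClient` is `private[spark]`, so this mirrors code inside Spark rather than something an application can call:

    import org.apache.spark.ExecutorAllocationClient

    // Killing idle executors must not lower the target a second time: the target was
    // already reduced when the task backlog shrank, and an idle executor runs no tasks.
    def removeIdleExecutors(client: ExecutorAllocationClient, idle: Seq[String]): Seq[String] = {
      client.killExecutors(
        idle,
        adjustTargetNumExecutors = false,  // leave the target alone
        countFailures = false,             // nothing running, so nothing to count as failed
        force = false)
    }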
(cherry picked from commit ecb8b383af1cf1b67f3111c148229e00c9c17c40) Signed-off-by: Marcelo Vanzin --- .../spark/ExecutorAllocationClient.scala | 15 +++-- .../spark/ExecutorAllocationManager.scala | 20 ++++-- .../scala/org/apache/spark/SparkContext.scala | 13 +++- .../spark/scheduler/BlacklistTracker.scala | 3 +- .../CoarseGrainedSchedulerBackend.scala | 22 ++++--- .../ExecutorAllocationManagerSuite.scala | 66 ++++++++++++++++++- .../StandaloneDynamicAllocationSuite.scala | 3 +- .../scheduler/BlacklistTrackerSuite.scala | 14 ++-- 8 files changed, 121 insertions(+), 35 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala index 9112d93a86b2a..63d87b4cd385c 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala @@ -55,18 +55,18 @@ private[spark] trait ExecutorAllocationClient { /** * Request that the cluster manager kill the specified executors. * - * When asking the executor to be replaced, the executor loss is considered a failure, and - * killed tasks that are running on the executor will count towards the failure limits. If no - * replacement is being requested, then the tasks will not count towards the limit. - * * @param executorIds identifiers of executors to kill - * @param replace whether to replace the killed executors with new ones, default false + * @param adjustTargetNumExecutors whether the target number of executors will be adjusted down + * after these executors have been killed + * @param countFailures if there are tasks running on the executors when they are killed, whether + * to count those failures toward task failure limits * @param force whether to force kill busy executors, default false * @return the ids of the executors acknowledged by the cluster manager to be removed. */ def killExecutors( executorIds: Seq[String], - replace: Boolean = false, + adjustTargetNumExecutors: Boolean, + countFailures: Boolean, force: Boolean = false): Seq[String] /** @@ -81,7 +81,8 @@ private[spark] trait ExecutorAllocationClient { * @return whether the request is acknowledged by the cluster manager. 
*/ def killExecutor(executorId: String): Boolean = { - val killedExecutors = killExecutors(Seq(executorId)) + val killedExecutors = killExecutors(Seq(executorId), adjustTargetNumExecutors = true, + countFailures = false) killedExecutors.nonEmpty && killedExecutors(0).equals(executorId) } } diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 6c59038f2a6c1..189d91333c045 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -29,6 +29,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.internal.config.{DYN_ALLOCATION_MAX_EXECUTORS, DYN_ALLOCATION_MIN_EXECUTORS} import org.apache.spark.metrics.source.Source import org.apache.spark.scheduler._ +import org.apache.spark.storage.BlockManagerMaster import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils} /** @@ -81,7 +82,8 @@ import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils} private[spark] class ExecutorAllocationManager( client: ExecutorAllocationClient, listenerBus: LiveListenerBus, - conf: SparkConf) + conf: SparkConf, + blockManagerMaster: BlockManagerMaster) extends Logging { allocationManager => @@ -151,7 +153,7 @@ private[spark] class ExecutorAllocationManager( private var clock: Clock = new SystemClock() // Listener for Spark events that impact the allocation policy - private val listener = new ExecutorAllocationListener + val listener = new ExecutorAllocationListener // Executor that handles the scheduling task. private val executor = @@ -334,6 +336,11 @@ private[spark] class ExecutorAllocationManager( // If the new target has not changed, avoid sending a message to the cluster manager if (numExecutorsTarget < oldNumExecutorsTarget) { + // We lower the target number of executors but don't actively kill any yet. Killing is + // controlled separately by an idle timeout. It's still helpful to reduce the target number + // in case an executor just happens to get lost (eg., bad hardware, or the cluster manager + // preempts it) -- in that case, there is no point in trying to immediately get a new + // executor, since we wouldn't even use it yet. client.requestTotalExecutors(numExecutorsTarget, localityAwareTasks, hostToLocalTaskCount) logDebug(s"Lowering target number of executors to $numExecutorsTarget (previously " + s"$oldNumExecutorsTarget) because not all requested executors are actually needed") @@ -455,7 +462,10 @@ private[spark] class ExecutorAllocationManager( val executorsRemoved = if (testing) { executorIdsToBeRemoved } else { - client.killExecutors(executorIdsToBeRemoved) + // We don't want to change our target number of executors, because we already did that + // when the task backlog decreased. + client.killExecutors(executorIdsToBeRemoved, adjustTargetNumExecutors = false, + countFailures = false, force = false) } // [SPARK-21834] killExecutors api reduces the target number of executors. // So we need to update the target with desired value. @@ -575,7 +585,7 @@ private[spark] class ExecutorAllocationManager( // Note that it is not necessary to query the executors since all the cached // blocks we are concerned with are reported to the driver. Note that this // does not include broadcast blocks. 
- val hasCachedBlocks = SparkEnv.get.blockManager.master.hasCachedBlocks(executorId) + val hasCachedBlocks = blockManagerMaster.hasCachedBlocks(executorId) val now = clock.getTimeMillis() val timeout = { if (hasCachedBlocks) { @@ -610,7 +620,7 @@ private[spark] class ExecutorAllocationManager( * This class is intentionally conservative in its assumptions about the relative ordering * and consistency of events returned by the listener. */ - private class ExecutorAllocationListener extends SparkListener { + private[spark] class ExecutorAllocationListener extends SparkListener { private val stageIdToNumTasks = new mutable.HashMap[Int, Int] // Number of running tasks per stage including speculative tasks. diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 3828d4f703247..f5b560c9e345f 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -533,7 +533,8 @@ class SparkContext(config: SparkConf) extends Logging { schedulerBackend match { case b: ExecutorAllocationClient => Some(new ExecutorAllocationManager( - schedulerBackend.asInstanceOf[ExecutorAllocationClient], listenerBus, _conf)) + schedulerBackend.asInstanceOf[ExecutorAllocationClient], listenerBus, _conf, + _env.blockManager.master)) case _ => None } @@ -1632,6 +1633,8 @@ class SparkContext(config: SparkConf) extends Logging { * :: DeveloperApi :: * Request that the cluster manager kill the specified executors. * + * This is not supported when dynamic allocation is turned on. + * * @note This is an indication to the cluster manager that the application wishes to adjust * its resource usage downwards. If the application wishes to replace the executors it kills * through this method with new ones, it should follow up explicitly with a call to @@ -1643,7 +1646,10 @@ class SparkContext(config: SparkConf) extends Logging { def killExecutors(executorIds: Seq[String]): Boolean = { schedulerBackend match { case b: ExecutorAllocationClient => - b.killExecutors(executorIds, replace = false, force = true).nonEmpty + require(executorAllocationManager.isEmpty, + "killExecutors() unsupported with Dynamic Allocation turned on") + b.killExecutors(executorIds, adjustTargetNumExecutors = true, countFailures = false, + force = true).nonEmpty case _ => logWarning("Killing executors is not supported by current scheduler.") false @@ -1681,7 +1687,8 @@ class SparkContext(config: SparkConf) extends Logging { private[spark] def killAndReplaceExecutor(executorId: String): Boolean = { schedulerBackend match { case b: ExecutorAllocationClient => - b.killExecutors(Seq(executorId), replace = true, force = true).nonEmpty + b.killExecutors(Seq(executorId), adjustTargetNumExecutors = false, countFailures = true, + force = true).nonEmpty case _ => logWarning("Killing executors is not supported by current scheduler.") false diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index cd8e61d6d0208..952598f6de19d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -152,7 +152,8 @@ private[scheduler] class BlacklistTracker ( case Some(a) => logInfo(s"Killing blacklisted executor id $exec " + s"since ${config.BLACKLIST_KILL_ENABLED.key} is set.") - a.killExecutors(Seq(exec), true, true) + 
a.killExecutors(Seq(exec), adjustTargetNumExecutors = false, countFailures = false, + force = true) case None => logWarning(s"Not attempting to kill blacklisted executor id $exec " + s"since allocation client is not defined.") diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 4d75063fbf1c5..5627a557a12f3 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -147,7 +147,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp case KillExecutorsOnHost(host) => scheduler.getExecutorsAliveOnHost(host).foreach { exec => - killExecutors(exec.toSeq, replace = true, force = true) + killExecutors(exec.toSeq, adjustTargetNumExecutors = false, countFailures = false, + force = true) } case UpdateDelegationTokens(newDelegationTokens) => @@ -584,18 +585,18 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp /** * Request that the cluster manager kill the specified executors. * - * When asking the executor to be replaced, the executor loss is considered a failure, and - * killed tasks that are running on the executor will count towards the failure limits. If no - * replacement is being requested, then the tasks will not count towards the limit. - * * @param executorIds identifiers of executors to kill - * @param replace whether to replace the killed executors with new ones, default false + * @param adjustTargetNumExecutors whether the target number of executors be adjusted down + * after these executors have been killed + * @param countFailures if there are tasks running on the executors when they are killed, whether + * those failures be counted to task failure limits? * @param force whether to force kill busy executors, default false * @return the ids of the executors acknowledged by the cluster manager to be removed. */ final override def killExecutors( executorIds: Seq[String], - replace: Boolean, + adjustTargetNumExecutors: Boolean, + countFailures: Boolean, force: Boolean): Seq[String] = { logInfo(s"Requesting to kill executor(s) ${executorIds.mkString(", ")}") @@ -610,7 +611,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp val executorsToKill = knownExecutors .filter { id => !executorsPendingToRemove.contains(id) } .filter { id => force || !scheduler.isExecutorBusy(id) } - executorsToKill.foreach { id => executorsPendingToRemove(id) = !replace } + executorsToKill.foreach { id => executorsPendingToRemove(id) = !countFailures } logInfo(s"Actual list of executor(s) to be killed is ${executorsToKill.mkString(", ")}") @@ -618,12 +619,13 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // with the cluster manager to avoid allocating new ones. When computing the new target, // take into account executors that are pending to be added or removed. 
val adjustTotalExecutors = - if (!replace) { + if (adjustTargetNumExecutors) { requestedTotalExecutors = math.max(requestedTotalExecutors - executorsToKill.size, 0) if (requestedTotalExecutors != (numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size)) { logDebug( - s"""killExecutors($executorIds, $replace, $force): Executor counts do not match: + s"""killExecutors($executorIds, $adjustTargetNumExecutors, $countFailures, $force): + |Executor counts do not match: |requestedTotalExecutors = $requestedTotalExecutors |numExistingExecutors = $numExistingExecutors |numPendingExecutors = $numPendingExecutors diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala index a0cae5a9e011c..9807d1269e3d4 100644 --- a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala @@ -19,6 +19,8 @@ package org.apache.spark import scala.collection.mutable +import org.mockito.Matchers.{any, eq => meq} +import org.mockito.Mockito.{mock, never, verify, when} import org.scalatest.{BeforeAndAfter, PrivateMethodTester} import org.apache.spark.executor.TaskMetrics @@ -26,6 +28,7 @@ import org.apache.spark.scheduler._ import org.apache.spark.scheduler.ExternalClusterManager import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.scheduler.local.LocalSchedulerBackend +import org.apache.spark.storage.BlockManagerMaster import org.apache.spark.util.ManualClock /** @@ -1050,6 +1053,66 @@ class ExecutorAllocationManagerSuite assert(removeTimes(manager) === Map.empty) } + test("SPARK-23365 Don't update target num executors when killing idle executors") { + val minExecutors = 1 + val initialExecutors = 1 + val maxExecutors = 2 + val conf = new SparkConf() + .set("spark.dynamicAllocation.enabled", "true") + .set("spark.shuffle.service.enabled", "true") + .set("spark.dynamicAllocation.minExecutors", minExecutors.toString) + .set("spark.dynamicAllocation.maxExecutors", maxExecutors.toString) + .set("spark.dynamicAllocation.initialExecutors", initialExecutors.toString) + .set("spark.dynamicAllocation.schedulerBacklogTimeout", "1000ms") + .set("spark.dynamicAllocation.sustainedSchedulerBacklogTimeout", "1000ms") + .set("spark.dynamicAllocation.executorIdleTimeout", s"3000ms") + val mockAllocationClient = mock(classOf[ExecutorAllocationClient]) + val mockBMM = mock(classOf[BlockManagerMaster]) + val manager = new ExecutorAllocationManager( + mockAllocationClient, mock(classOf[LiveListenerBus]), conf, mockBMM) + val clock = new ManualClock() + manager.setClock(clock) + + when(mockAllocationClient.requestTotalExecutors(meq(2), any(), any())).thenReturn(true) + // test setup -- job with 2 tasks, scale up to two executors + assert(numExecutorsTarget(manager) === 1) + manager.listener.onExecutorAdded(SparkListenerExecutorAdded( + clock.getTimeMillis(), "executor-1", new ExecutorInfo("host1", 1, Map.empty))) + manager.listener.onStageSubmitted(SparkListenerStageSubmitted(createStageInfo(0, 2))) + clock.advance(1000) + manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.getTimeMillis()) + assert(numExecutorsTarget(manager) === 2) + val taskInfo0 = createTaskInfo(0, 0, "executor-1") + manager.listener.onTaskStart(SparkListenerTaskStart(0, 0, taskInfo0)) + manager.listener.onExecutorAdded(SparkListenerExecutorAdded( + clock.getTimeMillis(), "executor-2", new ExecutorInfo("host1", 1, 
Map.empty))) + val taskInfo1 = createTaskInfo(1, 1, "executor-2") + manager.listener.onTaskStart(SparkListenerTaskStart(0, 0, taskInfo1)) + assert(numExecutorsTarget(manager) === 2) + + // have one task finish -- we should adjust the target number of executors down + // but we should *not* kill any executors yet + manager.listener.onTaskEnd(SparkListenerTaskEnd(0, 0, null, Success, taskInfo0, null)) + assert(maxNumExecutorsNeeded(manager) === 1) + assert(numExecutorsTarget(manager) === 2) + clock.advance(1000) + manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.getTimeMillis()) + assert(numExecutorsTarget(manager) === 1) + verify(mockAllocationClient, never).killExecutors(any(), any(), any(), any()) + + // now we cross the idle timeout for executor-1, so we kill it. the really important + // thing here is that we do *not* ask the executor allocation client to adjust the target + // number of executors down + when(mockAllocationClient.killExecutors(Seq("executor-1"), false, false, false)) + .thenReturn(Seq("executor-1")) + clock.advance(3000) + schedule(manager) + assert(maxNumExecutorsNeeded(manager) === 1) + assert(numExecutorsTarget(manager) === 1) + // here's the important verify -- we did kill the executors, but did not adjust the target count + verify(mockAllocationClient).killExecutors(Seq("executor-1"), false, false, false) + } + private def createSparkContext( minExecutors: Int = 1, maxExecutors: Int = 5, @@ -1268,7 +1331,8 @@ private class DummyLocalSchedulerBackend (sc: SparkContext, sb: SchedulerBackend override def killExecutors( executorIds: Seq[String], - replace: Boolean, + adjustTargetNumExecutors: Boolean, + countFailures: Boolean, force: Boolean): Seq[String] = executorIds override def start(): Unit = sb.start() diff --git a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala index bf7480d79f8a1..155564a65c607 100644 --- a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala @@ -573,7 +573,8 @@ class StandaloneDynamicAllocationSuite syncExecutors(sc) sc.schedulerBackend match { case b: CoarseGrainedSchedulerBackend => - b.killExecutors(Seq(executorId), replace = false, force) + b.killExecutors(Seq(executorId), adjustTargetNumExecutors = true, countFailures = false, + force) case _ => fail("expected coarse grained scheduler") } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala index cd1b7a9e5ab18..88a57c599e240 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala @@ -479,7 +479,7 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M test("blacklisting kills executors, configured by BLACKLIST_KILL_ENABLED") { val allocationClientMock = mock[ExecutorAllocationClient] - when(allocationClientMock.killExecutors(any(), any(), any())).thenReturn(Seq("called")) + when(allocationClientMock.killExecutors(any(), any(), any(), any())).thenReturn(Seq("called")) when(allocationClientMock.killExecutorsOnHost("hostA")).thenAnswer(new Answer[Boolean] { // To avoid a race between blacklisting and killing, it is important that the nodeBlacklist // is updated before we ask the executor 
allocation client to kill all the executors @@ -517,7 +517,7 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M } blacklist.updateBlacklistForSuccessfulTaskSet(0, 0, taskSetBlacklist1.execToFailures) - verify(allocationClientMock, never).killExecutors(any(), any(), any()) + verify(allocationClientMock, never).killExecutors(any(), any(), any(), any()) verify(allocationClientMock, never).killExecutorsOnHost(any()) // Enable auto-kill. Blacklist an executor and make sure killExecutors is called. @@ -533,7 +533,7 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M } blacklist.updateBlacklistForSuccessfulTaskSet(0, 0, taskSetBlacklist2.execToFailures) - verify(allocationClientMock).killExecutors(Seq("1"), true, true) + verify(allocationClientMock).killExecutors(Seq("1"), false, false, true) val taskSetBlacklist3 = createTaskSetBlacklist(stageId = 1) // Fail 4 tasks in one task set on executor 2, so that executor gets blacklisted for the whole @@ -545,13 +545,13 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M } blacklist.updateBlacklistForSuccessfulTaskSet(0, 0, taskSetBlacklist3.execToFailures) - verify(allocationClientMock).killExecutors(Seq("2"), true, true) + verify(allocationClientMock).killExecutors(Seq("2"), false, false, true) verify(allocationClientMock).killExecutorsOnHost("hostA") } test("fetch failure blacklisting kills executors, configured by BLACKLIST_KILL_ENABLED") { val allocationClientMock = mock[ExecutorAllocationClient] - when(allocationClientMock.killExecutors(any(), any(), any())).thenReturn(Seq("called")) + when(allocationClientMock.killExecutors(any(), any(), any(), any())).thenReturn(Seq("called")) when(allocationClientMock.killExecutorsOnHost("hostA")).thenAnswer(new Answer[Boolean] { // To avoid a race between blacklisting and killing, it is important that the nodeBlacklist // is updated before we ask the executor allocation client to kill all the executors @@ -571,7 +571,7 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M conf.set(config.BLACKLIST_KILL_ENABLED, false) blacklist.updateBlacklistForFetchFailure("hostA", exec = "1") - verify(allocationClientMock, never).killExecutors(any(), any(), any()) + verify(allocationClientMock, never).killExecutors(any(), any(), any(), any()) verify(allocationClientMock, never).killExecutorsOnHost(any()) // Enable auto-kill. Blacklist an executor and make sure killExecutors is called. @@ -580,7 +580,7 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M clock.advance(1000) blacklist.updateBlacklistForFetchFailure("hostA", exec = "1") - verify(allocationClientMock).killExecutors(Seq("1"), true, true) + verify(allocationClientMock).killExecutors(Seq("1"), false, false, true) verify(allocationClientMock, never).killExecutorsOnHost(any()) assert(blacklist.executorIdToBlacklistStatus.contains("1")) From fe9cb4afe39944b394b39cc25622e311554ae187 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 28 Feb 2018 11:00:54 +0900 Subject: [PATCH 319/566] [SPARK-23448][SQL] Clarify JSON and CSV parser behavior in document ## What changes were proposed in this pull request? Clarify JSON and CSV reader behavior in document. JSON doesn't support partial results for corrupted records. CSV only supports partial results for the records with more or less tokens. ## How was this patch tested? Pass existing tests. Author: Liang-Chi Hsieh Closes #20666 from viirya/SPARK-23448-2. 
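As a quick illustration of the `PERMISSIVE` behaviour this documentation change describes, here is a small, hypothetical Scala example. It assumes an active `SparkSession`; the schema, column name, and file path are invented for the example and are not taken from this patch.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._

val spark = SparkSession.builder().appName("permissive-example").getOrCreate()

// User-defined schema with an explicit corrupt-record column so malformed rows are kept.
val schema = new StructType()
  .add("a", IntegerType)
  .add("b", IntegerType)
  .add("_corrupt_record", StringType)

// For a CSV line with fewer tokens than the schema (e.g. "1"), the missing field `b` is
// filled with null (a partial result). For a line whose tokens cannot be converted
// (e.g. "x,y"), CSV gives no partial result: the whole malformed string goes into
// `_corrupt_record` and the other fields are set to null. JSON behaves that way for any
// corrupted record, since it never produces partial results.
val df = spark.read
  .schema(schema)
  .option("mode", "PERMISSIVE")
  .option("columnNameOfCorruptRecord", "_corrupt_record")
  .csv("/tmp/permissive-example.csv")   // hypothetical path
```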
(cherry picked from commit b14993e1fcb68e1c946a671c6048605ab4afdf58) Signed-off-by: hyukjinkwon --- python/pyspark/sql/readwriter.py | 30 ++++++++++--------- python/pyspark/sql/streaming.py | 30 ++++++++++--------- .../sql/catalyst/json/JacksonParser.scala | 3 ++ .../apache/spark/sql/DataFrameReader.scala | 22 +++++++------- .../datasources/csv/UnivocityParser.scala | 5 ++++ .../sql/streaming/DataStreamReader.scala | 22 +++++++------- 6 files changed, 64 insertions(+), 48 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 49af1bcee5ef8..9d05ac7cb39be 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -209,13 +209,13 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None, :param mode: allows a mode for dealing with corrupt records during parsing. If None is set, it uses the default value, ``PERMISSIVE``. - * ``PERMISSIVE`` : sets other fields to ``null`` when it meets a corrupted \ - record, and puts the malformed string into a field configured by \ - ``columnNameOfCorruptRecord``. To keep corrupt records, an user can set \ - a string type field named ``columnNameOfCorruptRecord`` in an user-defined \ - schema. If a schema does not have the field, it drops corrupt records during \ - parsing. When inferring a schema, it implicitly adds a \ - ``columnNameOfCorruptRecord`` field in an output schema. + * ``PERMISSIVE`` : when it meets a corrupted record, puts the malformed string \ + into a field configured by ``columnNameOfCorruptRecord``, and sets other \ + fields to ``null``. To keep corrupt records, an user can set a string type \ + field named ``columnNameOfCorruptRecord`` in an user-defined schema. If a \ + schema does not have the field, it drops corrupt records during parsing. \ + When inferring a schema, it implicitly adds a ``columnNameOfCorruptRecord`` \ + field in an output schema. * ``DROPMALFORMED`` : ignores the whole corrupted records. * ``FAILFAST`` : throws an exception when it meets corrupted records. @@ -393,13 +393,15 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non :param mode: allows a mode for dealing with corrupt records during parsing. If None is set, it uses the default value, ``PERMISSIVE``. - * ``PERMISSIVE`` : sets other fields to ``null`` when it meets a corrupted \ - record, and puts the malformed string into a field configured by \ - ``columnNameOfCorruptRecord``. To keep corrupt records, an user can set \ - a string type field named ``columnNameOfCorruptRecord`` in an \ - user-defined schema. If a schema does not have the field, it drops corrupt \ - records during parsing. When a length of parsed CSV tokens is shorter than \ - an expected length of a schema, it sets `null` for extra fields. + * ``PERMISSIVE`` : when it meets a corrupted record, puts the malformed string \ + into a field configured by ``columnNameOfCorruptRecord``, and sets other \ + fields to ``null``. To keep corrupt records, an user can set a string type \ + field named ``columnNameOfCorruptRecord`` in an user-defined schema. If a \ + schema does not have the field, it drops corrupt records during parsing. \ + A record with less/more tokens than schema is not a corrupted record to CSV. \ + When it meets a record having fewer tokens than the length of the schema, \ + sets ``null`` to extra fields. When the record has more tokens than the \ + length of the schema, it drops extra tokens. * ``DROPMALFORMED`` : ignores the whole corrupted records. 
* ``FAILFAST`` : throws an exception when it meets corrupted records. diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index e2a97acb5e2a7..cc622decfd682 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -442,13 +442,13 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None, :param mode: allows a mode for dealing with corrupt records during parsing. If None is set, it uses the default value, ``PERMISSIVE``. - * ``PERMISSIVE`` : sets other fields to ``null`` when it meets a corrupted \ - record, and puts the malformed string into a field configured by \ - ``columnNameOfCorruptRecord``. To keep corrupt records, an user can set \ - a string type field named ``columnNameOfCorruptRecord`` in an user-defined \ - schema. If a schema does not have the field, it drops corrupt records during \ - parsing. When inferring a schema, it implicitly adds a \ - ``columnNameOfCorruptRecord`` field in an output schema. + * ``PERMISSIVE`` : when it meets a corrupted record, puts the malformed string \ + into a field configured by ``columnNameOfCorruptRecord``, and sets other \ + fields to ``null``. To keep corrupt records, an user can set a string type \ + field named ``columnNameOfCorruptRecord`` in an user-defined schema. If a \ + schema does not have the field, it drops corrupt records during parsing. \ + When inferring a schema, it implicitly adds a ``columnNameOfCorruptRecord`` \ + field in an output schema. * ``DROPMALFORMED`` : ignores the whole corrupted records. * ``FAILFAST`` : throws an exception when it meets corrupted records. @@ -621,13 +621,15 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non :param mode: allows a mode for dealing with corrupt records during parsing. If None is set, it uses the default value, ``PERMISSIVE``. - * ``PERMISSIVE`` : sets other fields to ``null`` when it meets a corrupted \ - record, and puts the malformed string into a field configured by \ - ``columnNameOfCorruptRecord``. To keep corrupt records, an user can set \ - a string type field named ``columnNameOfCorruptRecord`` in an \ - user-defined schema. If a schema does not have the field, it drops corrupt \ - records during parsing. When a length of parsed CSV tokens is shorter than \ - an expected length of a schema, it sets `null` for extra fields. + * ``PERMISSIVE`` : when it meets a corrupted record, puts the malformed string \ + into a field configured by ``columnNameOfCorruptRecord``, and sets other \ + fields to ``null``. To keep corrupt records, an user can set a string type \ + field named ``columnNameOfCorruptRecord`` in an user-defined schema. If a \ + schema does not have the field, it drops corrupt records during parsing. \ + A record with less/more tokens than schema is not a corrupted record to CSV. \ + When it meets a record having fewer tokens than the length of the schema, \ + sets ``null`` to extra fields. When the record has more tokens than the \ + length of the schema, it drops extra tokens. * ``DROPMALFORMED`` : ignores the whole corrupted records. * ``FAILFAST`` : throws an exception when it meets corrupted records. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index bd144c9575c72..7f6956994f31f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -357,6 +357,9 @@ class JacksonParser( } } catch { case e @ (_: RuntimeException | _: JsonProcessingException) => + // JSON parser currently doesn't support partial results for corrupted records. + // For such records, all fields other than the field configured by + // `columnNameOfCorruptRecord` are set to `null`. throw BadRecordException(() => recordLiteral(record), () => None, e) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index fcaf8d618c168..395e1c999f025 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -368,12 +368,12 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { *
<li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
   * during parsing.
   *   <ul>
-   *     <li>`PERMISSIVE` : sets other fields to `null` when it meets a corrupted record, and puts
-   *     the malformed string into a field configured by `columnNameOfCorruptRecord`. To keep
-   *     corrupt records, an user can set a string type field named `columnNameOfCorruptRecord`
-   *     in an user-defined schema. If a schema does not have the field, it drops corrupt records
-   *     during parsing. When inferring a schema, it implicitly adds a `columnNameOfCorruptRecord`
-   *     field in an output schema.</li>
+   *     <li>`PERMISSIVE` : when it meets a corrupted record, puts the malformed string into a
+   *     field configured by `columnNameOfCorruptRecord`, and sets other fields to `null`. To
+   *     keep corrupt records, an user can set a string type field named
+   *     `columnNameOfCorruptRecord` in an user-defined schema. If a schema does not have the
+   *     field, it drops corrupt records during parsing. When inferring a schema, it implicitly
+   *     adds a `columnNameOfCorruptRecord` field in an output schema.</li>
   *     <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>
   *     <li>`FAILFAST` : throws an exception when it meets corrupted records.</li>
   *   </ul>
@@ -573,12 +573,14 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
   * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
   * during parsing. It supports the following case-insensitive modes.
   *   <ul>
-   *     <li>`PERMISSIVE` : sets other fields to `null` when it meets a corrupted record, and puts
-   *     the malformed string into a field configured by `columnNameOfCorruptRecord`. To keep
+   *     <li>`PERMISSIVE` : when it meets a corrupted record, puts the malformed string into a
+   *     field configured by `columnNameOfCorruptRecord`, and sets other fields to `null`. To keep
   *     corrupt records, an user can set a string type field named `columnNameOfCorruptRecord`
   *     in an user-defined schema. If a schema does not have the field, it drops corrupt records
-   *     during parsing. When a length of parsed CSV tokens is shorter than an expected length
-   *     of a schema, it sets `null` for extra fields.</li>
+   *     during parsing. A record with less/more tokens than schema is not a corrupted record to
+   *     CSV. When it meets a record having fewer tokens than the length of the schema, sets
+   *     `null` to extra fields. When the record has more tokens than the length of the schema,
+   *     it drops extra tokens.</li>
   *     <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>
   *     <li>`FAILFAST` : throws an exception when it meets corrupted records.</li>
   *   </ul>
    diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index 7d6d7e7eef926..3d6cc30f2ba83 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -203,6 +203,8 @@ class UnivocityParser( case _: BadRecordException => None } } + // For records with less or more tokens than the schema, tries to return partial results + // if possible. throw BadRecordException( () => getCurrentInput, () => getPartialResult(), @@ -218,6 +220,9 @@ class UnivocityParser( row } catch { case NonFatal(e) => + // For corrupted records with the number of tokens same as the schema, + // CSV reader doesn't support partial results. All fields other than the field + // configured by `columnNameOfCorruptRecord` are set to `null`. throw BadRecordException(() => getCurrentInput, () => None, e) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index f23851655350a..61e22fac854f9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -236,12 +236,12 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo *
<li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
   * during parsing.
   *   <ul>
-   *     <li>`PERMISSIVE` : sets other fields to `null` when it meets a corrupted record, and puts
-   *     the malformed string into a field configured by `columnNameOfCorruptRecord`. To keep
-   *     corrupt records, an user can set a string type field named `columnNameOfCorruptRecord`
-   *     in an user-defined schema. If a schema does not have the field, it drops corrupt records
-   *     during parsing. When inferring a schema, it implicitly adds a `columnNameOfCorruptRecord`
-   *     field in an output schema.</li>
+   *     <li>`PERMISSIVE` : when it meets a corrupted record, puts the malformed string into a
+   *     field configured by `columnNameOfCorruptRecord`, and sets other fields to `null`. To
+   *     keep corrupt records, an user can set a string type field named
+   *     `columnNameOfCorruptRecord` in an user-defined schema. If a schema does not have the
+   *     field, it drops corrupt records during parsing. When inferring a schema, it implicitly
+   *     adds a `columnNameOfCorruptRecord` field in an output schema.</li>
   *     <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>
   *     <li>`FAILFAST` : throws an exception when it meets corrupted records.</li>
   *   </ul>
@@ -316,12 +316,14 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
   * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
   * during parsing. It supports the following case-insensitive modes.
   *   <ul>
-   *     <li>`PERMISSIVE` : sets other fields to `null` when it meets a corrupted record, and puts
-   *     the malformed string into a field configured by `columnNameOfCorruptRecord`. To keep
+   *     <li>`PERMISSIVE` : when it meets a corrupted record, puts the malformed string into a
+   *     field configured by `columnNameOfCorruptRecord`, and sets other fields to `null`. To keep
   *     corrupt records, an user can set a string type field named `columnNameOfCorruptRecord`
   *     in an user-defined schema. If a schema does not have the field, it drops corrupt records
-   *     during parsing. When a length of parsed CSV tokens is shorter than an expected length
-   *     of a schema, it sets `null` for extra fields.</li>
+   *     during parsing. A record with less/more tokens than schema is not a corrupted record to
+   *     CSV. When it meets a record having fewer tokens than the length of the schema, sets
+   *     `null` to extra fields. When the record has more tokens than the length of the schema,
+   *     it drops extra tokens.</li>
   *     <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>
   *     <li>`FAILFAST` : throws an exception when it meets corrupted records.</li>
   *   </ul>
    From dfa43792feb78b4cc3776606b3a13eff3586fbb1 Mon Sep 17 00:00:00 2001 From: zhoukang Date: Wed, 28 Feb 2018 23:16:29 +0800 Subject: [PATCH 320/566] [SPARK-23508][CORE] Fix BlockmanagerId in case blockManagerIdCache cause oom MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … cause oom ## What changes were proposed in this pull request? blockManagerIdCache in BlockManagerId will not remove old values which may cause oom `val blockManagerIdCache = new ConcurrentHashMap[BlockManagerId, BlockManagerId]()` Since whenever we apply a new BlockManagerId, it will put into this map. This patch will use guava cahce for blockManagerIdCache instead. A heap dump show in [SPARK-23508](https://issues.apache.org/jira/browse/SPARK-23508) ## How was this patch tested? Exist tests. Author: zhoukang Closes #20667 from caneGuy/zhoukang/fix-history. (cherry picked from commit 6a8abe29ef3369b387d9bc2ee3459a6611246ab1) Signed-off-by: Wenchen Fan --- .../org/apache/spark/storage/BlockManagerId.scala | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerId.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerId.scala index 2c3da0ee85e06..d4a59c33b974c 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerId.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerId.scala @@ -18,7 +18,8 @@ package org.apache.spark.storage import java.io.{Externalizable, IOException, ObjectInput, ObjectOutput} -import java.util.concurrent.ConcurrentHashMap + +import com.google.common.cache.{CacheBuilder, CacheLoader} import org.apache.spark.SparkContext import org.apache.spark.annotation.DeveloperApi @@ -132,10 +133,17 @@ private[spark] object BlockManagerId { getCachedBlockManagerId(obj) } - val blockManagerIdCache = new ConcurrentHashMap[BlockManagerId, BlockManagerId]() + /** + * The max cache size is hardcoded to 10000, since the size of a BlockManagerId + * object is about 48B, the total memory cost should be below 1MB which is feasible. + */ + val blockManagerIdCache = CacheBuilder.newBuilder() + .maximumSize(10000) + .build(new CacheLoader[BlockManagerId, BlockManagerId]() { + override def load(id: BlockManagerId) = id + }) def getCachedBlockManagerId(id: BlockManagerId): BlockManagerId = { - blockManagerIdCache.putIfAbsent(id, id) blockManagerIdCache.get(id) } } From a4eb1e47ad2453b41ebb431272c92e1ac48bb310 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Thu, 1 Mar 2018 00:44:13 +0900 Subject: [PATCH 321/566] [SPARK-23517][PYTHON] Make `pyspark.util._exception_message` produce the trace from Java side by Py4JJavaError ## What changes were proposed in this pull request? This PR proposes for `pyspark.util._exception_message` to produce the trace from Java side by `Py4JJavaError`. Currently, in Python 2, it uses `message` attribute which `Py4JJavaError` didn't happen to have: ```python >>> from pyspark.util import _exception_message >>> try: ... sc._jvm.java.lang.String(None) ... except Exception as e: ... pass ... 
>>> e.message '' ``` Seems we should use `str` instead for now: https://github.com/bartdag/py4j/blob/aa6c53b59027925a426eb09b58c453de02c21b7c/py4j-python/src/py4j/protocol.py#L412 but this doesn't address the problem with non-ascii string from Java side - `https://github.com/bartdag/py4j/issues/306` So, we could directly call `__str__()`: ```python >>> e.__str__() u'An error occurred while calling None.java.lang.String.\n: java.lang.NullPointerException\n\tat java.lang.String.(String.java:588)\n\tat sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)\n\tat sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)\n\tat sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)\n\tat java.lang.reflect.Constructor.newInstance(Constructor.java:422)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:238)\n\tat py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)\n\tat py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:745)\n' ``` which doesn't type coerce unicodes to `str` in Python 2. This can be actually a problem: ```python from pyspark.sql.functions import udf spark.conf.set("spark.sql.execution.arrow.enabled", True) spark.range(1).select(udf(lambda x: [[]])()).toPandas() ``` **Before** ``` Traceback (most recent call last): File "", line 1, in File "/.../spark/python/pyspark/sql/dataframe.py", line 2009, in toPandas raise RuntimeError("%s\n%s" % (_exception_message(e), msg)) RuntimeError: Note: toPandas attempted Arrow optimization because 'spark.sql.execution.arrow.enabled' is set to true. Please set it to false to disable this. ``` **After** ``` Traceback (most recent call last): File "", line 1, in File "/.../spark/python/pyspark/sql/dataframe.py", line 2009, in toPandas raise RuntimeError("%s\n%s" % (_exception_message(e), msg)) RuntimeError: An error occurred while calling o47.collectAsArrowToPython. : org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 0.0 failed 1 times, most recent failure: Lost task 7.0 in stage 0.0 (TID 7, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last): File "/.../spark/python/pyspark/worker.py", line 245, in main process() File "/.../spark/python/pyspark/worker.py", line 240, in process ... Note: toPandas attempted Arrow optimization because 'spark.sql.execution.arrow.enabled' is set to true. Please set it to false to disable this. ``` ## How was this patch tested? Manually tested and unit tests were added. Author: hyukjinkwon Closes #20680 from HyukjinKwon/SPARK-23517. 
(cherry picked from commit fab563b9bd1581112462c0fc0b299ad6510b6564) Signed-off-by: hyukjinkwon --- python/pyspark/tests.py | 11 +++++++++++ python/pyspark/util.py | 7 +++++++ 2 files changed, 18 insertions(+) diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index da99872da2f0e..18bbeb53d769a 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -2286,6 +2286,17 @@ def set(self, x=None, other=None, other_x=None): self.assertEqual(b._x, 2) +class UtilTests(PySparkTestCase): + def test_py4j_exception_message(self): + from pyspark.util import _exception_message + + with self.assertRaises(Py4JJavaError) as context: + # This attempts java.lang.String(null) which throws an NPE. + self.sc._jvm.java.lang.String(None) + + self.assertTrue('NullPointerException' in _exception_message(context.exception)) + + @unittest.skipIf(not _have_scipy, "SciPy not installed") class SciPyTests(PySparkTestCase): diff --git a/python/pyspark/util.py b/python/pyspark/util.py index e5d332ce54429..ad4a0bc68ef41 100644 --- a/python/pyspark/util.py +++ b/python/pyspark/util.py @@ -15,6 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from py4j.protocol import Py4JJavaError __all__ = [] @@ -33,6 +34,12 @@ def _exception_message(excp): >>> msg == _exception_message(excp) True """ + if isinstance(excp, Py4JJavaError): + # 'Py4JJavaError' doesn't contain the stack trace available on the Java side in 'message' + # attribute in Python 2. We should call 'str' function on this exception in general but + # 'Py4JJavaError' has an issue about addressing non-ascii strings. So, here we work + # around by the direct call, '__str__()'. Please see SPARK-23517. + return excp.__str__() if hasattr(excp, "message"): return excp.message return str(excp) From 2aa66eb387d20725bbd7551d2d77609a77b1e699 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Thu, 1 Mar 2018 17:26:39 -0800 Subject: [PATCH 322/566] [SPARK-23551][BUILD] Exclude `hadoop-mapreduce-client-core` dependency from `orc-mapreduce` ## What changes were proposed in this pull request? This PR aims to prevent `orc-mapreduce` dependency from making IDEs and maven confused. **BEFORE** Please note that `2.6.4` at `Spark Project SQL`. ``` $ mvn dependency:tree -Phadoop-2.7 -Dincludes=org.apache.hadoop:hadoop-mapreduce-client-core ... 
[INFO] ------------------------------------------------------------------------ [INFO] Building Spark Project Catalyst 2.4.0-SNAPSHOT [INFO] ------------------------------------------------------------------------ [INFO] [INFO] --- maven-dependency-plugin:3.0.2:tree (default-cli) spark-catalyst_2.11 --- [INFO] org.apache.spark:spark-catalyst_2.11:jar:2.4.0-SNAPSHOT [INFO] \- org.apache.spark:spark-core_2.11:jar:2.4.0-SNAPSHOT:compile [INFO] \- org.apache.hadoop:hadoop-client:jar:2.7.3:compile [INFO] \- org.apache.hadoop:hadoop-mapreduce-client-core:jar:2.7.3:compile [INFO] [INFO] ------------------------------------------------------------------------ [INFO] Building Spark Project SQL 2.4.0-SNAPSHOT [INFO] ------------------------------------------------------------------------ [INFO] [INFO] --- maven-dependency-plugin:3.0.2:tree (default-cli) spark-sql_2.11 --- [INFO] org.apache.spark:spark-sql_2.11:jar:2.4.0-SNAPSHOT [INFO] \- org.apache.orc:orc-mapreduce:jar:nohive:1.4.3:compile [INFO] \- org.apache.hadoop:hadoop-mapreduce-client-core:jar:2.6.4:compile ``` **AFTER** ``` $ mvn dependency:tree -Phadoop-2.7 -Dincludes=org.apache.hadoop:hadoop-mapreduce-client-core ... [INFO] ------------------------------------------------------------------------ [INFO] Building Spark Project Catalyst 2.4.0-SNAPSHOT [INFO] ------------------------------------------------------------------------ [INFO] [INFO] --- maven-dependency-plugin:3.0.2:tree (default-cli) spark-catalyst_2.11 --- [INFO] org.apache.spark:spark-catalyst_2.11:jar:2.4.0-SNAPSHOT [INFO] \- org.apache.spark:spark-core_2.11:jar:2.4.0-SNAPSHOT:compile [INFO] \- org.apache.hadoop:hadoop-client:jar:2.7.3:compile [INFO] \- org.apache.hadoop:hadoop-mapreduce-client-core:jar:2.7.3:compile [INFO] [INFO] ------------------------------------------------------------------------ [INFO] Building Spark Project SQL 2.4.0-SNAPSHOT [INFO] ------------------------------------------------------------------------ [INFO] [INFO] --- maven-dependency-plugin:3.0.2:tree (default-cli) spark-sql_2.11 --- [INFO] org.apache.spark:spark-sql_2.11:jar:2.4.0-SNAPSHOT [INFO] \- org.apache.spark:spark-core_2.11:jar:2.4.0-SNAPSHOT:compile [INFO] \- org.apache.hadoop:hadoop-client:jar:2.7.3:compile [INFO] \- org.apache.hadoop:hadoop-mapreduce-client-core:jar:2.7.3:compile ``` ## How was this patch tested? 1. Pass the Jenkins with `dev/test-dependencies.sh` with the existing dependencies. 2. Manually do the following and see the change. ``` mvn dependency:tree -Phadoop-2.7 -Dincludes=org.apache.hadoop:hadoop-mapreduce-client-core ``` Author: Dongjoon Hyun Closes #20704 from dongjoon-hyun/SPARK-23551. (cherry picked from commit 34811e0b908449fd59bca476604612b1d200778d) Signed-off-by: Marcelo Vanzin --- pom.xml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pom.xml b/pom.xml index 9aa531ec65746..651ce7104c59f 100644 --- a/pom.xml +++ b/pom.xml @@ -1756,6 +1756,10 @@ org.apache.hadoop hadoop-common + + org.apache.hadoop + hadoop-mapreduce-client-core + org.apache.orc orc-core From 56cfbd932d3d038ce21cfa4939dfd9563c719003 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Thu, 1 Mar 2018 21:04:01 -0800 Subject: [PATCH 323/566] [SPARK-22883][ML][TEST] Streaming tests for spark.ml.feature, from A to H ## What changes were proposed in this pull request? 
Adds structured streaming tests using testTransformer for these suites: * BinarizerSuite * BucketedRandomProjectionLSHSuite * BucketizerSuite * ChiSqSelectorSuite * CountVectorizerSuite * DCTSuite.scala * ElementwiseProductSuite * FeatureHasherSuite * HashingTFSuite ## How was this patch tested? It tests itself because it is a bunch of tests! Author: Joseph K. Bradley Closes #20111 from jkbradley/SPARK-22883-streaming-featureAM. (cherry picked from commit 119f6a0e4729aa952e811d2047790a32ee90bf69) Signed-off-by: Joseph K. Bradley --- .../spark/ml/feature/BinarizerSuite.scala | 8 ++-- .../BucketedRandomProjectionLSHSuite.scala | 26 ++++++++--- .../spark/ml/feature/BucketizerSuite.scala | 11 +++-- .../spark/ml/feature/ChiSqSelectorSuite.scala | 36 +++++++-------- .../ml/feature/CountVectorizerSuite.scala | 23 +++++----- .../apache/spark/ml/feature/DCTSuite.scala | 14 +++--- .../ml/feature/ElementwiseProductSuite.scala | 30 ++++++++++--- .../spark/ml/feature/FeatureHasherSuite.scala | 45 +++++++++---------- .../spark/ml/feature/HashingTFSuite.scala | 34 ++++++++------ 9 files changed, 126 insertions(+), 101 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala index 4455d35210878..05d4a6ee2dabf 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala @@ -17,14 +17,12 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.DefaultReadWriteTest -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.sql.{DataFrame, Row} -class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class BinarizerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -47,7 +45,7 @@ class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defau .setInputCol("feature") .setOutputCol("binarized_feature") - binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { + testTransformer[(Double, Double)](dataFrame, binarizer, "binarized_feature", "expected") { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSHSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSHSuite.scala index 7175c721bff36..ed9a39d8d1512 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSHSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSHSuite.scala @@ -20,16 +20,15 @@ package org.apache.spark.ml.feature import breeze.numerics.{cos, sin} import breeze.numerics.constants.Pi -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.Dataset +import org.apache.spark.sql.{Dataset, 
Row} -class BucketedRandomProjectionLSHSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class BucketedRandomProjectionLSHSuite extends MLTest with DefaultReadWriteTest { + + import testImplicits._ @transient var dataset: Dataset[_] = _ @@ -98,6 +97,21 @@ class BucketedRandomProjectionLSHSuite MLTestingUtils.checkCopyAndUids(brp, brpModel) } + test("BucketedRandomProjectionLSH: streaming transform") { + val brp = new BucketedRandomProjectionLSH() + .setNumHashTables(2) + .setInputCol("keys") + .setOutputCol("values") + .setBucketLength(1.0) + .setSeed(12345) + val brpModel = brp.fit(dataset) + + testTransformer[Tuple1[Vector]](dataset.toDF(), brpModel, "values") { + case Row(values: Seq[_]) => + assert(values.length === brp.getNumHashTables) + } + } + test("BucketedRandomProjectionLSH: test of LSH property") { // Project from 2 dimensional Euclidean Space to 1 dimensions val brp = new BucketedRandomProjectionLSH() diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala index 41cf72fe3470a..9ea15e1918532 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala @@ -23,14 +23,13 @@ import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ -class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class BucketizerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -50,7 +49,7 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa .setOutputCol("result") .setSplits(splits) - bucketizer.transform(dataFrame).select("result", "expected").collect().foreach { + testTransformer[(Double, Double)](dataFrame, bucketizer, "result", "expected") { case Row(x: Double, y: Double) => assert(x === y, s"The feature value is not correct after bucketing. Expected $y but found $x") @@ -84,7 +83,7 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa .setOutputCol("result") .setSplits(splits) - bucketizer.transform(dataFrame).select("result", "expected").collect().foreach { + testTransformer[(Double, Double)](dataFrame, bucketizer, "result", "expected") { case Row(x: Double, y: Double) => assert(x === y, s"The feature value is not correct after bucketing. Expected $y but found $x") @@ -103,7 +102,7 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa .setSplits(splits) bucketizer.setHandleInvalid("keep") - bucketizer.transform(dataFrame).select("result", "expected").collect().foreach { + testTransformer[(Double, Double)](dataFrame, bucketizer, "result", "expected") { case Row(x: Double, y: Double) => assert(x === y, s"The feature value is not correct after bucketing. 
Expected $y but found $x") diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala index c83909c4498f2..c843df9f33e3e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala @@ -17,16 +17,15 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Dataset, Row} -class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext - with DefaultReadWriteTest { +class ChiSqSelectorSuite extends MLTest with DefaultReadWriteTest { + + import testImplicits._ @transient var dataset: Dataset[_] = _ @@ -119,32 +118,32 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext test("Test Chi-Square selector: numTopFeatures") { val selector = new ChiSqSelector() .setOutputCol("filtered").setSelectorType("numTopFeatures").setNumTopFeatures(1) - val model = ChiSqSelectorSuite.testSelector(selector, dataset) + val model = testSelector(selector, dataset) MLTestingUtils.checkCopyAndUids(selector, model) } test("Test Chi-Square selector: percentile") { val selector = new ChiSqSelector() .setOutputCol("filtered").setSelectorType("percentile").setPercentile(0.17) - ChiSqSelectorSuite.testSelector(selector, dataset) + testSelector(selector, dataset) } test("Test Chi-Square selector: fpr") { val selector = new ChiSqSelector() .setOutputCol("filtered").setSelectorType("fpr").setFpr(0.02) - ChiSqSelectorSuite.testSelector(selector, dataset) + testSelector(selector, dataset) } test("Test Chi-Square selector: fdr") { val selector = new ChiSqSelector() .setOutputCol("filtered").setSelectorType("fdr").setFdr(0.12) - ChiSqSelectorSuite.testSelector(selector, dataset) + testSelector(selector, dataset) } test("Test Chi-Square selector: fwe") { val selector = new ChiSqSelector() .setOutputCol("filtered").setSelectorType("fwe").setFwe(0.12) - ChiSqSelectorSuite.testSelector(selector, dataset) + testSelector(selector, dataset) } test("read/write") { @@ -163,18 +162,19 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext assert(expected.selectedFeatures === actual.selectedFeatures) } } -} -object ChiSqSelectorSuite { - - private def testSelector(selector: ChiSqSelector, dataset: Dataset[_]): ChiSqSelectorModel = { - val selectorModel = selector.fit(dataset) - selectorModel.transform(dataset).select("filtered", "topFeature").collect() - .foreach { case Row(vec1: Vector, vec2: Vector) => + private def testSelector(selector: ChiSqSelector, data: Dataset[_]): ChiSqSelectorModel = { + val selectorModel = selector.fit(data) + testTransformer[(Double, Vector, Vector)](data.toDF(), selectorModel, + "filtered", "topFeature") { + case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) - } + } selectorModel } +} + +object ChiSqSelectorSuite { /** * Mapping from all Params to valid settings which differ from the defaults. 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala index f213145f1ba0a..b4cabff0ecacb 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala @@ -16,16 +16,13 @@ */ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row -class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext - with DefaultReadWriteTest { +class CountVectorizerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -50,7 +47,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext val cv = new CountVectorizerModel(Array("a", "b", "c", "d")) .setInputCol("words") .setOutputCol("features") - cv.transform(df).select("features", "expected").collect().foreach { + testTransformer[(Int, Seq[String], Vector)](df, cv, "features", "expected") { case Row(features: Vector, expected: Vector) => assert(features ~== expected absTol 1e-14) } @@ -72,7 +69,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext MLTestingUtils.checkCopyAndUids(cv, cvm) assert(cvm.vocabulary.toSet === Set("a", "b", "c", "d", "e")) - cvm.transform(df).select("features", "expected").collect().foreach { + testTransformer[(Int, Seq[String], Vector)](df, cvm, "features", "expected") { case Row(features: Vector, expected: Vector) => assert(features ~== expected absTol 1e-14) } @@ -100,7 +97,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext .fit(df) assert(cvModel2.vocabulary === Array("a", "b")) - cvModel2.transform(df).select("features", "expected").collect().foreach { + testTransformer[(Int, Seq[String], Vector)](df, cvModel2, "features", "expected") { case Row(features: Vector, expected: Vector) => assert(features ~== expected absTol 1e-14) } @@ -113,7 +110,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext .fit(df) assert(cvModel3.vocabulary === Array("a", "b")) - cvModel3.transform(df).select("features", "expected").collect().foreach { + testTransformer[(Int, Seq[String], Vector)](df, cvModel3, "features", "expected") { case Row(features: Vector, expected: Vector) => assert(features ~== expected absTol 1e-14) } @@ -147,7 +144,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext .setInputCol("words") .setOutputCol("features") .setMinTF(3) - cv.transform(df).select("features", "expected").collect().foreach { + testTransformer[(Int, Seq[String], Vector)](df, cv, "features", "expected") { case Row(features: Vector, expected: Vector) => assert(features ~== expected absTol 1e-14) } @@ -166,7 +163,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext .setInputCol("words") .setOutputCol("features") .setMinTF(0.3) - cv.transform(df).select("features", "expected").collect().foreach { + testTransformer[(Int, Seq[String], Vector)](df, cv, "features", "expected") { case Row(features: Vector, expected: Vector) => assert(features ~== expected absTol 
1e-14) } @@ -186,7 +183,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext .setOutputCol("features") .setBinary(true) .fit(df) - cv.transform(df).select("features", "expected").collect().foreach { + testTransformer[(Int, Seq[String], Vector)](df, cv, "features", "expected") { case Row(features: Vector, expected: Vector) => assert(features ~== expected absTol 1e-14) } @@ -196,7 +193,7 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext .setInputCol("words") .setOutputCol("features") .setBinary(true) - cv2.transform(df).select("features", "expected").collect().foreach { + testTransformer[(Int, Seq[String], Vector)](df, cv2, "features", "expected") { case Row(features: Vector, expected: Vector) => assert(features ~== expected absTol 1e-14) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/DCTSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/DCTSuite.scala index 8dd3dd75e1be5..6734336aac39c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/DCTSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/DCTSuite.scala @@ -21,16 +21,14 @@ import scala.beans.BeanInfo import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} -import org.apache.spark.ml.util.DefaultReadWriteTest -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.sql.Row @BeanInfo case class DCTTestData(vec: Vector, wantedVec: Vector) -class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class DCTSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -72,11 +70,9 @@ class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRead .setOutputCol("resultVec") .setInverse(inverse) - transformer.transform(dataset) - .select("resultVec", "wantedVec") - .collect() - .foreach { case Row(resultVec: Vector, wantedVec: Vector) => - assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6) + testTransformer[(Vector, Vector)](dataset, transformer, "resultVec", "wantedVec") { + case Row(resultVec: Vector, wantedVec: Vector) => + assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6) } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ElementwiseProductSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ElementwiseProductSuite.scala index a4cca27be7815..3a8d0762e2ab7 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ElementwiseProductSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ElementwiseProductSuite.scala @@ -17,13 +17,31 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.linalg.Vectors -import org.apache.spark.ml.util.DefaultReadWriteTest -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.sql.Row -class ElementwiseProductSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class ElementwiseProductSuite extends MLTest with DefaultReadWriteTest { + + import testImplicits._ + + test("streaming transform") { + val scalingVec = Vectors.dense(0.1, 10.0) + val data = Seq( + (Vectors.dense(0.1, 1.0), Vectors.dense(0.01, 10.0)), + (Vectors.dense(0.0, -1.1), 
Vectors.dense(0.0, -11.0)) + ) + val df = spark.createDataFrame(data).toDF("features", "expected") + val ep = new ElementwiseProduct() + .setInputCol("features") + .setOutputCol("actual") + .setScalingVec(scalingVec) + testTransformer[(Vector, Vector)](df, ep, "actual", "expected") { + case Row(actual: Vector, expected: Vector) => + assert(actual ~== expected relTol 1e-14) + } + } test("read/write") { val ep = new ElementwiseProduct() diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala index 7bc1825b69c43..d799ba6011fa8 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala @@ -17,27 +17,24 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.functions.col import org.apache.spark.sql.types._ import org.apache.spark.util.Utils -class FeatureHasherSuite extends SparkFunSuite - with MLlibTestSparkContext - with DefaultReadWriteTest { +class FeatureHasherSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ import FeatureHasherSuite.murmur3FeatureIdx - implicit private val vectorEncoder = ExpressionEncoder[Vector]() + implicit private val vectorEncoder: ExpressionEncoder[Vector] = ExpressionEncoder[Vector]() test("params") { ParamsSuite.checkParams(new FeatureHasher) @@ -52,31 +49,31 @@ class FeatureHasherSuite extends SparkFunSuite } test("feature hashing") { + val numFeatures = 100 + // Assume perfect hash on field names in computing expected results + def idx: Any => Int = murmur3FeatureIdx(numFeatures) + val df = Seq( - (2.0, true, "1", "foo"), - (3.0, false, "2", "bar") - ).toDF("real", "bool", "stringNum", "string") + (2.0, true, "1", "foo", + Vectors.sparse(numFeatures, Seq((idx("real"), 2.0), (idx("bool=true"), 1.0), + (idx("stringNum=1"), 1.0), (idx("string=foo"), 1.0)))), + (3.0, false, "2", "bar", + Vectors.sparse(numFeatures, Seq((idx("real"), 3.0), (idx("bool=false"), 1.0), + (idx("stringNum=2"), 1.0), (idx("string=bar"), 1.0)))) + ).toDF("real", "bool", "stringNum", "string", "expected") - val n = 100 val hasher = new FeatureHasher() .setInputCols("real", "bool", "stringNum", "string") .setOutputCol("features") - .setNumFeatures(n) + .setNumFeatures(numFeatures) val output = hasher.transform(df) val attrGroup = AttributeGroup.fromStructField(output.schema("features")) - assert(attrGroup.numAttributes === Some(n)) + assert(attrGroup.numAttributes === Some(numFeatures)) - val features = output.select("features").as[Vector].collect() - // Assume perfect hash on field names - def idx: Any => Int = murmur3FeatureIdx(n) - // check expected indices - val expected = Seq( - Vectors.sparse(n, Seq((idx("real"), 2.0), (idx("bool=true"), 1.0), - (idx("stringNum=1"), 1.0), (idx("string=foo"), 1.0))), - Vectors.sparse(n, Seq((idx("real"), 3.0), (idx("bool=false"), 1.0), - (idx("stringNum=2"), 1.0), (idx("string=bar"), 1.0))) - ) - 
assert(features.zip(expected).forall { case (e, a) => e ~== a absTol 1e-14 }) + testTransformer[(Double, Boolean, String, String, Vector)](df, hasher, "features", "expected") { + case Row(features: Vector, expected: Vector) => + assert(features ~== expected absTol 1e-14 ) + } } test("setting explicit numerical columns to treat as categorical") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala index a46272fdce1fb..c5183ecfef7d7 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala @@ -17,17 +17,16 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.feature.{HashingTF => MLlibHashingTF} -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.Row import org.apache.spark.util.Utils -class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class HashingTFSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ import HashingTFSuite.murmur3FeatureIdx @@ -37,21 +36,28 @@ class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext with Defau } test("hashingTF") { - val df = Seq((0, "a a b b c d".split(" ").toSeq)).toDF("id", "words") - val n = 100 + val numFeatures = 100 + // Assume perfect hash when computing expected features. + def idx: Any => Int = murmur3FeatureIdx(numFeatures) + val data = Seq( + ("a a b b c d".split(" ").toSeq, + Vectors.sparse(numFeatures, + Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0)))) + ) + + val df = data.toDF("words", "expected") val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") - .setNumFeatures(n) + .setNumFeatures(numFeatures) val output = hashingTF.transform(df) val attrGroup = AttributeGroup.fromStructField(output.schema("features")) - require(attrGroup.numAttributes === Some(n)) - val features = output.select("features").first().getAs[Vector](0) - // Assume perfect hash on "a", "b", "c", and "d". - def idx: Any => Int = murmur3FeatureIdx(n) - val expected = Vectors.sparse(n, - Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0))) - assert(features ~== expected absTol 1e-14) + require(attrGroup.numAttributes === Some(numFeatures)) + + testTransformer[(Seq[String], Vector)](df, hashingTF, "features", "expected") { + case Row(features: Vector, expected: Vector) => + assert(features ~== expected absTol 1e-14) + } } test("applying binary term freqs") { From 8fe20e15196b4ddbd80828ad3a91cf06c5dbea84 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Fri, 2 Mar 2018 09:23:39 -0800 Subject: [PATCH 324/566] [SPARKR][DOC] fix link in vignettes ## What changes were proposed in this pull request? Fix doc link that was changed in 2.3 shivaram Author: Felix Cheung Closes #20711 from felixcheung/rvigmean. 
(cherry picked from commit 0b6ceadeb563205cbd6bd03bc88e608086273b5b) Signed-off-by: Felix Cheung --- R/pkg/vignettes/sparkr-vignettes.Rmd | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index feca617c2554c..d4713de7806a1 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -46,7 +46,7 @@ Sys.setenv("_JAVA_OPTIONS" = paste("-XX:-UsePerfData", old_java_opt, sep = " ")) ## Overview -SparkR is an R package that provides a light-weight frontend to use Apache Spark from R. With Spark `r packageVersion("SparkR")`, SparkR provides a distributed data frame implementation that supports data processing operations like selection, filtering, aggregation etc. and distributed machine learning using [MLlib](http://spark.apache.org/mllib/). +SparkR is an R package that provides a light-weight frontend to use Apache Spark from R. With Spark `r packageVersion("SparkR")`, SparkR provides a distributed data frame implementation that supports data processing operations like selection, filtering, aggregation etc. and distributed machine learning using [MLlib](https://spark.apache.org/mllib/). ## Getting Started @@ -132,7 +132,7 @@ sparkR.session.stop() Different from many other R packages, to use SparkR, you need an additional installation of Apache Spark. The Spark installation will be used to run a backend process that will compile and execute SparkR programs. -After installing the SparkR package, you can call `sparkR.session` as explained in the previous section to start and it will check for the Spark installation. If you are working with SparkR from an interactive shell (eg. R, RStudio) then Spark is downloaded and cached automatically if it is not found. Alternatively, we provide an easy-to-use function `install.spark` for running this manually. If you don't have Spark installed on the computer, you may download it from [Apache Spark Website](http://spark.apache.org/downloads.html). +After installing the SparkR package, you can call `sparkR.session` as explained in the previous section to start and it will check for the Spark installation. If you are working with SparkR from an interactive shell (eg. R, RStudio) then Spark is downloaded and cached automatically if it is not found. Alternatively, we provide an easy-to-use function `install.spark` for running this manually. If you don't have Spark installed on the computer, you may download it from [Apache Spark Website](https://spark.apache.org/downloads.html). ```{r, eval=FALSE} install.spark() @@ -147,7 +147,7 @@ sparkR.session(sparkHome = "/HOME/spark") ### Spark Session {#SetupSparkSession} -In addition to `sparkHome`, many other options can be specified in `sparkR.session`. For a complete list, see [Starting up: SparkSession](http://spark.apache.org/docs/latest/sparkr.html#starting-up-sparksession) and [SparkR API doc](http://spark.apache.org/docs/latest/api/R/sparkR.session.html). +In addition to `sparkHome`, many other options can be specified in `sparkR.session`. For a complete list, see [Starting up: SparkSession](https://spark.apache.org/docs/latest/sparkr.html#starting-up-sparksession) and [SparkR API doc](https://spark.apache.org/docs/latest/api/R/sparkR.session.html). In particular, the following Spark driver properties can be set in `sparkConfig`. 
@@ -169,7 +169,7 @@ sparkR.session(spark.sql.warehouse.dir = spark_warehouse_path) #### Cluster Mode -SparkR can connect to remote Spark clusters. [Cluster Mode Overview](http://spark.apache.org/docs/latest/cluster-overview.html) is a good introduction to different Spark cluster modes. +SparkR can connect to remote Spark clusters. [Cluster Mode Overview](https://spark.apache.org/docs/latest/cluster-overview.html) is a good introduction to different Spark cluster modes. When connecting SparkR to a remote Spark cluster, make sure that the Spark version and Hadoop version on the machine match the corresponding versions on the cluster. Current SparkR package is compatible with ```{r, echo=FALSE, tidy = TRUE} @@ -177,7 +177,7 @@ paste("Spark", packageVersion("SparkR")) ``` It should be used both on the local computer and on the remote cluster. -To connect, pass the URL of the master node to `sparkR.session`. A complete list can be seen in [Spark Master URLs](http://spark.apache.org/docs/latest/submitting-applications.html#master-urls). +To connect, pass the URL of the master node to `sparkR.session`. A complete list can be seen in [Spark Master URLs](https://spark.apache.org/docs/latest/submitting-applications.html#master-urls). For example, to connect to a local standalone Spark master, we can call ```{r, eval=FALSE} @@ -317,7 +317,7 @@ A common flow of grouping and aggregation is 2. Feed the `GroupedData` object to `agg` or `summarize` functions, with some provided aggregation functions to compute a number within each group. -A number of widely used functions are supported to aggregate data after grouping, including `avg`, `countDistinct`, `count`, `first`, `kurtosis`, `last`, `max`, `mean`, `min`, `sd`, `skewness`, `stddev_pop`, `stddev_samp`, `sumDistinct`, `sum`, `var_pop`, `var_samp`, `var`. See the [API doc for `mean`](http://spark.apache.org/docs/latest/api/R/mean.html) and other `agg_funcs` linked there. +A number of widely used functions are supported to aggregate data after grouping, including `avg`, `countDistinct`, `count`, `first`, `kurtosis`, `last`, `max`, `mean`, `min`, `sd`, `skewness`, `stddev_pop`, `stddev_samp`, `sumDistinct`, `sum`, `var_pop`, `var_samp`, `var`. See the [API doc for aggregate functions](https://spark.apache.org/docs/latest/api/R/column_aggregate_functions.html) linked there. For example we can compute a histogram of the number of cylinders in the `mtcars` dataset as shown below. @@ -935,7 +935,7 @@ perplexity #### Alternating Least Squares -`spark.als` learns latent factors in [collaborative filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) via [alternating least squares](http://dl.acm.org/citation.cfm?id=1608614). +`spark.als` learns latent factors in [collaborative filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) via [alternating least squares](https://dl.acm.org/citation.cfm?id=1608614). There are multiple options that can be configured in `spark.als`, including `rank`, `reg`, and `nonnegative`. For a complete list, refer to the help file. 
@@ -1171,11 +1171,11 @@ env | map ## References -* [Spark Cluster Mode Overview](http://spark.apache.org/docs/latest/cluster-overview.html) +* [Spark Cluster Mode Overview](https://spark.apache.org/docs/latest/cluster-overview.html) -* [Submitting Spark Applications](http://spark.apache.org/docs/latest/submitting-applications.html) +* [Submitting Spark Applications](https://spark.apache.org/docs/latest/submitting-applications.html) -* [Machine Learning Library Guide (MLlib)](http://spark.apache.org/docs/latest/ml-guide.html) +* [Machine Learning Library Guide (MLlib)](https://spark.apache.org/docs/latest/ml-guide.html) * [SparkR: Scaling R Programs with Spark](https://people.csail.mit.edu/matei/papers/2016/sigmod_sparkr.pdf), Shivaram Venkataraman, Zongheng Yang, Davies Liu, Eric Liang, Hossein Falaki, Xiangrui Meng, Reynold Xin, Ali Ghodsi, Michael Franklin, Ion Stoica, and Matei Zaharia. SIGMOD 2016. June 2016.
From f12fa13f16daf0a3f194d78a7e028c8aa7522676 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Fri, 2 Mar 2018 14:30:37 -0800 Subject: [PATCH 325/566] [SPARK-23570][SQL] Add Spark 2.3.0 in HiveExternalCatalogVersionsSuite ## What changes were proposed in this pull request? Add Spark 2.3.0 in HiveExternalCatalogVersionsSuite since Spark 2.3.0 is released for ensuring backward compatibility. ## How was this patch tested? N/A Author: gatorsmile Closes #20720 from gatorsmile/add2.3. (cherry picked from commit 487377e693af65b2ff3d6b874ca7326c1ff0076c) Signed-off-by: gatorsmile --- .../spark/sql/hive/HiveExternalCatalogVersionsSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index c13a750dbb270..6ca58e68d31eb 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -195,7 +195,7 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { object PROCESS_TABLES extends QueryTest with SQLTestUtils { // Tests the latest version of every release line. - val testingVersions = Seq("2.0.2", "2.1.2", "2.2.0", "2.2.1") + val testingVersions = Seq("2.0.2", "2.1.2", "2.2.0", "2.2.1", "2.3.0") protected var spark: SparkSession = _
From 26a8a675aa938a38b0fce91418b0c7ed6fd65625 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Mon, 5 Mar 2018 07:32:24 +0900 Subject: [PATCH 326/566] [SQL][MINOR] XPathDouble prettyPrint should say 'double' not 'float' ## What changes were proposed in this pull request? It looks like this was incorrectly copied from `XPathFloat` in the class above. ## How was this patch tested? Author: Eric Liang Closes #20730 from ericl/fix-typo-xpath.
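For reference, a small spark-shell sketch (illustrative only, not part of this patch; it assumes a running `SparkSession` named `spark`) of the expression whose `prettyName` is corrected:

```scala
// xpath_double evaluates an XPath expression against an XML string and returns a double.
val df = spark.sql("SELECT xpath_double('<a><b>1</b><b>2</b></a>', 'sum(a/b)') AS total")
df.show()        // total = 3.0
df.explain(true) // after this fix the expression should render as xpath_double(...), not xpath_float(...)
```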
(cherry picked from commit a89cdf55fa76fa23a524f0443e323498c3cc8664) Signed-off-by: hyukjinkwon --- .../org/apache/spark/sql/catalyst/expressions/xml/xpath.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala index d0185562c9cfc..aacf1a44e2ad0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala @@ -160,7 +160,7 @@ case class XPathFloat(xml: Expression, path: Expression) extends XPathExtract { """) // scalastyle:on line.size.limit case class XPathDouble(xml: Expression, path: Expression) extends XPathExtract { - override def prettyName: String = "xpath_float" + override def prettyName: String = "xpath_double" override def dataType: DataType = DoubleType override def nullSafeEval(xml: Any, path: Any): Any = { From c8aa6fbb049795195e414597839fe61ae3f56d92 Mon Sep 17 00:00:00 2001 From: "Michael (Stu) Stewart" Date: Mon, 5 Mar 2018 13:36:42 +0900 Subject: [PATCH 327/566] [SPARK-23569][PYTHON] Allow pandas_udf to work with python3 style type-annotated functions ## What changes were proposed in this pull request? Check python version to determine whether to use `inspect.getargspec` or `inspect.getfullargspec` before applying `pandas_udf` core logic to a function. The former is python2.7 (deprecated in python3) and the latter is python3.x. The latter correctly accounts for type annotations, which are syntax errors in python2.x. ## How was this patch tested? Locally, on python 2.7 and 3.6. Author: Michael (Stu) Stewart Closes #20728 from mstewart141/pandas_udf_fix. (cherry picked from commit 7965c91d8a67c213ca5eebda5e46e7c49a8ba121) Signed-off-by: hyukjinkwon --- python/pyspark/sql/tests.py | 18 ++++++++++++++++++ python/pyspark/sql/udf.py | 9 ++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index da50b4d9ffa0f..4c51f715b58b8 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -4327,6 +4327,24 @@ def test_timestamp_dst(self): result = df.withColumn('time', foo_udf(df.time)) self.assertEquals(df.collect(), result.collect()) + @unittest.skipIf(sys.version_info[:2] < (3, 5), "Type hints are supported from Python 3.5.") + def test_type_annotation(self): + from pyspark.sql.functions import pandas_udf + # Regression test to check if type hints can be used. See SPARK-23569. + # Note that it throws an error during compilation in lower Python versions if 'exec' + # is not used. Also, note that we explicitly use another dictionary to avoid modifications + # in the current 'locals()'. + # + # Hyukjin: I think it's an ugly way to test issues about syntax specific in + # higher versions of Python, which we shouldn't encourage. This was the last resort + # I could come up with at that time. 
+ _locals = {} + exec( + "import pandas as pd\ndef noop(col: pd.Series) -> pd.Series: return col", + _locals) + df = self.spark.range(1).select(pandas_udf(f=_locals['noop'], returnType='bigint')('id')) + self.assertEqual(df.first()[0], 0) + @unittest.skipIf( not _have_pandas or not _have_pyarrow,
diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 2b16b7148bfae..671e5680b8e7b 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -40,10 +40,17 @@ def _create_udf(f, returnType, evalType): if evalType in (PythonEvalType.SQL_SCALAR_PANDAS_UDF, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF): import inspect + import sys from pyspark.sql.utils import require_minimum_pyarrow_version require_minimum_pyarrow_version() - argspec = inspect.getargspec(f) + + if sys.version_info[0] < 3: + # `getargspec` is deprecated since python3.0 (incompatible with function annotations). + # See SPARK-23569. + argspec = inspect.getargspec(f) + else: + argspec = inspect.getfullargspec(f) if evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF and len(argspec.args) == 0 and \ argspec.varargs is None:
From 88dd335f6f36ce68862d33720959aaf62742f86d Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sun, 4 Mar 2018 21:22:30 -0800 Subject: [PATCH 328/566] [MINOR][DOCS] Fix a link in "Compatibility with Apache Hive" ## What changes were proposed in this pull request? This PR fixes a broken link as below: **Before:** (screenshot of the rendered page with the broken link) **After:** (screenshot with the link rendered correctly) Also see https://spark.apache.org/docs/2.3.0/sql-programming-guide.html#compatibility-with-apache-hive ## How was this patch tested? Manually tested. I checked the same instances in `docs` directory. Seems this is the only one. Author: hyukjinkwon Closes #20733 from HyukjinKwon/minor-link. (cherry picked from commit 269cd53590dd155aeb5269efc909a6e228f21e22) Signed-off-by: gatorsmile --- docs/sql-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 91e43678481d6..06e5cedc53fe5 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -2219,7 +2219,7 @@ referencing a singleton. Spark SQL is designed to be compatible with the Hive Metastore, SerDes and UDFs. Currently Hive SerDes and UDFs are based on Hive 1.2.1, and Spark SQL can be connected to different versions of Hive Metastore -(from 0.12.0 to 2.1.1. Also see [Interacting with Different Versions of Hive Metastore] (#interacting-with-different-versions-of-hive-metastore)). +(from 0.12.0 to 2.1.1. Also see [Interacting with Different Versions of Hive Metastore](#interacting-with-different-versions-of-hive-metastore)). #### Deploying in Existing Hive Warehouses
From 232b9f81f02ec00fc698f610ecc1ca25740e8802 Mon Sep 17 00:00:00 2001 From: Mihaly Toth Date: Mon, 5 Mar 2018 23:46:40 +0900 Subject: [PATCH 329/566] [SPARK-23329][SQL] Fix documentation of trigonometric functions ## What changes were proposed in this pull request? Provide more details in trigonometric function documentations. Referenced `java.lang.Math` for further details in the descriptions. ## How was this patch tested? Ran full build, checked generated documentation manually Author: Mihaly Toth Closes #20618 from misutoth/trigonometric-doc.
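For reference, a small usage sketch (illustrative only, not part of this patch) of the functions whose documentation is clarified here; it assumes a running `SparkSession` named `spark`, e.g. in spark-shell:

```scala
import org.apache.spark.sql.functions.{atan2, col, cos, degrees}
import spark.implicits._

val df = Seq((1.0, 1.0), (0.0, 2.0)).toDF("x", "y")
df.select(
  atan2(col("y"), col("x")).as("theta"),             // angle of the point (x, y) in radians, per java.lang.Math.atan2
  degrees(atan2(col("y"), col("x"))).as("thetaDeg"), // the same angle converted to degrees
  cos(col("x")).as("cosX")                           // cos interprets x as an angle in radians
).show()
```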
(cherry picked from commit a366b950b90650693ad0eb1e5b9a988ad028d845) Signed-off-by: hyukjinkwon --- R/pkg/R/functions.R | 34 ++-- python/pyspark/sql/functions.py | 62 ++++--- .../expressions/mathExpressions.scala | 99 ++++++++--- .../org/apache/spark/sql/functions.scala | 160 ++++++++++++------ 4 files changed, 248 insertions(+), 107 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 9f7c6317cd924..29ee146ab14f9 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -278,8 +278,8 @@ setMethod("abs", }) #' @details -#' \code{acos}: Computes the cosine inverse of the given value; the returned angle is in -#' the range 0.0 through pi. +#' \code{acos}: Returns the inverse cosine of the given value, +#' as if computed by \code{java.lang.Math.acos()} #' #' @rdname column_math_functions #' @export @@ -334,8 +334,8 @@ setMethod("ascii", }) #' @details -#' \code{asin}: Computes the sine inverse of the given value; the returned angle is in -#' the range -pi/2 through pi/2. +#' \code{asin}: Returns the inverse sine of the given value, +#' as if computed by \code{java.lang.Math.asin()} #' #' @rdname column_math_functions #' @export @@ -349,8 +349,8 @@ setMethod("asin", }) #' @details -#' \code{atan}: Computes the tangent inverse of the given value; the returned angle is in the range -#' -pi/2 through pi/2. +#' \code{atan}: Returns the inverse tangent of the given value, +#' as if computed by \code{java.lang.Math.atan()} #' #' @rdname column_math_functions #' @export @@ -613,7 +613,8 @@ setMethod("covar_pop", signature(col1 = "characterOrColumn", col2 = "characterOr }) #' @details -#' \code{cos}: Computes the cosine of the given value. Units in radians. +#' \code{cos}: Returns the cosine of the given value, +#' as if computed by \code{java.lang.Math.cos()}. Units in radians. #' #' @rdname column_math_functions #' @aliases cos cos,Column-method @@ -627,7 +628,8 @@ setMethod("cos", }) #' @details -#' \code{cosh}: Computes the hyperbolic cosine of the given value. +#' \code{cosh}: Returns the hyperbolic cosine of the given value, +#' as if computed by \code{java.lang.Math.cosh()}. #' #' @rdname column_math_functions #' @aliases cosh cosh,Column-method @@ -1463,7 +1465,8 @@ setMethod("sign", signature(x = "Column"), }) #' @details -#' \code{sin}: Computes the sine of the given value. Units in radians. +#' \code{sin}: Returns the sine of the given value, +#' as if computed by \code{java.lang.Math.sin()}. Units in radians. #' #' @rdname column_math_functions #' @aliases sin sin,Column-method @@ -1477,7 +1480,8 @@ setMethod("sin", }) #' @details -#' \code{sinh}: Computes the hyperbolic sine of the given value. +#' \code{sinh}: Returns the hyperbolic sine of the given value, +#' as if computed by \code{java.lang.Math.sinh()}. #' #' @rdname column_math_functions #' @aliases sinh sinh,Column-method @@ -1653,7 +1657,9 @@ setMethod("sumDistinct", }) #' @details -#' \code{tan}: Computes the tangent of the given value. Units in radians. +#' \code{tan}: Returns the tangent of the given value, +#' as if computed by \code{java.lang.Math.tan()}. +#' Units in radians. #' #' @rdname column_math_functions #' @aliases tan tan,Column-method @@ -1667,7 +1673,8 @@ setMethod("tan", }) #' @details -#' \code{tanh}: Computes the hyperbolic tangent of the given value. +#' \code{tanh}: Returns the hyperbolic tangent of the given value, +#' as if computed by \code{java.lang.Math.tanh()}. 
#' #' @rdname column_math_functions #' @aliases tanh tanh,Column-method @@ -1973,7 +1980,8 @@ setMethod("year", #' @details #' \code{atan2}: Returns the angle theta from the conversion of rectangular coordinates -#' (x, y) to polar coordinates (r, theta). Units in radians. +#' (x, y) to polar coordinates (r, theta), +#' as if computed by \code{java.lang.Math.atan2()}. Units in radians. #' #' @rdname column_math_functions #' @aliases atan2 atan2,Column-method diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 52fab7c086efa..316aa85550ad0 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -106,18 +106,15 @@ def _(): _functions_1_4 = { # unary math functions - 'acos': 'Computes the cosine inverse of the given value; the returned angle is in the range' + - '0.0 through pi.', - 'asin': 'Computes the sine inverse of the given value; the returned angle is in the range' + - '-pi/2 through pi/2.', - 'atan': 'Computes the tangent inverse of the given value; the returned angle is in the range' + - '-pi/2 through pi/2', + 'acos': ':return: inverse cosine of `col`, as if computed by `java.lang.Math.acos()`', + 'asin': ':return: inverse sine of `col`, as if computed by `java.lang.Math.asin()`', + 'atan': ':return: inverse tangent of `col`, as if computed by `java.lang.Math.atan()`', 'cbrt': 'Computes the cube-root of the given value.', 'ceil': 'Computes the ceiling of the given value.', - 'cos': """Computes the cosine of the given value. - - :param col: :class:`DoubleType` column, units in radians.""", - 'cosh': 'Computes the hyperbolic cosine of the given value.', + 'cos': """:param col: angle in radians + :return: cosine of the angle, as if computed by `java.lang.Math.cos()`.""", + 'cosh': """:param col: hyperbolic angle + :return: hyperbolic cosine of the angle, as if computed by `java.lang.Math.cosh()`""", 'exp': 'Computes the exponential of the given value.', 'expm1': 'Computes the exponential of the given value minus one.', 'floor': 'Computes the floor of the given value.', @@ -127,14 +124,16 @@ def _(): 'rint': 'Returns the double value that is closest in value to the argument and' + ' is equal to a mathematical integer.', 'signum': 'Computes the signum of the given value.', - 'sin': """Computes the sine of the given value. - - :param col: :class:`DoubleType` column, units in radians.""", - 'sinh': 'Computes the hyperbolic sine of the given value.', - 'tan': """Computes the tangent of the given value. - - :param col: :class:`DoubleType` column, units in radians.""", - 'tanh': 'Computes the hyperbolic tangent of the given value.', + 'sin': """:param col: angle in radians + :return: sine of the angle, as if computed by `java.lang.Math.sin()`""", + 'sinh': """:param col: hyperbolic angle + :return: hyperbolic sine of the given value, + as if computed by `java.lang.Math.sinh()`""", + 'tan': """:param col: angle in radians + :return: tangent of the given value, as if computed by `java.lang.Math.tan()`""", + 'tanh': """:param col: hyperbolic angle + :return: hyperbolic tangent of the given value, + as if computed by `java.lang.Math.tanh()`""", 'toDegrees': '.. note:: Deprecated in 2.1, use :func:`degrees` instead.', 'toRadians': '.. 
note:: Deprecated in 2.1, use :func:`radians` instead.', 'bitwiseNOT': 'Computes bitwise not.', @@ -173,16 +172,31 @@ def _(): _functions_2_1 = { # unary math functions - 'degrees': 'Converts an angle measured in radians to an approximately equivalent angle ' + - 'measured in degrees.', - 'radians': 'Converts an angle measured in degrees to an approximately equivalent angle ' + - 'measured in radians.', + 'degrees': """ + Converts an angle measured in radians to an approximately equivalent angle + measured in degrees. + :param col: angle in radians + :return: angle in degrees, as if computed by `java.lang.Math.toDegrees()` + """, + 'radians': """ + Converts an angle measured in degrees to an approximately equivalent angle + measured in radians. + :param col: angle in degrees + :return: angle in radians, as if computed by `java.lang.Math.toRadians()` + """, } # math functions that take two arguments as input _binary_mathfunctions = { - 'atan2': 'Returns the angle theta from the conversion of rectangular coordinates (x, y) to' + - 'polar coordinates (r, theta). Units in radians.', + 'atan2': """ + :param col1: coordinate on y-axis + :param col2: coordinate on x-axis + :return: the `theta` component of the point + (`r`, `theta`) + in polar coordinates that corresponds to the point + (`x`, `y`) in Cartesian coordinates, + as if computed by `java.lang.Math.atan2()` + """, 'hypot': 'Computes ``sqrt(a^2 + b^2)`` without intermediate overflow or underflow.', 'pow': 'Returns the value of the first argument raised to the power of the second argument.', } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala index d8dc0862f1141..816ae9ae74729 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala @@ -168,9 +168,11 @@ case class Pi() extends LeafMathExpression(math.Pi, "PI") //////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// -// scalastyle:off line.size.limit @ExpressionDescription( - usage = "_FUNC_(expr) - Returns the inverse cosine (a.k.a. arccosine) of `expr` if -1<=`expr`<=1 or NaN otherwise.", + usage = """ + _FUNC_(expr) - Returns the inverse cosine (a.k.a. arc cosine) of `expr`, as if computed by + `java.lang.Math._FUNC_`. + """, examples = """ Examples: > SELECT _FUNC_(1); @@ -178,12 +180,13 @@ case class Pi() extends LeafMathExpression(math.Pi, "PI") > SELECT _FUNC_(2); NaN """) -// scalastyle:on line.size.limit case class Acos(child: Expression) extends UnaryMathExpression(math.acos, "ACOS") -// scalastyle:off line.size.limit @ExpressionDescription( - usage = "_FUNC_(expr) - Returns the inverse sine (a.k.a. arcsine) the arc sin of `expr` if -1<=`expr`<=1 or NaN otherwise.", + usage = """ + _FUNC_(expr) - Returns the inverse sine (a.k.a. arc sine) the arc sin of `expr`, + as if computed by `java.lang.Math._FUNC_`. 
+ """, examples = """ Examples: > SELECT _FUNC_(0); @@ -191,18 +194,18 @@ case class Acos(child: Expression) extends UnaryMathExpression(math.acos, "ACOS" > SELECT _FUNC_(2); NaN """) -// scalastyle:on line.size.limit case class Asin(child: Expression) extends UnaryMathExpression(math.asin, "ASIN") -// scalastyle:off line.size.limit @ExpressionDescription( - usage = "_FUNC_(expr) - Returns the inverse tangent (a.k.a. arctangent).", + usage = """ + _FUNC_(expr) - Returns the inverse tangent (a.k.a. arc tangent) of `expr`, as if computed by + `java.lang.Math._FUNC_` + """, examples = """ Examples: > SELECT _FUNC_(0); 0.0 """) -// scalastyle:on line.size.limit case class Atan(child: Expression) extends UnaryMathExpression(math.atan, "ATAN") @ExpressionDescription( @@ -252,7 +255,14 @@ case class Ceil(child: Expression) extends UnaryMathExpression(math.ceil, "CEIL" } @ExpressionDescription( - usage = "_FUNC_(expr) - Returns the cosine of `expr`.", + usage = """ + _FUNC_(expr) - Returns the cosine of `expr`, as if computed by + `java.lang.Math._FUNC_`. + """, + arguments = """ + Arguments: + * expr - angle in radians + """, examples = """ Examples: > SELECT _FUNC_(0); @@ -261,7 +271,14 @@ case class Ceil(child: Expression) extends UnaryMathExpression(math.ceil, "CEIL" case class Cos(child: Expression) extends UnaryMathExpression(math.cos, "COS") @ExpressionDescription( - usage = "_FUNC_(expr) - Returns the hyperbolic cosine of `expr`.", + usage = """ + _FUNC_(expr) - Returns the hyperbolic cosine of `expr`, as if computed by + `java.lang.Math._FUNC_`. + """, + arguments = """ + Arguments: + * expr - hyperbolic angle + """, examples = """ Examples: > SELECT _FUNC_(0); @@ -512,7 +529,11 @@ case class Rint(child: Expression) extends UnaryMathExpression(math.rint, "ROUND case class Signum(child: Expression) extends UnaryMathExpression(math.signum, "SIGNUM") @ExpressionDescription( - usage = "_FUNC_(expr) - Returns the sine of `expr`.", + usage = "_FUNC_(expr) - Returns the sine of `expr`, as if computed by `java.lang.Math._FUNC_`.", + arguments = """ + Arguments: + * expr - angle in radians + """, examples = """ Examples: > SELECT _FUNC_(0); @@ -521,7 +542,13 @@ case class Signum(child: Expression) extends UnaryMathExpression(math.signum, "S case class Sin(child: Expression) extends UnaryMathExpression(math.sin, "SIN") @ExpressionDescription( - usage = "_FUNC_(expr) - Returns the hyperbolic sine of `expr`.", + usage = """ + _FUNC_(expr) - Returns hyperbolic sine of `expr`, as if computed by `java.lang.Math._FUNC_`. + """, + arguments = """ + Arguments: + * expr - hyperbolic angle + """, examples = """ Examples: > SELECT _FUNC_(0); @@ -539,7 +566,13 @@ case class Sinh(child: Expression) extends UnaryMathExpression(math.sinh, "SINH" case class Sqrt(child: Expression) extends UnaryMathExpression(math.sqrt, "SQRT") @ExpressionDescription( - usage = "_FUNC_(expr) - Returns the tangent of `expr`.", + usage = """ + _FUNC_(expr) - Returns the tangent of `expr`, as if computed by `java.lang.Math._FUNC_`. + """, + arguments = """ + Arguments: + * expr - angle in radians + """, examples = """ Examples: > SELECT _FUNC_(0); @@ -548,7 +581,13 @@ case class Sqrt(child: Expression) extends UnaryMathExpression(math.sqrt, "SQRT" case class Tan(child: Expression) extends UnaryMathExpression(math.tan, "TAN") @ExpressionDescription( - usage = "_FUNC_(expr) - Returns the cotangent of `expr`.", + usage = """ + _FUNC_(expr) - Returns the cotangent of `expr`, as if computed by `1/java.lang.Math._FUNC_`. 
+ """, + arguments = """ + Arguments: + * expr - angle in radians + """, examples = """ Examples: > SELECT _FUNC_(1); @@ -562,7 +601,14 @@ case class Cot(child: Expression) } @ExpressionDescription( - usage = "_FUNC_(expr) - Returns the hyperbolic tangent of `expr`.", + usage = """ + _FUNC_(expr) - Returns the hyperbolic tangent of `expr`, as if computed by + `java.lang.Math._FUNC_`. + """, + arguments = """ + Arguments: + * expr - hyperbolic angle + """, examples = """ Examples: > SELECT _FUNC_(0); @@ -572,6 +618,10 @@ case class Tanh(child: Expression) extends UnaryMathExpression(math.tanh, "TANH" @ExpressionDescription( usage = "_FUNC_(expr) - Converts radians to degrees.", + arguments = """ + Arguments: + * expr - angle in radians + """, examples = """ Examples: > SELECT _FUNC_(3.141592653589793); @@ -583,6 +633,10 @@ case class ToDegrees(child: Expression) extends UnaryMathExpression(math.toDegre @ExpressionDescription( usage = "_FUNC_(expr) - Converts degrees to radians.", + arguments = """ + Arguments: + * expr - angle in degrees + """, examples = """ Examples: > SELECT _FUNC_(180); @@ -768,15 +822,22 @@ case class Unhex(child: Expression) extends UnaryExpression with ImplicitCastInp //////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// -// scalastyle:off line.size.limit @ExpressionDescription( - usage = "_FUNC_(expr1, expr2) - Returns the angle in radians between the positive x-axis of a plane and the point given by the coordinates (`expr1`, `expr2`).", + usage = """ + _FUNC_(exprY, exprX) - Returns the angle in radians between the positive x-axis of a plane + and the point given by the coordinates (`exprX`, `exprY`), as if computed by + `java.lang.Math._FUNC_`. + """, + arguments = """ + Arguments: + * exprY - coordinate on y-axis + * exprX - coordinate on x-axis + """, examples = """ Examples: > SELECT _FUNC_(0, 0); 0.0 """) -// scalastyle:on line.size.limit case class Atan2(left: Expression, right: Expression) extends BinaryMathExpression(math.atan2, "ATAN2") { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 0d54c02c3d06f..c9ca9a8996344 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -1313,8 +1313,7 @@ object functions { ////////////////////////////////////////////////////////////////////////////////////////////// /** - * Computes the cosine inverse of the given value; the returned angle is in the range - * 0.0 through pi. + * @return inverse cosine of `e` in radians, as if computed by `java.lang.Math.acos` * * @group math_funcs * @since 1.4.0 @@ -1322,8 +1321,7 @@ object functions { def acos(e: Column): Column = withExpr { Acos(e.expr) } /** - * Computes the cosine inverse of the given column; the returned angle is in the range - * 0.0 through pi. + * @return inverse cosine of `columnName`, as if computed by `java.lang.Math.acos` * * @group math_funcs * @since 1.4.0 @@ -1331,8 +1329,7 @@ object functions { def acos(columnName: String): Column = acos(Column(columnName)) /** - * Computes the sine inverse of the given value; the returned angle is in the range - * -pi/2 through pi/2. 
+ * @return inverse sine of `e` in radians, as if computed by `java.lang.Math.asin` * * @group math_funcs * @since 1.4.0 @@ -1340,8 +1337,7 @@ object functions { def asin(e: Column): Column = withExpr { Asin(e.expr) } /** - * Computes the sine inverse of the given column; the returned angle is in the range - * -pi/2 through pi/2. + * @return inverse sine of `columnName`, as if computed by `java.lang.Math.asin` * * @group math_funcs * @since 1.4.0 @@ -1349,8 +1345,7 @@ object functions { def asin(columnName: String): Column = asin(Column(columnName)) /** - * Computes the tangent inverse of the given column; the returned angle is in the range - * -pi/2 through pi/2 + * @return inverse tangent of `e`, as if computed by `java.lang.Math.atan` * * @group math_funcs * @since 1.4.0 @@ -1358,8 +1353,7 @@ object functions { def atan(e: Column): Column = withExpr { Atan(e.expr) } /** - * Computes the tangent inverse of the given column; the returned angle is in the range - * -pi/2 through pi/2 + * @return inverse tangent of `columnName`, as if computed by `java.lang.Math.atan` * * @group math_funcs * @since 1.4.0 @@ -1367,77 +1361,117 @@ object functions { def atan(columnName: String): Column = atan(Column(columnName)) /** - * Returns the angle theta from the conversion of rectangular coordinates (x, y) to - * polar coordinates (r, theta). Units in radians. + * @param y coordinate on y-axis + * @param x coordinate on x-axis + * @return the theta component of the point + * (r, theta) + * in polar coordinates that corresponds to the point + * (x, y) in Cartesian coordinates, + * as if computed by `java.lang.Math.atan2` * * @group math_funcs * @since 1.4.0 */ - def atan2(l: Column, r: Column): Column = withExpr { Atan2(l.expr, r.expr) } + def atan2(y: Column, x: Column): Column = withExpr { Atan2(y.expr, x.expr) } /** - * Returns the angle theta from the conversion of rectangular coordinates (x, y) to - * polar coordinates (r, theta). + * @param y coordinate on y-axis + * @param xName coordinate on x-axis + * @return the theta component of the point + * (r, theta) + * in polar coordinates that corresponds to the point + * (x, y) in Cartesian coordinates, + * as if computed by `java.lang.Math.atan2` * * @group math_funcs * @since 1.4.0 */ - def atan2(l: Column, rightName: String): Column = atan2(l, Column(rightName)) + def atan2(y: Column, xName: String): Column = atan2(y, Column(xName)) /** - * Returns the angle theta from the conversion of rectangular coordinates (x, y) to - * polar coordinates (r, theta). + * @param yName coordinate on y-axis + * @param x coordinate on x-axis + * @return the theta component of the point + * (r, theta) + * in polar coordinates that corresponds to the point + * (x, y) in Cartesian coordinates, + * as if computed by `java.lang.Math.atan2` * * @group math_funcs * @since 1.4.0 */ - def atan2(leftName: String, r: Column): Column = atan2(Column(leftName), r) + def atan2(yName: String, x: Column): Column = atan2(Column(yName), x) /** - * Returns the angle theta from the conversion of rectangular coordinates (x, y) to - * polar coordinates (r, theta). 
+ * @param yName coordinate on y-axis + * @param xName coordinate on x-axis + * @return the theta component of the point + * (r, theta) + * in polar coordinates that corresponds to the point + * (x, y) in Cartesian coordinates, + * as if computed by `java.lang.Math.atan2` * * @group math_funcs * @since 1.4.0 */ - def atan2(leftName: String, rightName: String): Column = - atan2(Column(leftName), Column(rightName)) + def atan2(yName: String, xName: String): Column = + atan2(Column(yName), Column(xName)) /** - * Returns the angle theta from the conversion of rectangular coordinates (x, y) to - * polar coordinates (r, theta). + * @param y coordinate on y-axis + * @param xValue coordinate on x-axis + * @return the theta component of the point + * (r, theta) + * in polar coordinates that corresponds to the point + * (x, y) in Cartesian coordinates, + * as if computed by `java.lang.Math.atan2` * * @group math_funcs * @since 1.4.0 */ - def atan2(l: Column, r: Double): Column = atan2(l, lit(r)) + def atan2(y: Column, xValue: Double): Column = atan2(y, lit(xValue)) /** - * Returns the angle theta from the conversion of rectangular coordinates (x, y) to - * polar coordinates (r, theta). + * @param yName coordinate on y-axis + * @param xValue coordinate on x-axis + * @return the theta component of the point + * (r, theta) + * in polar coordinates that corresponds to the point + * (x, y) in Cartesian coordinates, + * as if computed by `java.lang.Math.atan2` * * @group math_funcs * @since 1.4.0 */ - def atan2(leftName: String, r: Double): Column = atan2(Column(leftName), r) + def atan2(yName: String, xValue: Double): Column = atan2(Column(yName), xValue) /** - * Returns the angle theta from the conversion of rectangular coordinates (x, y) to - * polar coordinates (r, theta). + * @param yValue coordinate on y-axis + * @param x coordinate on x-axis + * @return the theta component of the point + * (r, theta) + * in polar coordinates that corresponds to the point + * (x, y) in Cartesian coordinates, + * as if computed by `java.lang.Math.atan2` * * @group math_funcs * @since 1.4.0 */ - def atan2(l: Double, r: Column): Column = atan2(lit(l), r) + def atan2(yValue: Double, x: Column): Column = atan2(lit(yValue), x) /** - * Returns the angle theta from the conversion of rectangular coordinates (x, y) to - * polar coordinates (r, theta). + * @param yValue coordinate on y-axis + * @param xName coordinate on x-axis + * @return the theta component of the point + * (r, theta) + * in polar coordinates that corresponds to the point + * (x, y) in Cartesian coordinates, + * as if computed by `java.lang.Math.atan2` * * @group math_funcs * @since 1.4.0 */ - def atan2(l: Double, rightName: String): Column = atan2(l, Column(rightName)) + def atan2(yValue: Double, xName: String): Column = atan2(yValue, Column(xName)) /** * An expression that returns the string representation of the binary value of the given long @@ -1500,7 +1534,8 @@ object functions { } /** - * Computes the cosine of the given value. Units in radians. + * @param e angle in radians + * @return cosine of the angle, as if computed by `java.lang.Math.cos` * * @group math_funcs * @since 1.4.0 @@ -1508,7 +1543,8 @@ object functions { def cos(e: Column): Column = withExpr { Cos(e.expr) } /** - * Computes the cosine of the given column. 
+ * @param columnName angle in radians + * @return cosine of the angle, as if computed by `java.lang.Math.cos` * * @group math_funcs * @since 1.4.0 @@ -1516,7 +1552,8 @@ object functions { def cos(columnName: String): Column = cos(Column(columnName)) /** - * Computes the hyperbolic cosine of the given value. + * @param e hyperbolic angle + * @return hyperbolic cosine of the angle, as if computed by `java.lang.Math.cosh` * * @group math_funcs * @since 1.4.0 @@ -1524,7 +1561,8 @@ object functions { def cosh(e: Column): Column = withExpr { Cosh(e.expr) } /** - * Computes the hyperbolic cosine of the given column. + * @param columnName hyperbolic angle + * @return hyperbolic cosine of the angle, as if computed by `java.lang.Math.cosh` * * @group math_funcs * @since 1.4.0 @@ -1967,7 +2005,8 @@ object functions { def signum(columnName: String): Column = signum(Column(columnName)) /** - * Computes the sine of the given value. Units in radians. + * @param e angle in radians + * @return sine of the angle, as if computed by `java.lang.Math.sin` * * @group math_funcs * @since 1.4.0 @@ -1975,7 +2014,8 @@ object functions { def sin(e: Column): Column = withExpr { Sin(e.expr) } /** - * Computes the sine of the given column. + * @param columnName angle in radians + * @return sine of the angle, as if computed by `java.lang.Math.sin` * * @group math_funcs * @since 1.4.0 @@ -1983,7 +2023,8 @@ object functions { def sin(columnName: String): Column = sin(Column(columnName)) /** - * Computes the hyperbolic sine of the given value. + * @param e hyperbolic angle + * @return hyperbolic sine of the given value, as if computed by `java.lang.Math.sinh` * * @group math_funcs * @since 1.4.0 @@ -1991,7 +2032,8 @@ object functions { def sinh(e: Column): Column = withExpr { Sinh(e.expr) } /** - * Computes the hyperbolic sine of the given column. + * @param columnName hyperbolic angle + * @return hyperbolic sine of the given value, as if computed by `java.lang.Math.sinh` * * @group math_funcs * @since 1.4.0 @@ -1999,7 +2041,8 @@ object functions { def sinh(columnName: String): Column = sinh(Column(columnName)) /** - * Computes the tangent of the given value. Units in radians. + * @param e angle in radians + * @return tangent of the given value, as if computed by `java.lang.Math.tan` * * @group math_funcs * @since 1.4.0 @@ -2007,7 +2050,8 @@ object functions { def tan(e: Column): Column = withExpr { Tan(e.expr) } /** - * Computes the tangent of the given column. + * @param columnName angle in radians + * @return tangent of the given value, as if computed by `java.lang.Math.tan` * * @group math_funcs * @since 1.4.0 @@ -2015,7 +2059,8 @@ object functions { def tan(columnName: String): Column = tan(Column(columnName)) /** - * Computes the hyperbolic tangent of the given value. + * @param e hyperbolic angle + * @return hyperbolic tangent of the given value, as if computed by `java.lang.Math.tanh` * * @group math_funcs * @since 1.4.0 @@ -2023,7 +2068,8 @@ object functions { def tanh(e: Column): Column = withExpr { Tanh(e.expr) } /** - * Computes the hyperbolic tangent of the given column. + * @param columnName hyperbolic angle + * @return hyperbolic tangent of the given value, as if computed by `java.lang.Math.tanh` * * @group math_funcs * @since 1.4.0 @@ -2047,6 +2093,9 @@ object functions { /** * Converts an angle measured in radians to an approximately equivalent angle measured in degrees. 
* + * @param e angle in radians + * @return angle in degrees, as if computed by `java.lang.Math.toDegrees` + * * @group math_funcs * @since 2.1.0 */ @@ -2055,6 +2104,9 @@ object functions { /** * Converts an angle measured in radians to an approximately equivalent angle measured in degrees. * + * @param columnName angle in radians + * @return angle in degrees, as if computed by `java.lang.Math.toDegrees` + * * @group math_funcs * @since 2.1.0 */ @@ -2077,6 +2129,9 @@ object functions { /** * Converts an angle measured in degrees to an approximately equivalent angle measured in radians. * + * @param e angle in degrees + * @return angle in radians, as if computed by `java.lang.Math.toRadians` + * * @group math_funcs * @since 2.1.0 */ @@ -2085,6 +2140,9 @@ object functions { /** * Converts an angle measured in degrees to an approximately equivalent angle measured in radians. * + * @param columnName angle in degrees + * @return angle in radians, as if computed by `java.lang.Math.toRadians` + * * @group math_funcs * @since 2.1.0 */ @@ -2873,7 +2931,7 @@ object functions { * or equal to the `windowDuration`. Check * `org.apache.spark.unsafe.types.CalendarInterval` for valid duration * identifiers. This duration is likewise absolute, and does not vary - * according to a calendar. + * according to a calendar. * @param startTime The offset with respect to 1970-01-01 00:00:00 UTC with which to start * window intervals. For example, in order to have hourly tumbling windows that * start 15 minutes past the hour, e.g. 12:15-13:15, 13:15-14:15... provide @@ -2929,7 +2987,7 @@ object functions { * or equal to the `windowDuration`. Check * `org.apache.spark.unsafe.types.CalendarInterval` for valid duration * identifiers. This duration is likewise absolute, and does not vary - * according to a calendar. + * according to a calendar. * * @group datetime_funcs * @since 2.0.0 From 4550673b1a94e9023a0c6fdc6a92e4b860e1cfb2 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Mon, 5 Mar 2018 10:50:00 -0800 Subject: [PATCH 330/566] [SPARK-22882][ML][TESTS] ML test for structured streaming: ml.classification ## What changes were proposed in this pull request? adding Structured Streaming tests for all Models/Transformers in spark.ml.classification ## How was this patch tested? N/A Author: WeichenXu Closes #20121 from WeichenXu123/ml_stream_test_classification. (cherry picked from commit 98a5c0a35f0a24730f5074522939acf57ef95422) Signed-off-by: Joseph K. 
Bradley --- .../DecisionTreeClassifierSuite.scala | 29 +-- .../classification/GBTClassifierSuite.scala | 77 ++---- .../ml/classification/LinearSVCSuite.scala | 15 +- .../LogisticRegressionSuite.scala | 229 +++++++----------- .../MultilayerPerceptronClassifierSuite.scala | 44 ++-- .../ml/classification/NaiveBayesSuite.scala | 47 ++-- .../ml/classification/OneVsRestSuite.scala | 21 +- .../ProbabilisticClassifierSuite.scala | 29 +-- .../RandomForestClassifierSuite.scala | 16 +- 9 files changed, 202 insertions(+), 305 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala index 98c879ece62d6..1968041aaf161 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala @@ -23,15 +23,14 @@ import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.tree.{CategoricalSplit, InternalNode, LeafNode} import org.apache.spark.ml.tree.impl.TreeTests -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} -import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree, DecisionTreeSuite => OldDecisionTreeSuite} -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree, + DecisionTreeSuite => OldDecisionTreeSuite} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} -class DecisionTreeClassifierSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class DecisionTreeClassifierSuite extends MLTest with DefaultReadWriteTest { import DecisionTreeClassifierSuite.compareAPIs import testImplicits._ @@ -251,20 +250,18 @@ class DecisionTreeClassifierSuite MLTestingUtils.checkCopyAndUids(dt, newTree) - val predictions = newTree.transform(newData) - .select(newTree.getPredictionCol, newTree.getRawPredictionCol, newTree.getProbabilityCol) - .collect() - - predictions.foreach { case Row(pred: Double, rawPred: Vector, probPred: Vector) => - assert(pred === rawPred.argmax, - s"Expected prediction $pred but calculated ${rawPred.argmax} from rawPrediction.") - val sum = rawPred.toArray.sum - assert(Vectors.dense(rawPred.toArray.map(_ / sum)) === probPred, - "probability prediction mismatch") + testTransformer[(Vector, Double)](newData, newTree, + "prediction", "rawPrediction", "probability") { + case Row(pred: Double, rawPred: Vector, probPred: Vector) => + assert(pred === rawPred.argmax, + s"Expected prediction $pred but calculated ${rawPred.argmax} from rawPrediction.") + val sum = rawPred.toArray.sum + assert(Vectors.dense(rawPred.toArray.map(_ / sum)) === probPred, + "probability prediction mismatch") } ProbabilisticClassifierSuite.testPredictMethods[ - Vector, DecisionTreeClassificationModel](newTree, newData) + Vector, DecisionTreeClassificationModel](this, newTree, newData) } test("training with 1-category categorical feature") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala index 978f89c459f0a..092b4a01d5b0d 100644 --- 
a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala @@ -26,13 +26,12 @@ import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.regression.DecisionTreeRegressionModel import org.apache.spark.ml.tree.LeafNode import org.apache.spark.ml.tree.impl.TreeTests -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} import org.apache.spark.mllib.tree.{EnsembleTestHelper, GradientBoostedTrees => OldGBT} import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo} import org.apache.spark.mllib.tree.loss.LogLoss -import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.util.Utils @@ -40,8 +39,7 @@ import org.apache.spark.util.Utils /** * Test suite for [[GBTClassifier]]. */ -class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext - with DefaultReadWriteTest { +class GBTClassifierSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ import GBTClassifierSuite.compareAPIs @@ -126,14 +124,15 @@ class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext // should predict all zeros binaryModel.setThresholds(Array(0.0, 1.0)) - val binaryZeroPredictions = binaryModel.transform(df).select("prediction").collect() - assert(binaryZeroPredictions.forall(_.getDouble(0) === 0.0)) + testTransformer[(Double, Vector)](df, binaryModel, "prediction") { + case Row(prediction: Double) => prediction === 0.0 + } // should predict all ones binaryModel.setThresholds(Array(1.0, 0.0)) - val binaryOnePredictions = binaryModel.transform(df).select("prediction").collect() - assert(binaryOnePredictions.forall(_.getDouble(0) === 1.0)) - + testTransformer[(Double, Vector)](df, binaryModel, "prediction") { + case Row(prediction: Double) => prediction === 1.0 + } val gbtBase = new GBTClassifier val model = gbtBase.fit(df) @@ -141,15 +140,18 @@ class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext // constant threshold scaling is the same as no thresholds binaryModel.setThresholds(Array(1.0, 1.0)) - val scaledPredictions = binaryModel.transform(df).select("prediction").collect() - assert(scaledPredictions.zip(basePredictions).forall { case (scaled, base) => - scaled.getDouble(0) === base.getDouble(0) - }) + testTransformerByGlobalCheckFunc[(Double, Vector)](df, binaryModel, "prediction") { + scaledPredictions: Seq[Row] => + assert(scaledPredictions.zip(basePredictions).forall { case (scaled, base) => + scaled.getDouble(0) === base.getDouble(0) + }) + } // force it to use the predict method model.setRawPredictionCol("").setProbabilityCol("").setThresholds(Array(0, 1)) - val predictionsWithPredict = model.transform(df).select("prediction").collect() - assert(predictionsWithPredict.forall(_.getDouble(0) === 0.0)) + testTransformer[(Double, Vector)](df, model, "prediction") { + case Row(prediction: Double) => prediction === 0.0 + } } test("GBTClassifier: Predictor, Classifier methods") { @@ -169,61 +171,30 @@ class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext val blas = BLAS.getInstance() val validationDataset = validationData.toDF(labelCol, featuresCol) - val results = 
gbtModel.transform(validationDataset) - // check that raw prediction is tree predictions dot tree weights - results.select(rawPredictionCol, featuresCol).collect().foreach { - case Row(raw: Vector, features: Vector) => + testTransformer[(Double, Vector)](validationDataset, gbtModel, + "rawPrediction", "features", "probability", "prediction") { + case Row(raw: Vector, features: Vector, prob: Vector, pred: Double) => assert(raw.size === 2) + // check that raw prediction is tree predictions dot tree weights val treePredictions = gbtModel.trees.map(_.rootNode.predictImpl(features).prediction) val prediction = blas.ddot(gbtModel.numTrees, treePredictions, 1, gbtModel.treeWeights, 1) assert(raw ~== Vectors.dense(-prediction, prediction) relTol eps) - } - // Compare rawPrediction with probability - results.select(rawPredictionCol, probabilityCol).collect().foreach { - case Row(raw: Vector, prob: Vector) => - assert(raw.size === 2) + // Compare rawPrediction with probability assert(prob.size === 2) // Note: we should check other loss types for classification if they are added val predFromRaw = raw.toDense.values.map(value => LogLoss.computeProbability(value)) assert(prob(0) ~== predFromRaw(0) relTol eps) assert(prob(1) ~== predFromRaw(1) relTol eps) assert(prob(0) + prob(1) ~== 1.0 absTol absEps) - } - // Compare prediction with probability - results.select(predictionCol, probabilityCol).collect().foreach { - case Row(pred: Double, prob: Vector) => + // Compare prediction with probability val predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2 assert(pred == predFromProb) } - // force it to use raw2prediction - gbtModel.setRawPredictionCol(rawPredictionCol).setProbabilityCol("") - val resultsUsingRaw2Predict = - gbtModel.transform(validationDataset).select(predictionCol).as[Double].collect() - resultsUsingRaw2Predict.zip(results.select(predictionCol).as[Double].collect()).foreach { - case (pred1, pred2) => assert(pred1 === pred2) - } - - // force it to use probability2prediction - gbtModel.setRawPredictionCol("").setProbabilityCol(probabilityCol) - val resultsUsingProb2Predict = - gbtModel.transform(validationDataset).select(predictionCol).as[Double].collect() - resultsUsingProb2Predict.zip(results.select(predictionCol).as[Double].collect()).foreach { - case (pred1, pred2) => assert(pred1 === pred2) - } - - // force it to use predict - gbtModel.setRawPredictionCol("").setProbabilityCol("") - val resultsUsingPredict = - gbtModel.transform(validationDataset).select(predictionCol).as[Double].collect() - resultsUsingPredict.zip(results.select(predictionCol).as[Double].collect()).foreach { - case (pred1, pred2) => assert(pred1 === pred2) - } - ProbabilisticClassifierSuite.testPredictMethods[ - Vector, GBTClassificationModel](gbtModel, validationDataset) + Vector, GBTClassificationModel](this, gbtModel, validationDataset) } test("GBT parameter stepSize should be in interval (0, 1]") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala index 41a5d22dd6283..a93825b8a812d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala @@ -21,20 +21,18 @@ import scala.util.Random import breeze.linalg.{DenseVector => BDV} -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.classification.LinearSVCSuite._ import org.apache.spark.ml.feature.{Instance, LabeledPoint} 
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.optim.aggregator.HingeAggregator import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions.udf -class LinearSVCSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class LinearSVCSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -141,10 +139,11 @@ class LinearSVCSuite extends SparkFunSuite with MLlibTestSparkContext with Defau threshold: Double, expected: Set[(Int, Double)]): Unit = { model.setThreshold(threshold) - val results = model.transform(df).select("id", "prediction").collect() - .map(r => (r.getInt(0), r.getDouble(1))) - .toSet - assert(results === expected, s"Failed for threshold = $threshold") + testTransformerByGlobalCheckFunc[(Int, Vector)](df, model, "id", "prediction") { + rows: Seq[Row] => + val results = rows.map(r => (r.getInt(0), r.getDouble(1))).toSet + assert(results === expected, s"Failed for threshold = $threshold") + } } def checkResults(threshold: Double, expected: Set[(Int, Double)]): Unit = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index a5f81a38face9..9987cbf6ba116 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -22,22 +22,20 @@ import scala.language.existentials import scala.util.Random import scala.util.control.Breaks._ -import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.SparkException import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.classification.LogisticRegressionSuite._ import org.apache.spark.ml.feature.{Instance, LabeledPoint} import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Matrix, SparseMatrix, Vector, Vectors} import org.apache.spark.ml.optim.aggregator.LogisticAggregator import org.apache.spark.ml.param.{ParamMap, ParamsSuite} -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions.{col, lit, rand} import org.apache.spark.sql.types.LongType -class LogisticRegressionSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -332,15 +330,14 @@ class LogisticRegressionSuite val binaryModel = blr.fit(smallBinaryDataset) binaryModel.setThreshold(1.0) - val binaryZeroPredictions = - binaryModel.transform(smallBinaryDataset).select("prediction").collect() - assert(binaryZeroPredictions.forall(_.getDouble(0) === 0.0)) + testTransformer[(Double, Vector)](smallBinaryDataset.toDF(), binaryModel, "prediction") { + row => assert(row.getDouble(0) === 0.0) + } binaryModel.setThreshold(0.0) - val binaryOnePredictions = - 
binaryModel.transform(smallBinaryDataset).select("prediction").collect() - assert(binaryOnePredictions.forall(_.getDouble(0) === 1.0)) - + testTransformer[(Double, Vector)](smallBinaryDataset.toDF(), binaryModel, "prediction") { + row => assert(row.getDouble(0) === 1.0) + } val mlr = new LogisticRegression().setFamily("multinomial") val model = mlr.fit(smallMultinomialDataset) @@ -348,31 +345,36 @@ class LogisticRegressionSuite // should predict all zeros model.setThresholds(Array(1, 1000, 1000)) - val zeroPredictions = model.transform(smallMultinomialDataset).select("prediction").collect() - assert(zeroPredictions.forall(_.getDouble(0) === 0.0)) + testTransformer[(Double, Vector)](smallMultinomialDataset.toDF(), model, "prediction") { + row => assert(row.getDouble(0) === 0.0) + } // should predict all ones model.setThresholds(Array(1000, 1, 1000)) - val onePredictions = model.transform(smallMultinomialDataset).select("prediction").collect() - assert(onePredictions.forall(_.getDouble(0) === 1.0)) + testTransformer[(Double, Vector)](smallMultinomialDataset.toDF(), model, "prediction") { + row => assert(row.getDouble(0) === 1.0) + } // should predict all twos model.setThresholds(Array(1000, 1000, 1)) - val twoPredictions = model.transform(smallMultinomialDataset).select("prediction").collect() - assert(twoPredictions.forall(_.getDouble(0) === 2.0)) + testTransformer[(Double, Vector)](smallMultinomialDataset.toDF(), model, "prediction") { + row => assert(row.getDouble(0) === 2.0) + } // constant threshold scaling is the same as no thresholds model.setThresholds(Array(1000, 1000, 1000)) - val scaledPredictions = model.transform(smallMultinomialDataset).select("prediction").collect() - assert(scaledPredictions.zip(basePredictions).forall { case (scaled, base) => - scaled.getDouble(0) === base.getDouble(0) - }) + testTransformerByGlobalCheckFunc[(Double, Vector)](smallMultinomialDataset.toDF(), model, + "prediction") { scaledPredictions: Seq[Row] => + assert(scaledPredictions.zip(basePredictions).forall { case (scaled, base) => + scaled.getDouble(0) === base.getDouble(0) + }) + } // force it to use the predict method model.setRawPredictionCol("").setProbabilityCol("").setThresholds(Array(0, 1, 1)) - val predictionsWithPredict = - model.transform(smallMultinomialDataset).select("prediction").collect() - assert(predictionsWithPredict.forall(_.getDouble(0) === 0.0)) + testTransformer[(Double, Vector)](smallMultinomialDataset.toDF(), model, "prediction") { + row => assert(row.getDouble(0) === 0.0) + } } test("logistic regression doesn't fit intercept when fitIntercept is off") { @@ -403,21 +405,19 @@ class LogisticRegressionSuite // Modify model params, and check that the params worked. model.setThreshold(1.0) - val predAllZero = model.transform(smallBinaryDataset) - .select("prediction", "myProbability") - .collect() - .map { case Row(pred: Double, prob: Vector) => pred } - assert(predAllZero.forall(_ === 0), - s"With threshold=1.0, expected predictions to be all 0, but only" + - s" ${predAllZero.count(_ === 0)} of ${smallBinaryDataset.count()} were 0.") + testTransformerByGlobalCheckFunc[(Double, Vector)](smallBinaryDataset.toDF(), + model, "prediction", "myProbability") { rows => + val predAllZero = rows.map(_.getDouble(0)) + assert(predAllZero.forall(_ === 0), + s"With threshold=1.0, expected predictions to be all 0, but only" + + s" ${predAllZero.count(_ === 0)} of ${smallBinaryDataset.count()} were 0.") + } // Call transform with params, and check that the params worked. 
- val predNotAllZero = - model.transform(smallBinaryDataset, model.threshold -> 0.0, - model.probabilityCol -> "myProb") - .select("prediction", "myProb") - .collect() - .map { case Row(pred: Double, prob: Vector) => pred } - assert(predNotAllZero.exists(_ !== 0.0)) + testTransformerByGlobalCheckFunc[(Double, Vector)](smallBinaryDataset.toDF(), + model.copy(ParamMap(model.threshold -> 0.0, + model.probabilityCol -> "myProb")), "prediction", "myProb") { + rows => assert(rows.map(_.getDouble(0)).exists(_ !== 0.0)) + } // Call fit() with new params, and check as many params as we can. lr.setThresholds(Array(0.6, 0.4)) @@ -441,10 +441,10 @@ class LogisticRegressionSuite val numFeatures = smallMultinomialDataset.select("features").first().getAs[Vector](0).size assert(model.numFeatures === numFeatures) - val results = model.transform(smallMultinomialDataset) - // check that raw prediction is coefficients dot features + intercept - results.select("rawPrediction", "features").collect().foreach { - case Row(raw: Vector, features: Vector) => + testTransformer[(Double, Vector)](smallMultinomialDataset.toDF(), + model, "rawPrediction", "features", "probability") { + case Row(raw: Vector, features: Vector, prob: Vector) => + // check that raw prediction is coefficients dot features + intercept assert(raw.size === 3) val margins = Array.tabulate(3) { k => var margin = 0.0 @@ -455,12 +455,7 @@ class LogisticRegressionSuite margin } assert(raw ~== Vectors.dense(margins) relTol eps) - } - - // Compare rawPrediction with probability - results.select("rawPrediction", "probability").collect().foreach { - case Row(raw: Vector, prob: Vector) => - assert(raw.size === 3) + // Compare rawPrediction with probability assert(prob.size === 3) val max = raw.toArray.max val subtract = if (max > 0) max else 0.0 @@ -472,39 +467,8 @@ class LogisticRegressionSuite assert(prob(2) ~== 1.0 - probFromRaw1 - probFromRaw0 relTol eps) } - // Compare prediction with probability - results.select("prediction", "probability").collect().foreach { - case Row(pred: Double, prob: Vector) => - val predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2 - assert(pred == predFromProb) - } - - // force it to use raw2prediction - model.setRawPredictionCol("rawPrediction").setProbabilityCol("") - val resultsUsingRaw2Predict = - model.transform(smallMultinomialDataset).select("prediction").as[Double].collect() - resultsUsingRaw2Predict.zip(results.select("prediction").as[Double].collect()).foreach { - case (pred1, pred2) => assert(pred1 === pred2) - } - - // force it to use probability2prediction - model.setRawPredictionCol("").setProbabilityCol("probability") - val resultsUsingProb2Predict = - model.transform(smallMultinomialDataset).select("prediction").as[Double].collect() - resultsUsingProb2Predict.zip(results.select("prediction").as[Double].collect()).foreach { - case (pred1, pred2) => assert(pred1 === pred2) - } - - // force it to use predict - model.setRawPredictionCol("").setProbabilityCol("") - val resultsUsingPredict = - model.transform(smallMultinomialDataset).select("prediction").as[Double].collect() - resultsUsingPredict.zip(results.select("prediction").as[Double].collect()).foreach { - case (pred1, pred2) => assert(pred1 === pred2) - } - ProbabilisticClassifierSuite.testPredictMethods[ - Vector, LogisticRegressionModel](model, smallMultinomialDataset) + Vector, LogisticRegressionModel](this, model, smallMultinomialDataset) } test("binary logistic regression: Predictor, Classifier methods") { @@ -517,51 +481,22 @@ class 
LogisticRegressionSuite val numFeatures = smallBinaryDataset.select("features").first().getAs[Vector](0).size assert(model.numFeatures === numFeatures) - val results = model.transform(smallBinaryDataset) - - // Compare rawPrediction with probability - results.select("rawPrediction", "probability").collect().foreach { - case Row(raw: Vector, prob: Vector) => + testTransformer[(Double, Vector)](smallBinaryDataset.toDF(), + model, "rawPrediction", "probability", "prediction") { + case Row(raw: Vector, prob: Vector, pred: Double) => + // Compare rawPrediction with probability assert(raw.size === 2) assert(prob.size === 2) val probFromRaw1 = 1.0 / (1.0 + math.exp(-raw(1))) assert(prob(1) ~== probFromRaw1 relTol eps) assert(prob(0) ~== 1.0 - probFromRaw1 relTol eps) - } - - // Compare prediction with probability - results.select("prediction", "probability").collect().foreach { - case Row(pred: Double, prob: Vector) => + // Compare prediction with probability val predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2 assert(pred == predFromProb) } - // force it to use raw2prediction - model.setRawPredictionCol("rawPrediction").setProbabilityCol("") - val resultsUsingRaw2Predict = - model.transform(smallBinaryDataset).select("prediction").as[Double].collect() - resultsUsingRaw2Predict.zip(results.select("prediction").as[Double].collect()).foreach { - case (pred1, pred2) => assert(pred1 === pred2) - } - - // force it to use probability2prediction - model.setRawPredictionCol("").setProbabilityCol("probability") - val resultsUsingProb2Predict = - model.transform(smallBinaryDataset).select("prediction").as[Double].collect() - resultsUsingProb2Predict.zip(results.select("prediction").as[Double].collect()).foreach { - case (pred1, pred2) => assert(pred1 === pred2) - } - - // force it to use predict - model.setRawPredictionCol("").setProbabilityCol("") - val resultsUsingPredict = - model.transform(smallBinaryDataset).select("prediction").as[Double].collect() - resultsUsingPredict.zip(results.select("prediction").as[Double].collect()).foreach { - case (pred1, pred2) => assert(pred1 === pred2) - } - ProbabilisticClassifierSuite.testPredictMethods[ - Vector, LogisticRegressionModel](model, smallBinaryDataset) + Vector, LogisticRegressionModel](this, model, smallBinaryDataset) } test("coefficients and intercept methods") { @@ -616,19 +551,21 @@ class LogisticRegressionSuite LabeledPoint(1.0, Vectors.dense(0.0, 1000.0)), LabeledPoint(1.0, Vectors.dense(0.0, -1.0)) ).toDF() - val results = model.transform(overFlowData).select("rawPrediction", "probability").collect() - - // probabilities are correct when margins have to be adjusted - val raw1 = results(0).getAs[Vector](0) - val prob1 = results(0).getAs[Vector](1) - assert(raw1 === Vectors.dense(1000.0, 2000.0, 3000.0)) - assert(prob1 ~== Vectors.dense(0.0, 0.0, 1.0) absTol eps) - - // probabilities are correct when margins don't have to be adjusted - val raw2 = results(1).getAs[Vector](0) - val prob2 = results(1).getAs[Vector](1) - assert(raw2 === Vectors.dense(-1.0, -2.0, -3.0)) - assert(prob2 ~== Vectors.dense(0.66524096, 0.24472847, 0.09003057) relTol eps) + + testTransformerByGlobalCheckFunc[(Double, Vector)](overFlowData.toDF(), + model, "rawPrediction", "probability") { results: Seq[Row] => + // probabilities are correct when margins have to be adjusted + val raw1 = results(0).getAs[Vector](0) + val prob1 = results(0).getAs[Vector](1) + assert(raw1 === Vectors.dense(1000.0, 2000.0, 3000.0)) + assert(prob1 ~== Vectors.dense(0.0, 0.0, 1.0) absTol eps) + + 
// probabilities are correct when margins don't have to be adjusted + val raw2 = results(1).getAs[Vector](0) + val prob2 = results(1).getAs[Vector](1) + assert(raw2 === Vectors.dense(-1.0, -2.0, -3.0)) + assert(prob2 ~== Vectors.dense(0.66524096, 0.24472847, 0.09003057) relTol eps) + } } test("MultiClassSummarizer") { @@ -2567,10 +2504,13 @@ class LogisticRegressionSuite val model1 = lr.fit(smallBinaryDataset) val lr2 = new LogisticRegression().setInitialModel(model1).setMaxIter(5).setFamily("binomial") val model2 = lr2.fit(smallBinaryDataset) - val predictions1 = model1.transform(smallBinaryDataset).select("prediction").collect() - val predictions2 = model2.transform(smallBinaryDataset).select("prediction").collect() - predictions1.zip(predictions2).foreach { case (Row(p1: Double), Row(p2: Double)) => - assert(p1 === p2) + val binaryExpected = model1.transform(smallBinaryDataset).select("prediction").collect() + .map(_.getDouble(0)) + for (model <- Seq(model1, model2)) { + testTransformerByGlobalCheckFunc[(Double, Vector)](smallBinaryDataset.toDF(), model, + "prediction") { rows: Seq[Row] => + rows.map(_.getDouble(0)).toArray === binaryExpected + } } assert(model2.summary.totalIterations === 1) @@ -2579,10 +2519,13 @@ class LogisticRegressionSuite val lr4 = new LogisticRegression() .setInitialModel(model3).setMaxIter(5).setFamily("multinomial") val model4 = lr4.fit(smallMultinomialDataset) - val predictions3 = model3.transform(smallMultinomialDataset).select("prediction").collect() - val predictions4 = model4.transform(smallMultinomialDataset).select("prediction").collect() - predictions3.zip(predictions4).foreach { case (Row(p1: Double), Row(p2: Double)) => - assert(p1 === p2) + val multinomialExpected = model3.transform(smallMultinomialDataset).select("prediction") + .collect().map(_.getDouble(0)) + for (model <- Seq(model3, model4)) { + testTransformerByGlobalCheckFunc[(Double, Vector)](smallMultinomialDataset.toDF(), model, + "prediction") { rows: Seq[Row] => + rows.map(_.getDouble(0)).toArray === multinomialExpected + } } assert(model4.summary.totalIterations === 1) } @@ -2638,8 +2581,8 @@ class LogisticRegressionSuite LabeledPoint(4.0, Vectors.dense(2.0))).toDF() val mlr = new LogisticRegression().setFamily("multinomial") val model = mlr.fit(constantData) - val results = model.transform(constantData) - results.select("rawPrediction", "probability", "prediction").collect().foreach { + testTransformer[(Double, Vector)](constantData, model, + "rawPrediction", "probability", "prediction") { case Row(raw: Vector, prob: Vector, pred: Double) => assert(raw === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, Double.PositiveInfinity))) assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0))) @@ -2653,8 +2596,8 @@ class LogisticRegressionSuite LabeledPoint(0.0, Vectors.dense(1.0)), LabeledPoint(0.0, Vectors.dense(2.0))).toDF() val modelZeroLabel = mlr.setFitIntercept(false).fit(constantZeroData) - val resultsZero = modelZeroLabel.transform(constantZeroData) - resultsZero.select("rawPrediction", "probability", "prediction").collect().foreach { + testTransformer[(Double, Vector)](constantZeroData, modelZeroLabel, + "rawPrediction", "probability", "prediction") { case Row(raw: Vector, prob: Vector, pred: Double) => assert(prob === Vectors.dense(Array(1.0))) assert(pred === 0.0) @@ -2666,8 +2609,8 @@ class LogisticRegressionSuite val constantDataWithMetadata = constantData .select(constantData("label").as("label", labelMeta), constantData("features")) val modelWithMetadata = 
mlr.setFitIntercept(true).fit(constantDataWithMetadata) - val resultsWithMetadata = modelWithMetadata.transform(constantDataWithMetadata) - resultsWithMetadata.select("rawPrediction", "probability", "prediction").collect().foreach { + testTransformer[(Double, Vector)](constantDataWithMetadata, modelWithMetadata, + "rawPrediction", "probability", "prediction") { case Row(raw: Vector, prob: Vector, pred: Double) => assert(raw === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, Double.PositiveInfinity, 0.0))) assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0, 0.0))) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala index d3141ec708560..daa58a56896d7 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala @@ -17,22 +17,17 @@ package org.apache.spark.ml.classification -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.classification.LogisticRegressionSuite._ import org.apache.spark.ml.linalg.{Vector, Vectors} -import org.apache.spark.ml.util.DefaultReadWriteTest -import org.apache.spark.ml.util.MLTestingUtils +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} -import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Dataset, Row} -import org.apache.spark.sql.functions._ -class MultilayerPerceptronClassifierSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class MultilayerPerceptronClassifierSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -75,11 +70,9 @@ class MultilayerPerceptronClassifierSuite .setMaxIter(100) .setSolver("l-bfgs") val model = trainer.fit(dataset) - val result = model.transform(dataset) MLTestingUtils.checkCopyAndUids(trainer, model) - val predictionAndLabels = result.select("prediction", "label").collect() - predictionAndLabels.foreach { case Row(p: Double, l: Double) => - assert(p == l) + testTransformer[(Vector, Double)](dataset.toDF(), model, "prediction", "label") { + case Row(p: Double, l: Double) => assert(p == l) } } @@ -99,13 +92,12 @@ class MultilayerPerceptronClassifierSuite .setMaxIter(100) .setSolver("l-bfgs") val model = trainer.fit(strongDataset) - val result = model.transform(strongDataset) - result.select("probability", "expectedProbability").collect().foreach { - case Row(p: Vector, e: Vector) => - assert(p ~== e absTol 1e-3) + testTransformer[(Vector, Double, Vector)](strongDataset.toDF(), model, + "probability", "expectedProbability") { + case Row(p: Vector, e: Vector) => assert(p ~== e absTol 1e-3) } ProbabilisticClassifierSuite.testPredictMethods[ - Vector, MultilayerPerceptronClassificationModel](model, strongDataset) + Vector, MultilayerPerceptronClassificationModel](this, model, strongDataset) } test("test model probability") { @@ -118,11 +110,10 @@ class MultilayerPerceptronClassifierSuite .setSolver("l-bfgs") val model = trainer.fit(dataset) model.setProbabilityCol("probability") - val result 
= model.transform(dataset) - val features2prob = udf { features: Vector => model.mlpModel.predict(features) } - result.select(features2prob(col("features")), col("probability")).collect().foreach { - case Row(p1: Vector, p2: Vector) => - assert(p1 ~== p2 absTol 1e-3) + testTransformer[(Vector, Double)](dataset.toDF(), model, "features", "probability") { + case Row(features: Vector, prob: Vector) => + val prob2 = model.mlpModel.predict(features) + assert(prob ~== prob2 absTol 1e-3) } } @@ -175,9 +166,6 @@ class MultilayerPerceptronClassifierSuite val model = trainer.fit(dataFrame) val numFeatures = dataFrame.select("features").first().getAs[Vector](0).size assert(model.numFeatures === numFeatures) - val mlpPredictionAndLabels = model.transform(dataFrame).select("prediction", "label").rdd.map { - case Row(p: Double, l: Double) => (p, l) - } // train multinomial logistic regression val lr = new LogisticRegressionWithLBFGS() .setIntercept(true) @@ -189,8 +177,12 @@ class MultilayerPerceptronClassifierSuite lrModel.predict(data.rdd.map(p => OldVectors.fromML(p.features))).zip(data.rdd.map(_.label)) // MLP's predictions should not differ a lot from LR's. val lrMetrics = new MulticlassMetrics(lrPredictionAndLabels) - val mlpMetrics = new MulticlassMetrics(mlpPredictionAndLabels) - assert(mlpMetrics.confusionMatrix.asML ~== lrMetrics.confusionMatrix.asML absTol 100) + testTransformerByGlobalCheckFunc[(Double, Vector)](dataFrame, model, "prediction", "label") { + rows: Seq[Row] => + val mlpPredictionAndLabels = rows.map(x => (x.getDouble(0), x.getDouble(1))) + val mlpMetrics = new MulticlassMetrics(sc.makeRDD(mlpPredictionAndLabels)) + assert(mlpMetrics.confusionMatrix.asML ~== lrMetrics.confusionMatrix.asML absTol 100) + } } test("read/write: MultilayerPerceptronClassifier") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala index 0d3adf993383f..49115c8a4db30 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala @@ -28,12 +28,11 @@ import org.apache.spark.ml.classification.NaiveBayesSuite._ import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Dataset, Row} -class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class NaiveBayesSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -56,13 +55,13 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with Defa bernoulliDataset = generateNaiveBayesInput(pi, theta, 100, seed, "bernoulli").toDF() } - def validatePrediction(predictionAndLabels: DataFrame): Unit = { - val numOfErrorPredictions = predictionAndLabels.collect().count { + def validatePrediction(predictionAndLabels: Seq[Row]): Unit = { + val numOfErrorPredictions = predictionAndLabels.filter { case Row(prediction: Double, label: Double) => prediction != label - } + }.length // At least 80% of the predictions should be on. 
- assert(numOfErrorPredictions < predictionAndLabels.count() / 5) + assert(numOfErrorPredictions < predictionAndLabels.length / 5) } def validateModelFit( @@ -92,10 +91,10 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with Defa } def validateProbabilities( - featureAndProbabilities: DataFrame, + featureAndProbabilities: Seq[Row], model: NaiveBayesModel, modelType: String): Unit = { - featureAndProbabilities.collect().foreach { + featureAndProbabilities.foreach { case Row(features: Vector, probability: Vector) => assert(probability.toArray.sum ~== 1.0 relTol 1.0e-10) val expected = modelType match { @@ -154,15 +153,18 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with Defa val validationDataset = generateNaiveBayesInput(piArray, thetaArray, nPoints, 17, "multinomial").toDF() - val predictionAndLabels = model.transform(validationDataset).select("prediction", "label") - validatePrediction(predictionAndLabels) + testTransformerByGlobalCheckFunc[(Double, Vector)](validationDataset, model, + "prediction", "label") { predictionAndLabels: Seq[Row] => + validatePrediction(predictionAndLabels) + } - val featureAndProbabilities = model.transform(validationDataset) - .select("features", "probability") - validateProbabilities(featureAndProbabilities, model, "multinomial") + testTransformerByGlobalCheckFunc[(Double, Vector)](validationDataset, model, + "features", "probability") { featureAndProbabilities: Seq[Row] => + validateProbabilities(featureAndProbabilities, model, "multinomial") + } ProbabilisticClassifierSuite.testPredictMethods[ - Vector, NaiveBayesModel](model, testDataset) + Vector, NaiveBayesModel](this, model, testDataset) } test("Naive Bayes with weighted samples") { @@ -210,15 +212,18 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with Defa val validationDataset = generateNaiveBayesInput(piArray, thetaArray, nPoints, 20, "bernoulli").toDF() - val predictionAndLabels = model.transform(validationDataset).select("prediction", "label") - validatePrediction(predictionAndLabels) + testTransformerByGlobalCheckFunc[(Double, Vector)](validationDataset, model, + "prediction", "label") { predictionAndLabels: Seq[Row] => + validatePrediction(predictionAndLabels) + } - val featureAndProbabilities = model.transform(validationDataset) - .select("features", "probability") - validateProbabilities(featureAndProbabilities, model, "bernoulli") + testTransformerByGlobalCheckFunc[(Double, Vector)](validationDataset, model, + "features", "probability") { featureAndProbabilities: Seq[Row] => + validateProbabilities(featureAndProbabilities, model, "bernoulli") + } ProbabilisticClassifierSuite.testPredictMethods[ - Vector, NaiveBayesModel](model, testDataset) + Vector, NaiveBayesModel](this, model, testDataset) } test("detect negative values") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala index 25bad59b9c9cf..11e88367108b4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala @@ -17,26 +17,24 @@ package org.apache.spark.ml.classification -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.classification.LogisticRegressionSuite._ import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.feature.StringIndexer 
-import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.{ParamMap, ParamsSuite} -import org.apache.spark.ml.util.{DefaultReadWriteTest, MetadataUtils, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MetadataUtils, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} -import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.Metadata -class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class OneVsRestSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -85,10 +83,6 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext with Defau val predictionColSchema = transformedDataset.schema(ovaModel.getPredictionCol) assert(MetadataUtils.getNumClasses(predictionColSchema) === Some(3)) - val ovaResults = transformedDataset.select("prediction", "label").rdd.map { - row => (row.getDouble(0), row.getDouble(1)) - } - val lr = new LogisticRegressionWithLBFGS().setIntercept(true).setNumClasses(numClasses) lr.optimizer.setRegParam(0.1).setNumIterations(100) @@ -97,8 +91,13 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext with Defau // determine the #confusion matrix in each class. // bound how much error we allow compared to multinomial logistic regression. 
val expectedMetrics = new MulticlassMetrics(results) - val ovaMetrics = new MulticlassMetrics(ovaResults) - assert(expectedMetrics.confusionMatrix.asML ~== ovaMetrics.confusionMatrix.asML absTol 400) + + testTransformerByGlobalCheckFunc[(Double, Vector)](dataset.toDF(), ovaModel, + "prediction", "label") { rows => + val ovaResults = rows.map { row => (row.getDouble(0), row.getDouble(1)) } + val ovaMetrics = new MulticlassMetrics(sc.makeRDD(ovaResults)) + assert(expectedMetrics.confusionMatrix.asML ~== ovaMetrics.confusionMatrix.asML absTol 400) + } } test("one-vs-rest: tuning parallelism does not change output") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala index d649ceac949c4..1c8c9829f18d1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.util.MLTest import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.sql.{Dataset, Row} @@ -122,13 +123,15 @@ object ProbabilisticClassifierSuite { def testPredictMethods[ FeaturesType, M <: ProbabilisticClassificationModel[FeaturesType, M]]( - model: M, testData: Dataset[_]): Unit = { + mlTest: MLTest, model: M, testData: Dataset[_]): Unit = { val allColModel = model.copy(ParamMap.empty) .setRawPredictionCol("rawPredictionAll") .setProbabilityCol("probabilityAll") .setPredictionCol("predictionAll") - val allColResult = allColModel.transform(testData) + + val allColResult = allColModel.transform(testData.select(allColModel.getFeaturesCol)) + .select(allColModel.getFeaturesCol, "rawPredictionAll", "probabilityAll", "predictionAll") for (rawPredictionCol <- Seq("", "rawPredictionSingle")) { for (probabilityCol <- Seq("", "probabilitySingle")) { @@ -138,22 +141,14 @@ object ProbabilisticClassifierSuite { .setProbabilityCol(probabilityCol) .setPredictionCol(predictionCol) - val result = newModel.transform(allColResult) - - import org.apache.spark.sql.functions._ - - val resultRawPredictionCol = - if (rawPredictionCol.isEmpty) col("rawPredictionAll") else col(rawPredictionCol) - val resultProbabilityCol = - if (probabilityCol.isEmpty) col("probabilityAll") else col(probabilityCol) - val resultPredictionCol = - if (predictionCol.isEmpty) col("predictionAll") else col(predictionCol) + import allColResult.sparkSession.implicits._ - result.select( - resultRawPredictionCol, col("rawPredictionAll"), - resultProbabilityCol, col("probabilityAll"), - resultPredictionCol, col("predictionAll") - ).collect().foreach { + mlTest.testTransformer[(Vector, Vector, Vector, Double)](allColResult, newModel, + if (rawPredictionCol.isEmpty) "rawPredictionAll" else rawPredictionCol, + "rawPredictionAll", + if (probabilityCol.isEmpty) "probabilityAll" else probabilityCol, "probabilityAll", + if (predictionCol.isEmpty) "predictionAll" else predictionCol, "predictionAll" + ) { case Row( rawPredictionSingle: Vector, rawPredictionAll: Vector, probabilitySingle: Vector, probabilityAll: Vector, diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala index 2cca2e6c04698..02a9d5c2a18c0 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala @@ -23,11 +23,10 @@ import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.tree.LeafNode import org.apache.spark.ml.tree.impl.TreeTests -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} import org.apache.spark.mllib.tree.{EnsembleTestHelper, RandomForest => OldRandomForest} import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo} -import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} @@ -35,8 +34,7 @@ import org.apache.spark.sql.{DataFrame, Row} /** * Test suite for [[RandomForestClassifier]]. */ -class RandomForestClassifierSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class RandomForestClassifierSuite extends MLTest with DefaultReadWriteTest { import RandomForestClassifierSuite.compareAPIs import testImplicits._ @@ -143,11 +141,8 @@ class RandomForestClassifierSuite MLTestingUtils.checkCopyAndUids(rf, model) - val predictions = model.transform(df) - .select(rf.getPredictionCol, rf.getRawPredictionCol, rf.getProbabilityCol) - .collect() - - predictions.foreach { case Row(pred: Double, rawPred: Vector, probPred: Vector) => + testTransformer[(Vector, Double)](df, model, "prediction", "rawPrediction", + "probability") { case Row(pred: Double, rawPred: Vector, probPred: Vector) => assert(pred === rawPred.argmax, s"Expected prediction $pred but calculated ${rawPred.argmax} from rawPrediction.") val sum = rawPred.toArray.sum @@ -155,8 +150,9 @@ class RandomForestClassifierSuite "probability prediction mismatch") assert(probPred.toArray.sum ~== 1.0 relTol 1E-5) } + ProbabilisticClassifierSuite.testPredictMethods[ - Vector, RandomForestClassificationModel](model, df) + Vector, RandomForestClassificationModel](this, model, df) } test("Fitting without numClasses in metadata") { From 911b83da42fa850eb3ae419687c204cb2e25767b Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 5 Mar 2018 11:53:23 -0800 Subject: [PATCH 331/566] [SPARK-23457][SQL][BRANCH-2.3] Register task completion listeners first in ParquetFileFormat ## What changes were proposed in this pull request? ParquetFileFormat leaks opened files in some cases. This PR prevents that by registering task completion listers first before initialization. 
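In outline, the ordering the change enforces looks like the sketch below (abridged from the vectorized-reader path in the diff that follows; `reader`, `split` and `hadoopAttemptContext` stand for the values already in scope there). The linked test reports and stack trace underneath show the leak this ordering prevents.

```scala
// Abridged sketch of the fix: wrap the reader, register cleanup, and only then run the
// initialization step that may throw, so the opened file is always closed at task completion.
val iter = new RecordReaderIterator(reader)
Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => iter.close()))
reader.initialize(split, hadoopAttemptContext) // may throw; the listener above still closes the file
```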
- [spark-branch-2.3-test-sbt-hadoop-2.7](https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Test%20(Dashboard)/job/spark-branch-2.3-test-sbt-hadoop-2.7/205/testReport/org.apache.spark.sql/FileBasedDataSourceSuite/_It_is_not_a_test_it_is_a_sbt_testing_SuiteSelector_/) - [spark-master-test-sbt-hadoop-2.6](https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Test%20(Dashboard)/job/spark-master-test-sbt-hadoop-2.6/4228/testReport/junit/org.apache.spark.sql.execution.datasources.parquet/ParquetQuerySuite/_It_is_not_a_test_it_is_a_sbt_testing_SuiteSelector_/) ``` Caused by: sbt.ForkMain$ForkError: java.lang.Throwable: null at org.apache.spark.DebugFilesystem$.addOpenStream(DebugFilesystem.scala:36) at org.apache.spark.DebugFilesystem.open(DebugFilesystem.scala:70) at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:769) at org.apache.parquet.hadoop.ParquetFileReader.(ParquetFileReader.java:538) at org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:149) at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:133) at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReaderWithPartitionValues$1.apply(ParquetFileFormat.scala:400) at ``` ## How was this patch tested? Manual. The following test case generates the same leakage. ```scala test("SPARK-23457 Register task completion listeners first in ParquetFileFormat") { withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_BATCH_SIZE.key -> s"${Int.MaxValue}") { withTempDir { dir => val basePath = dir.getCanonicalPath Seq(0).toDF("a").write.format("parquet").save(new Path(basePath, "first").toString) Seq(1).toDF("a").write.format("parquet").save(new Path(basePath, "second").toString) val df = spark.read.parquet( new Path(basePath, "first").toString, new Path(basePath, "second").toString) val e = intercept[SparkException] { df.collect() } assert(e.getCause.isInstanceOf[OutOfMemoryError]) } } } ``` Author: Dongjoon Hyun Closes #20714 from dongjoon-hyun/SPARK-23457-2.3. --- .../parquet/ParquetFileFormat.scala | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index f53a97ba45a26..a6129dafd2f77 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -394,16 +394,21 @@ class ParquetFileFormat ParquetInputFormat.setFilterPredicate(hadoopAttemptContext.getConfiguration, pushed.get) } val taskContext = Option(TaskContext.get()) - val parquetReader = if (enableVectorizedReader) { + if (enableVectorizedReader) { val vectorizedReader = new VectorizedParquetRecordReader( convertTz.orNull, enableOffHeapColumnVector && taskContext.isDefined) + val iter = new RecordReaderIterator(vectorizedReader) + // SPARK-23457 Register a task completion lister before `initialization`. 
+ taskContext.foreach(_.addTaskCompletionListener(_ => iter.close())) vectorizedReader.initialize(split, hadoopAttemptContext) logDebug(s"Appending $partitionSchema ${file.partitionValues}") vectorizedReader.initBatch(partitionSchema, file.partitionValues) if (returningBatch) { vectorizedReader.enableReturningBatches() } - vectorizedReader + + // UnsafeRowParquetRecordReader appends the columns internally to avoid another copy. + iter.asInstanceOf[Iterator[InternalRow]] } else { logDebug(s"Falling back to parquet-mr") // ParquetRecordReader returns UnsafeRow @@ -413,18 +418,11 @@ class ParquetFileFormat } else { new ParquetRecordReader[UnsafeRow](new ParquetReadSupport(convertTz)) } + val iter = new RecordReaderIterator(reader) + // SPARK-23457 Register a task completion lister before `initialization`. + taskContext.foreach(_.addTaskCompletionListener(_ => iter.close())) reader.initialize(split, hadoopAttemptContext) - reader - } - val iter = new RecordReaderIterator(parquetReader) - taskContext.foreach(_.addTaskCompletionListener(_ => iter.close())) - - // UnsafeRowParquetRecordReader appends the columns internally to avoid another copy. - if (parquetReader.isInstanceOf[VectorizedParquetRecordReader] && - enableVectorizedReader) { - iter.asInstanceOf[Iterator[InternalRow]] - } else { val fullSchema = requiredSchema.toAttributes ++ partitionSchema.toAttributes val joinedRow = new JoinedRow() val appendPartitionColumns = GenerateUnsafeProjection.generate(fullSchema, fullSchema) From b9ea2e87bb24c3731bd2dbd044d10d18dbbf9c6f Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 5 Mar 2018 14:20:10 -0800 Subject: [PATCH 332/566] [SPARK-23434][SQL][BRANCH-2.3] Spark should not warn `metadata directory` for a HDFS file path ## What changes were proposed in this pull request? In a kerberized cluster, when Spark reads a file path (e.g. `people.json`), it warns with a wrong warning message during looking up `people.json/_spark_metadata`. The root cause of this situation is the difference between `LocalFileSystem` and `DistributedFileSystem`. `LocalFileSystem.exists()` returns `false`, but `DistributedFileSystem.exists` raises `org.apache.hadoop.security.AccessControlException`. ```scala scala> spark.read.json("file:///usr/hdp/current/spark-client/examples/src/main/resources/people.json").show +----+-------+ | age| name| +----+-------+ |null|Michael| | 30| Andy| | 19| Justin| +----+-------+ scala> spark.read.json("hdfs:///tmp/people.json") 18/02/15 05:00:48 WARN streaming.FileStreamSink: Error while looking for metadata directory. 18/02/15 05:00:48 WARN streaming.FileStreamSink: Error while looking for metadata directory. ``` After this PR, ```scala scala> spark.read.json("hdfs:///tmp/people.json").show +----+-------+ | age| name| +----+-------+ |null|Michael| | 30| Andy| | 19| Justin| +----+-------+ ``` ## How was this patch tested? Manual. Author: Dongjoon Hyun Closes #20713 from dongjoon-hyun/SPARK-23434-2.3. 
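In outline, the guarded lookup this change introduces can be sketched as the standalone helper below (the name `looksLikeStreamingSinkOutput` is illustrative only; the real change is in `FileStreamSink`, shown in the diff that follows).

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

// Illustrative helper (not Spark API): probe for `_spark_metadata` only when the input
// path is itself a directory. On a kerberized HDFS, exists() on `<file>/_spark_metadata`
// can throw AccessControlException instead of returning false, which produced the warning.
def looksLikeStreamingSinkOutput(pathStr: String, hadoopConf: Configuration): Boolean = {
  val path = new Path(pathStr)
  val fs = path.getFileSystem(hadoopConf)
  fs.isDirectory(path) && fs.exists(new Path(path, "_spark_metadata"))
}
```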
--- .../spark/sql/execution/streaming/FileStreamSink.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala index 2715fa93d0e98..87a17cebdc10c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala @@ -42,9 +42,11 @@ object FileStreamSink extends Logging { try { val hdfsPath = new Path(singlePath) val fs = hdfsPath.getFileSystem(hadoopConf) - val metadataPath = new Path(hdfsPath, metadataDir) - val res = fs.exists(metadataPath) - res + if (fs.isDirectory(hdfsPath)) { + fs.exists(new Path(hdfsPath, metadataDir)) + } else { + false + } } catch { case NonFatal(e) => logWarning(s"Error while looking for metadata directory.") From 66c1978f9ca53bbddc92ae490a61c904952be7f1 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Tue, 6 Mar 2018 08:52:28 -0600 Subject: [PATCH 333/566] [SPARK-23601][BUILD] Remove .md5 files from release ## What changes were proposed in this pull request? Remove .md5 files from release artifacts ## How was this patch tested? N/A Author: Sean Owen Closes #20737 from srowen/SPARK-23601. (cherry picked from commit 8bceb899dc3220998a4ea4021f3b477f78faaca8) Signed-off-by: Sean Owen --- dev/create-release/release-build.sh | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index a3579f21fc539..c00b00b845401 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -164,8 +164,6 @@ if [[ "$1" == "package" ]]; then tar cvzf spark-$SPARK_VERSION.tgz spark-$SPARK_VERSION echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour --output spark-$SPARK_VERSION.tgz.asc \ --detach-sig spark-$SPARK_VERSION.tgz - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md MD5 spark-$SPARK_VERSION.tgz > \ - spark-$SPARK_VERSION.tgz.md5 echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ SHA512 spark-$SPARK_VERSION.tgz > spark-$SPARK_VERSION.tgz.sha512 rm -rf spark-$SPARK_VERSION @@ -215,9 +213,6 @@ if [[ "$1" == "package" ]]; then echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour \ --output $R_DIST_NAME.asc \ --detach-sig $R_DIST_NAME - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ - MD5 $R_DIST_NAME > \ - $R_DIST_NAME.md5 echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ SHA512 $R_DIST_NAME > \ $R_DIST_NAME.sha512 @@ -234,9 +229,6 @@ if [[ "$1" == "package" ]]; then echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour \ --output $PYTHON_DIST_NAME.asc \ --detach-sig $PYTHON_DIST_NAME - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ - MD5 $PYTHON_DIST_NAME > \ - $PYTHON_DIST_NAME.md5 echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ SHA512 $PYTHON_DIST_NAME > \ $PYTHON_DIST_NAME.sha512 @@ -247,9 +239,6 @@ if [[ "$1" == "package" ]]; then echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour \ --output spark-$SPARK_VERSION-bin-$NAME.tgz.asc \ --detach-sig spark-$SPARK_VERSION-bin-$NAME.tgz - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ - MD5 spark-$SPARK_VERSION-bin-$NAME.tgz > \ - spark-$SPARK_VERSION-bin-$NAME.tgz.md5 echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ SHA512 spark-$SPARK_VERSION-bin-$NAME.tgz > \ 
spark-$SPARK_VERSION-bin-$NAME.tgz.sha512 @@ -382,18 +371,11 @@ if [[ "$1" == "publish-release" ]]; then find . -type f |grep -v \.jar |grep -v \.pom | xargs rm echo "Creating hash and signature files" - # this must have .asc, .md5 and .sha1 - it really doesn't like anything else there + # this must have .asc and .sha1 - it really doesn't like anything else there for file in $(find . -type f) do echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --output $file.asc \ --detach-sig --armour $file; - if [ $(command -v md5) ]; then - # Available on OS X; -q to keep only hash - md5 -q $file > $file.md5 - else - # Available on Linux; cut to keep only hash - md5sum $file | cut -f1 -d' ' > $file.md5 - fi sha1sum $file | cut -f1 -d' ' > $file.sha1 done From 8cd6a96535bd8b8c4e40b87869ba02f916a997ed Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Wed, 7 Mar 2018 13:51:44 -0800 Subject: [PATCH 334/566] [SPARK-23525][SQL] Support ALTER TABLE CHANGE COLUMN COMMENT for external hive table ## What changes were proposed in this pull request? The following query doesn't work as expected: ``` CREATE EXTERNAL TABLE ext_table(a STRING, b INT, c STRING) PARTITIONED BY (d STRING) LOCATION 'sql/core/spark-warehouse/ext_table'; ALTER TABLE ext_table CHANGE a a STRING COMMENT "new comment"; DESC ext_table; ``` The comment of column `a` is not updated, that's because `HiveExternalCatalog.doAlterTable` ignores table schema changes. To fix the issue, we should call `doAlterTableDataSchema` instead of `doAlterTable`. ## How was this patch tested? Updated `DDLSuite.testChangeColumn`. Author: Xingbo Jiang Closes #20696 from jiangxb1987/alterColumnComment. (cherry picked from commit ac76eff6a88f6358a321b84cb5e60fb9d6403419) Signed-off-by: Wenchen Fan --- .../spark/sql/execution/command/ddl.scala | 12 ++++++------ .../sql-tests/inputs/change-column.sql | 1 + .../sql-tests/results/change-column.sql.out | 19 ++++++++++++++----- .../sql/execution/command/DDLSuite.scala | 1 + 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index 0142f17ce62e2..0f4831b348ce6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -314,8 +314,8 @@ case class AlterTableChangeColumnCommand( val resolver = sparkSession.sessionState.conf.resolver DDLUtils.verifyAlterTableType(catalog, table, isView = false) - // Find the origin column from schema by column name. - val originColumn = findColumnByName(table.schema, columnName, resolver) + // Find the origin column from dataSchema by column name. + val originColumn = findColumnByName(table.dataSchema, columnName, resolver) // Throw an AnalysisException if the column name/dataType is changed. if (!columnEqual(originColumn, newColumn, resolver)) { throw new AnalysisException( @@ -324,7 +324,7 @@ case class AlterTableChangeColumnCommand( s"'${newColumn.name}' with type '${newColumn.dataType}'") } - val newSchema = table.schema.fields.map { field => + val newDataSchema = table.dataSchema.fields.map { field => if (field.name == originColumn.name) { // Create a new column from the origin column with the new comment. 
addComment(field, newColumn.getComment) @@ -332,8 +332,7 @@ case class AlterTableChangeColumnCommand( field } } - val newTable = table.copy(schema = StructType(newSchema)) - catalog.alterTable(newTable) + catalog.alterTableDataSchema(tableName, StructType(newDataSchema)) Seq.empty[Row] } @@ -345,7 +344,8 @@ case class AlterTableChangeColumnCommand( schema.fields.collectFirst { case field if resolver(field.name, name) => field }.getOrElse(throw new AnalysisException( - s"Invalid column reference '$name', table schema is '${schema}'")) + s"Can't find column `$name` given table data columns " + + s"${schema.fieldNames.mkString("[`", "`, `", "`]")}")) } // Add the comment to a column, if comment is empty, return the original column. diff --git a/sql/core/src/test/resources/sql-tests/inputs/change-column.sql b/sql/core/src/test/resources/sql-tests/inputs/change-column.sql index ad0f885f63d3d..2909024e4c9f7 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/change-column.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/change-column.sql @@ -49,6 +49,7 @@ ALTER TABLE global_temp.global_temp_view CHANGE a a INT COMMENT 'this is column -- Change column in partition spec (not supported yet) CREATE TABLE partition_table(a INT, b STRING, c INT, d STRING) USING parquet PARTITIONED BY (c, d); ALTER TABLE partition_table PARTITION (c = 1) CHANGE COLUMN a new_a INT; +ALTER TABLE partition_table CHANGE COLUMN c c INT COMMENT 'this is column C'; -- DROP TEST TABLE DROP TABLE test_change; diff --git a/sql/core/src/test/resources/sql-tests/results/change-column.sql.out b/sql/core/src/test/resources/sql-tests/results/change-column.sql.out index ba8bc936f0c79..ff1ecbcc44c23 100644 --- a/sql/core/src/test/resources/sql-tests/results/change-column.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/change-column.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 32 +-- Number of queries: 33 -- !query 0 @@ -154,7 +154,7 @@ ALTER TABLE test_change CHANGE invalid_col invalid_col INT struct<> -- !query 15 output org.apache.spark.sql.AnalysisException -Invalid column reference 'invalid_col', table schema is 'StructType(StructField(a,IntegerType,true), StructField(b,StringType,true), StructField(c,IntegerType,true))'; +Can't find column `invalid_col` given table data columns [`a`, `b`, `c`]; -- !query 16 @@ -291,16 +291,25 @@ ALTER TABLE partition_table PARTITION (c = 1) CHANGE COLUMN a new_a INT -- !query 30 -DROP TABLE test_change +ALTER TABLE partition_table CHANGE COLUMN c c INT COMMENT 'this is column C' -- !query 30 schema struct<> -- !query 30 output - +org.apache.spark.sql.AnalysisException +Can't find column `c` given table data columns [`a`, `b`]; -- !query 31 -DROP TABLE partition_table +DROP TABLE test_change -- !query 31 schema struct<> -- !query 31 output + + +-- !query 32 +DROP TABLE partition_table +-- !query 32 schema +struct<> +-- !query 32 output + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index c03f5d1545f52..63afcb5f400b2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -1568,6 +1568,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { // Ensure that change column will preserve other metadata fields. 
sql("ALTER TABLE dbx.tab1 CHANGE COLUMN col1 col1 INT COMMENT 'this is col1'") assert(getMetadata("col1").getString("key") == "value") + assert(getMetadata("col1").getString("comment") == "this is col1") } test("drop build-in function") { From ee6e79737cab6078dc94ddc8fd02433e8199104f Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Wed, 7 Mar 2018 17:06:25 -0800 Subject: [PATCH 335/566] [SPARK-23020][CORE][BRANCH-2.3] Fix another race in the in-process launcher test. First the bad news: there's an unfixable race in the launcher code. (By unfixable I mean it would take a lot more effort than this change to fix it.) The good news is that it should only affect super short lived applications, such as the one run by the flaky test, so it's possible to work around it in our test. The fix also uncovered an issue with the recently added "closeAndWait()" method; closing the connection would still possibly cause data loss, so this change waits a while for the connection to finish itself, and closes the socket if that times out. The existing connection timeout is reused so that if desired it's possible to control how long to wait. As part of that I also restored the old behavior that disconnect() would force a disconnection from the child app; the "wait for data to arrive" approach is only taken when disposing of the handle. I tested this by inserting a bunch of sleeps in the test and the socket handling code in the launcher library; with those I was able to reproduce the error from the jenkins jobs. With the changes, even with all the sleeps still in place, all tests pass. Author: Marcelo Vanzin Closes #20743 from vanzin/SPARK-23020. --- .../spark/launcher/SparkLauncherSuite.java | 44 ++++++++++++++---- .../spark/launcher/AbstractAppHandle.java | 45 ++++++++++++------- .../spark/launcher/ChildProcAppHandle.java | 2 +- .../spark/launcher/InProcessAppHandle.java | 2 +- .../apache/spark/launcher/LauncherServer.java | 30 ++++++++----- .../spark/launcher/LauncherServerSuite.java | 2 +- 6 files changed, 88 insertions(+), 37 deletions(-) diff --git a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java index 4c85a8b56141a..2225591a4ff75 100644 --- a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java +++ b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java @@ -25,7 +25,6 @@ import java.util.Map; import java.util.Properties; -import org.junit.Ignore; import org.junit.Test; import static org.junit.Assert.*; import static org.junit.Assume.*; @@ -122,8 +121,7 @@ public void testChildProcLauncher() throws Exception { assertEquals(0, app.waitFor()); } - // TODO: [SPARK-23020] Re-enable this - @Ignore + @Test public void testInProcessLauncher() throws Exception { // Because this test runs SparkLauncher in process and in client mode, it pollutes the system // properties, and that can cause test failures down the test pipeline. 
So restore the original @@ -159,12 +157,24 @@ private void inProcessLauncherTestImpl() throws Exception { SparkAppHandle handle = null; try { - handle = new InProcessLauncher() - .setMaster("local") - .setAppResource(SparkLauncher.NO_RESOURCE) - .setMainClass(InProcessTestApp.class.getName()) - .addAppArgs("hello") - .startApplication(listener); + synchronized (InProcessTestApp.LOCK) { + handle = new InProcessLauncher() + .setMaster("local") + .setAppResource(SparkLauncher.NO_RESOURCE) + .setMainClass(InProcessTestApp.class.getName()) + .addAppArgs("hello") + .startApplication(listener); + + // SPARK-23020: see doc for InProcessTestApp.LOCK for a description of the race. Here + // we wait until we know that the connection between the app and the launcher has been + // established before allowing the app to finish. + final SparkAppHandle _handle = handle; + eventually(Duration.ofSeconds(5), Duration.ofMillis(10), () -> { + assertNotEquals(SparkAppHandle.State.UNKNOWN, _handle.getState()); + }); + + InProcessTestApp.LOCK.wait(5000); + } waitFor(handle); assertEquals(SparkAppHandle.State.FINISHED, handle.getState()); @@ -195,10 +205,26 @@ public static void main(String[] args) throws Exception { public static class InProcessTestApp { + /** + * SPARK-23020: there's a race caused by a child app finishing too quickly. This would cause + * the InProcessAppHandle to dispose of itself even before the child connection was properly + * established, so no state changes would be detected for the application and its final + * state would be LOST. + * + * It's not really possible to fix that race safely in the handle code itself without changing + * the way in-process apps talk to the launcher library, so we work around that in the test by + * synchronizing on this object. 
+ */ + public static final Object LOCK = new Object(); + public static void main(String[] args) throws Exception { assertNotEquals(0, args.length); assertEquals(args[0], "hello"); new SparkContext().stop(); + + synchronized (LOCK) { + LOCK.notifyAll(); + } } } diff --git a/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java index 84a25a5254151..9cbebdaeb33d3 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java @@ -18,22 +18,22 @@ package org.apache.spark.launcher; import java.io.IOException; -import java.util.ArrayList; import java.util.List; +import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.atomic.AtomicReference; import java.util.logging.Level; import java.util.logging.Logger; abstract class AbstractAppHandle implements SparkAppHandle { - private static final Logger LOG = Logger.getLogger(ChildProcAppHandle.class.getName()); + private static final Logger LOG = Logger.getLogger(AbstractAppHandle.class.getName()); private final LauncherServer server; private LauncherServer.ServerConnection connection; private List listeners; private AtomicReference state; - private String appId; + private volatile String appId; private volatile boolean disposed; protected AbstractAppHandle(LauncherServer server) { @@ -44,7 +44,7 @@ protected AbstractAppHandle(LauncherServer server) { @Override public synchronized void addListener(Listener l) { if (listeners == null) { - listeners = new ArrayList<>(); + listeners = new CopyOnWriteArrayList<>(); } listeners.add(l); } @@ -71,16 +71,14 @@ public void stop() { @Override public synchronized void disconnect() { - if (!isDisposed()) { - if (connection != null) { - try { - connection.closeAndWait(); - } catch (IOException ioe) { - // no-op. - } + if (connection != null && connection.isOpen()) { + try { + connection.close(); + } catch (IOException ioe) { + // no-op. } - dispose(); } + dispose(); } void setConnection(LauncherServer.ServerConnection connection) { @@ -97,10 +95,25 @@ boolean isDisposed() { /** * Mark the handle as disposed, and set it as LOST in case the current state is not final. + * + * This method should be called only when there's a reasonable expectation that the communication + * with the child application is not needed anymore, either because the code managing the handle + * has said so, or because the child application is finished. */ synchronized void dispose() { if (!isDisposed()) { + // First wait for all data from the connection to be read. Then unregister the handle. + // Otherwise, unregistering might cause the server to be stopped and all child connections + // to be closed. + if (connection != null) { + try { + connection.waitForClose(); + } catch (IOException ioe) { + // no-op. + } + } server.unregister(this); + // Set state to LOST if not yet final. 
setState(State.LOST, false); this.disposed = true; @@ -127,11 +140,13 @@ void setState(State s, boolean force) { current = state.get(); } - LOG.log(Level.WARNING, "Backend requested transition from final state {0} to {1}.", - new Object[] { current, s }); + if (s != State.LOST) { + LOG.log(Level.WARNING, "Backend requested transition from final state {0} to {1}.", + new Object[] { current, s }); + } } - synchronized void setAppId(String appId) { + void setAppId(String appId) { this.appId = appId; fireEvent(true); } diff --git a/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java index 5e3c95676ecbe..5609f8492f4f4 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java @@ -112,7 +112,7 @@ void monitorChild() { } } - disconnect(); + dispose(); } } diff --git a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java index b8030e0063a37..4b740d3fad20e 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java @@ -66,7 +66,7 @@ synchronized void start(String appName, Method main, String[] args) { setState(State.FAILED); } - disconnect(); + dispose(); }); app.setName(String.format(THREAD_NAME_FMT, THREAD_IDS.incrementAndGet(), appName)); diff --git a/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java index f4ecd52fdeab8..607879fd02ea9 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java +++ b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java @@ -238,6 +238,7 @@ public void run() { }; ServerConnection clientConnection = new ServerConnection(client, timeout); Thread clientThread = factory.newThread(clientConnection); + clientConnection.setConnectionThread(clientThread); synchronized (clients) { clients.add(clientConnection); } @@ -290,17 +291,15 @@ class ServerConnection extends LauncherConnection { private TimerTask timeout; private volatile Thread connectionThread; - volatile AbstractAppHandle handle; + private volatile AbstractAppHandle handle; ServerConnection(Socket socket, TimerTask timeout) throws IOException { super(socket); this.timeout = timeout; } - @Override - public void run() { - this.connectionThread = Thread.currentThread(); - super.run(); + void setConnectionThread(Thread t) { + this.connectionThread = t; } @Override @@ -361,19 +360,30 @@ public void close() throws IOException { } /** - * Close the connection and wait for any buffered data to be processed before returning. + * Wait for the remote side to close the connection so that any pending data is processed. * This ensures any changes reported by the child application take effect. + * + * This method allows a short period for the above to happen (same amount of time as the + * connection timeout, which is configurable). This should be fine for well-behaved + * applications, where they close the connection arond the same time the app handle detects the + * app has finished. + * + * In case the connection is not closed within the grace period, this method forcefully closes + * it and any subsequent data that may arrive will be ignored. 
*/ - public void closeAndWait() throws IOException { - close(); - + public void waitForClose() throws IOException { Thread connThread = this.connectionThread; if (Thread.currentThread() != connThread) { try { - connThread.join(); + connThread.join(getConnectionTimeout()); } catch (InterruptedException ie) { // Ignore. } + + if (connThread.isAlive()) { + LOG.log(Level.WARNING, "Timed out waiting for child connection to close."); + close(); + } } } diff --git a/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java b/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java index 024efac33c391..d16337a319be3 100644 --- a/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java +++ b/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java @@ -94,8 +94,8 @@ public void infoChanged(SparkAppHandle handle) { Message stopMsg = client.inbound.poll(30, TimeUnit.SECONDS); assertTrue(stopMsg instanceof Stop); } finally { - handle.kill(); close(client); + handle.kill(); client.clientThread.join(); } } From 86ca91551522832141aedc17ba1e47dbeb44d970 Mon Sep 17 00:00:00 2001 From: jx158167 Date: Wed, 7 Mar 2018 20:08:32 -0800 Subject: [PATCH 336/566] [SPARK-23524] Big local shuffle blocks should not be checked for corruption. ## What changes were proposed in this pull request? In current code, all local blocks will be checked for corruption no matter it's big or not. The reasons are as below: Size in FetchResult for local block is set to be 0 (https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala#L327) SPARK-4105 meant to only check the small blocks(size Closes #20685 from jinxing64/SPARK-23524. (cherry picked from commit 77c91cc746f93e609c412f3a220495d9e931f696) Signed-off-by: Wenchen Fan --- .../storage/ShuffleBlockFetcherIterator.scala | 14 +++--- .../ShuffleBlockFetcherIteratorSuite.scala | 45 +++++++++++++++++++ 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index 98b5a735a4529..dd9df74689a13 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -90,7 +90,7 @@ final class ShuffleBlockFetcherIterator( private[this] val startTime = System.currentTimeMillis /** Local blocks to fetch, excluding zero-sized blocks. */ - private[this] val localBlocks = new ArrayBuffer[BlockId]() + private[this] val localBlocks = scala.collection.mutable.LinkedHashSet[BlockId]() /** Remote blocks to fetch, excluding zero-sized blocks. */ private[this] val remoteBlocks = new HashSet[BlockId]() @@ -316,6 +316,7 @@ final class ShuffleBlockFetcherIterator( * track in-memory are the ManagedBuffer references themselves. 
*/ private[this] def fetchLocalBlocks() { + logDebug(s"Start fetching local blocks: ${localBlocks.mkString(", ")}") val iter = localBlocks.iterator while (iter.hasNext) { val blockId = iter.next() @@ -324,7 +325,8 @@ final class ShuffleBlockFetcherIterator( shuffleMetrics.incLocalBlocksFetched(1) shuffleMetrics.incLocalBytesRead(buf.size) buf.retain() - results.put(new SuccessFetchResult(blockId, blockManager.blockManagerId, 0, buf, false)) + results.put(new SuccessFetchResult(blockId, blockManager.blockManagerId, + buf.size(), buf, false)) } catch { case e: Exception => // If we see an exception, stop immediately. @@ -397,7 +399,9 @@ final class ShuffleBlockFetcherIterator( } shuffleMetrics.incRemoteBlocksFetched(1) } - bytesInFlight -= size + if (!localBlocks.contains(blockId)) { + bytesInFlight -= size + } if (isNetworkReqDone) { reqsInFlight -= 1 logDebug("Number of requests in flight " + reqsInFlight) @@ -583,8 +587,8 @@ object ShuffleBlockFetcherIterator { * Result of a fetch from a remote block successfully. * @param blockId block id * @param address BlockManager that the block was fetched from. - * @param size estimated size of the block, used to calculate bytesInFlight. - * Note that this is NOT the exact bytes. + * @param size estimated size of the block. Note that this is NOT the exact bytes. + * Size of remote block is used to calculate bytesInFlight. * @param buf `ManagedBuffer` for the content. * @param isNetworkReqDone Is this the last network request for this host in this fetch request. */ diff --git a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala index 5bfe9905ff17b..692ae3bf597e0 100644 --- a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala @@ -352,6 +352,51 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT intercept[FetchFailedException] { iterator.next() } } + test("big blocks are not checked for corruption") { + val corruptStream = mock(classOf[InputStream]) + when(corruptStream.read(any(), any(), any())).thenThrow(new IOException("corrupt")) + val corruptBuffer = mock(classOf[ManagedBuffer]) + when(corruptBuffer.createInputStream()).thenReturn(corruptStream) + doReturn(10000L).when(corruptBuffer).size() + + val blockManager = mock(classOf[BlockManager]) + val localBmId = BlockManagerId("test-client", "test-client", 1) + doReturn(localBmId).when(blockManager).blockManagerId + doReturn(corruptBuffer).when(blockManager).getBlockData(ShuffleBlockId(0, 0, 0)) + val localBlockLengths = Seq[Tuple2[BlockId, Long]]( + ShuffleBlockId(0, 0, 0) -> corruptBuffer.size() + ) + + val remoteBmId = BlockManagerId("test-client-1", "test-client-1", 2) + val remoteBlockLengths = Seq[Tuple2[BlockId, Long]]( + ShuffleBlockId(0, 1, 0) -> corruptBuffer.size() + ) + + val transfer = createMockTransfer( + Map(ShuffleBlockId(0, 0, 0) -> corruptBuffer, ShuffleBlockId(0, 1, 0) -> corruptBuffer)) + + val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long)])]( + (localBmId, localBlockLengths), + (remoteBmId, remoteBlockLengths) + ) + + val taskContext = TaskContext.empty() + val iterator = new ShuffleBlockFetcherIterator( + taskContext, + transfer, + blockManager, + blocksByAddress, + (_, in) => new LimitedInputStream(in, 10000), + 2048, + Int.MaxValue, + Int.MaxValue, + Int.MaxValue, + true) + // Blocks should be 
returned without exceptions. + assert(Set(iterator.next()._1, iterator.next()._1) === + Set(ShuffleBlockId(0, 0, 0), ShuffleBlockId(0, 1, 0))) + } + test("retry corrupt blocks (disabled)") { val blockManager = mock(classOf[BlockManager]) val localBmId = BlockManagerId("test-client", "test-client", 1) From 1dd37ff3b8c84c60858b159e745339ce19e53432 Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Wed, 7 Mar 2018 21:53:26 -0800 Subject: [PATCH 337/566] [SPARK-23490][BACKPORT][SQL] Check storage.locationUri with existing table in CreateTable Backport #20660 to branch 2.3 ===================================== ## What changes were proposed in this pull request? For CreateTable with Append mode, we should check if `storage.locationUri` is the same with existing table in `PreprocessTableCreation` In the current code, there is only a simple exception if the `storage.locationUri` is different with existing table: `org.apache.spark.sql.AnalysisException: Table or view not found:` which can be improved. ## How was this patch tested? Unit test Author: Wang Gengliang Closes #20766 from gengliangwang/backport_20660_to_2.3. --- .../sql/execution/datasources/rules.scala | 8 +++++ .../sql/execution/command/DDLSuite.scala | 29 +++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 5cc21eeaeaa94..0dea767840ed3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -118,6 +118,14 @@ case class PreprocessTableCreation(sparkSession: SparkSession) extends Rule[Logi s"`${existingProvider.getSimpleName}`. It doesn't match the specified format " + s"`${specifiedProvider.getSimpleName}`.") } + tableDesc.storage.locationUri match { + case Some(location) if location.getPath != existingTable.location.getPath => + throw new AnalysisException( + s"The location of the existing table ${tableIdentWithDB.quotedString} is " + + s"`${existingTable.location}`. 
It doesn't match the specified location " + + s"`${tableDesc.location}`.") + case _ => + } if (query.schema.length != existingTable.schema.length) { throw new AnalysisException( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 63afcb5f400b2..1426df50d95c0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -536,6 +536,35 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } + test("create table - append to a non-partitioned table created with different paths") { + import testImplicits._ + withTempDir { dir1 => + withTempDir { dir2 => + withTable("path_test") { + Seq(1L -> "a").toDF("v1", "v2") + .write + .mode(SaveMode.Append) + .format("json") + .option("path", dir1.getCanonicalPath) + .saveAsTable("path_test") + + val ex = intercept[AnalysisException] { + Seq((3L, "c")).toDF("v1", "v2") + .write + .mode(SaveMode.Append) + .format("json") + .option("path", dir2.getCanonicalPath) + .saveAsTable("path_test") + }.getMessage + assert(ex.contains("The location of the existing table `default`.`path_test`")) + + checkAnswer( + spark.table("path_test"), Row(1L, "a") :: Nil) + } + } + } + } + test("Refresh table after changing the data source table partitioning") { import testImplicits._ From 404f7e2013ecfdf993a17fd942d8890d9a8100e7 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Wed, 7 Mar 2018 21:58:57 -0800 Subject: [PATCH 338/566] [SPARK-23406][SS] Enable stream-stream self-joins for branch-2.3 This is a backport of #20598. ## What changes were proposed in this pull request? Solved two bugs to enable stream-stream self joins. ### Incorrect analysis due to missing MultiInstanceRelation trait Streaming leaf nodes did not extend MultiInstanceRelation, which is necessary for the catalyst analyzer to convert the self-join logical plan DAG into a tree (by creating new instances of the leaf relations). This was causing the error `Failure when resolving conflicting references in Join:` (see JIRA for details). ### Incorrect attribute rewrite when splicing batch plans in MicroBatchExecution When splicing the source's batch plan into the streaming plan (by replacing the StreamingExecutionPlan), we were rewriting the attribute reference in the streaming plan with the new attribute references from the batch plan. This was incorrectly handling the scenario when multiple StreamingExecutionRelation point to the same source, and therefore eventually point to the same batch plan returned by the source. Here is an example query, and its corresponding plan transformations. 
``` val df = input.toDF val join = df.select('value % 5 as "key", 'value).join( df.select('value % 5 as "key", 'value), "key") ``` Streaming logical plan before splicing the batch plan ``` Project [key#6, value#1, value#12] +- Join Inner, (key#6 = key#9) :- Project [(value#1 % 5) AS key#6, value#1] : +- StreamingExecutionRelation Memory[#1], value#1 +- Project [(value#12 % 5) AS key#9, value#12] +- StreamingExecutionRelation Memory[#1], value#12 // two different leaves pointing to same source ``` Batch logical plan after splicing the batch plan and before rewriting ``` Project [key#6, value#1, value#12] +- Join Inner, (key#6 = key#9) :- Project [(value#1 % 5) AS key#6, value#1] : +- LocalRelation [value#66] // replaces StreamingExecutionRelation Memory[#1], value#1 +- Project [(value#12 % 5) AS key#9, value#12] +- LocalRelation [value#66] // replaces StreamingExecutionRelation Memory[#1], value#12 ``` Batch logical plan after rewriting the attributes. Specifically, for spliced, the new output attributes (value#66) replace the earlier output attributes (value#12, and value#1, one for each StreamingExecutionRelation). ``` Project [key#6, value#66, value#66] // both value#1 and value#12 replaces by value#66 +- Join Inner, (key#6 = key#9) :- Project [(value#66 % 5) AS key#6, value#66] : +- LocalRelation [value#66] +- Project [(value#66 % 5) AS key#9, value#66] +- LocalRelation [value#66] ``` This causes the optimizer to eliminate value#66 from one side of the join. ``` Project [key#6, value#66, value#66] +- Join Inner, (key#6 = key#9) :- Project [(value#66 % 5) AS key#6, value#66] : +- LocalRelation [value#66] +- Project [(value#66 % 5) AS key#9] // this does not generate value, incorrect join results +- LocalRelation [value#66] ``` **Solution**: Instead of rewriting attributes, use a Project to introduce aliases between the output attribute references and the new reference generated by the spliced plans. The analyzer and optimizer will take care of the rest. ``` Project [key#6, value#1, value#12] +- Join Inner, (key#6 = key#9) :- Project [(value#1 % 5) AS key#6, value#1] : +- Project [value#66 AS value#1] // solution: project with aliases : +- LocalRelation [value#66] +- Project [(value#12 % 5) AS key#9, value#12] +- Project [value#66 AS value#12] // solution: project with aliases +- LocalRelation [value#66] ``` ## How was this patch tested? New unit test Author: Tathagata Das Closes #20765 from tdas/SPARK-23406-2.3. 
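For reference, a minimal end-to-end sketch of the query shape this backport enables, assuming a local SparkSession and the built-in `rate` source (the column names and options below are illustrative only, not taken from this patch):

```
import org.apache.spark.sql.SparkSession

object StreamStreamSelfJoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("self-join-sketch")
      .getOrCreate()
    import spark.implicits._

    // A single streaming source referenced twice: before this fix the analyzer failed with
    // "Failure when resolving conflicting references in Join" on exactly this pattern.
    val stream = spark.readStream.format("rate").option("rowsPerSecond", "5").load()
    val keyed  = stream.select(($"value" % 5).as("key"), $"value")

    val joined = keyed.join(keyed, "key")   // stream-stream inner self-join

    val query = joined.writeStream.format("console").start()
    query.awaitTermination(30000)           // run briefly for illustration
    query.stop()
  }
}
```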
--- .../streaming/MicroBatchExecution.scala | 16 ++++++------ .../streaming/StreamingRelation.scala | 20 ++++++++++----- .../sql/streaming/StreamingJoinSuite.scala | 25 ++++++++++++++++++- 3 files changed, 45 insertions(+), 16 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index 2e6ecb8fdbdfa..6a264ad708dae 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -24,8 +24,8 @@ import scala.collection.mutable.{ArrayBuffer, Map => MutableMap} import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.catalyst.encoders.RowEncoder -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp} -import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} +import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp} +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Project} import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.datasources.v2.{StreamingDataSourceV2Relation, WriteToDataSourceV2} import org.apache.spark.sql.execution.streaming.sources.{InternalRowMicroBatchWriter, MicroBatchWriter} @@ -410,8 +410,6 @@ class MicroBatchExecution( } } - // A list of attributes that will need to be updated. - val replacements = new ArrayBuffer[(Attribute, Attribute)] // Replace sources in the logical plan with data that has arrived since the last batch. val newBatchesPlan = logicalPlan transform { case StreamingExecutionRelation(source, output) => @@ -419,18 +417,18 @@ class MicroBatchExecution( assert(output.size == dataPlan.output.size, s"Invalid batch: ${Utils.truncatedString(output, ",")} != " + s"${Utils.truncatedString(dataPlan.output, ",")}") - replacements ++= output.zip(dataPlan.output) - dataPlan + + val aliases = output.zip(dataPlan.output).map { case (to, from) => + Alias(from, to.name)(exprId = to.exprId, explicitMetadata = Some(from.metadata)) + } + Project(aliases, dataPlan) }.getOrElse { LocalRelation(output, isStreaming = true) } } // Rewire the plan to use the new attributes that were returned by the source. 
- val replacementMap = AttributeMap(replacements) val newAttributePlan = newBatchesPlan transformAllExpressions { - case a: Attribute if replacementMap.contains(a) => - replacementMap(a).withMetadata(a.metadata) case ct: CurrentTimestamp => CurrentBatchTimestamp(offsetSeqMetadata.batchTimestampMs, ct.dataType) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala index 7146190645b37..f02d3a2c3733f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala @@ -20,9 +20,9 @@ package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.plans.logical.LeafNode -import org.apache.spark.sql.catalyst.plans.logical.Statistics +import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} import org.apache.spark.sql.execution.LeafExecNode import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2} @@ -42,7 +42,7 @@ object StreamingRelation { * passing to [[StreamExecution]] to run a query. */ case class StreamingRelation(dataSource: DataSource, sourceName: String, output: Seq[Attribute]) - extends LeafNode { + extends LeafNode with MultiInstanceRelation { override def isStreaming: Boolean = true override def toString: String = sourceName @@ -53,6 +53,8 @@ case class StreamingRelation(dataSource: DataSource, sourceName: String, output: override def computeStats(): Statistics = Statistics( sizeInBytes = BigInt(dataSource.sparkSession.sessionState.conf.defaultSizeInBytes) ) + + override def newInstance(): LogicalPlan = this.copy(output = output.map(_.newInstance())) } /** @@ -62,7 +64,7 @@ case class StreamingRelation(dataSource: DataSource, sourceName: String, output: case class StreamingExecutionRelation( source: BaseStreamingSource, output: Seq[Attribute])(session: SparkSession) - extends LeafNode { + extends LeafNode with MultiInstanceRelation { override def isStreaming: Boolean = true override def toString: String = source.toString @@ -74,6 +76,8 @@ case class StreamingExecutionRelation( override def computeStats(): Statistics = Statistics( sizeInBytes = BigInt(session.sessionState.conf.defaultSizeInBytes) ) + + override def newInstance(): LogicalPlan = this.copy(output = output.map(_.newInstance()))(session) } // We have to pack in the V1 data source as a shim, for the case when a source implements @@ -92,13 +96,15 @@ case class StreamingRelationV2( extraOptions: Map[String, String], output: Seq[Attribute], v1Relation: Option[StreamingRelation])(session: SparkSession) - extends LeafNode { + extends LeafNode with MultiInstanceRelation { override def isStreaming: Boolean = true override def toString: String = sourceName override def computeStats(): Statistics = Statistics( sizeInBytes = BigInt(session.sessionState.conf.defaultSizeInBytes) ) + + override def newInstance(): LogicalPlan = this.copy(output = output.map(_.newInstance()))(session) } /** @@ -108,7 +114,7 @@ case class ContinuousExecutionRelation( source: ContinuousReadSupport, extraOptions: 
Map[String, String], output: Seq[Attribute])(session: SparkSession) - extends LeafNode { + extends LeafNode with MultiInstanceRelation { override def isStreaming: Boolean = true override def toString: String = source.toString @@ -120,6 +126,8 @@ case class ContinuousExecutionRelation( override def computeStats(): Statistics = Statistics( sizeInBytes = BigInt(session.sessionState.conf.defaultSizeInBytes) ) + + override def newInstance(): LogicalPlan = this.copy(output = output.map(_.newInstance()))(session) } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala index 54eb863dacc83..92087f68ad74a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala @@ -28,7 +28,9 @@ import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SparkSession} import org.apache.spark.sql.catalyst.analysis.StreamingJoinHelper import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet, Literal} import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, Filter} -import org.apache.spark.sql.execution.LogicalRDD +import org.apache.spark.sql.catalyst.trees.TreeNode +import org.apache.spark.sql.execution.{FileSourceScanExec, LogicalRDD} +import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.execution.streaming.{MemoryStream, StatefulOperatorStateInfo, StreamingSymmetricHashJoinHelper} import org.apache.spark.sql.execution.streaming.state.{StateStore, StateStoreProviderId} import org.apache.spark.sql.functions._ @@ -323,6 +325,27 @@ class StreamingInnerJoinSuite extends StreamTest with StateStoreMetricsTest with assert(e.toString.contains("Stream stream joins without equality predicate is not supported")) } + test("stream stream self join") { + val input = MemoryStream[Int] + val df = input.toDF + val join = + df.select('value % 5 as "key", 'value).join( + df.select('value % 5 as "key", 'value), "key") + + testStream(join)( + AddData(input, 1, 2), + CheckAnswer((1, 1, 1), (2, 2, 2)), + StopStream, + StartStream(), + AddData(input, 3, 6), + /* + (1, 1) (1, 1) + (2, 2) x (2, 2) = (1, 1, 1), (1, 1, 6), (2, 2, 2), (1, 6, 1), (1, 6, 6) + (1, 6) (1, 6) + */ + CheckAnswer((3, 3, 3), (1, 1, 1), (1, 1, 6), (2, 2, 2), (1, 6, 1), (1, 6, 6))) + } + test("locality preferences of StateStoreAwareZippedRDD") { import StreamingSymmetricHashJoinHelper._ From 8ff8e16e2da4aa5457e4b1e5d575bd1a3a1f0358 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 9 Mar 2018 21:41:23 +0900 Subject: [PATCH 339/566] [SPARK-23436][SQL][BACKPORT-2.3] Infer partition as Date only if it can be casted to Date This PR is to backport https://github.com/apache/spark/pull/20621 to branch 2.3 --- ## What changes were proposed in this pull request? Before the patch, Spark could infer as Date a partition value which cannot be casted to Date (this can happen when there are extra characters after a valid date, like `2018-02-15AAA`). When this happens and the input format has metadata which define the schema of the table, then `null` is returned as a value for the partition column, because the `cast` operator used in (`PartitioningAwareFileIndex.inferPartitioning`) is unable to convert the value. The PR checks in the partition inference that values can be casted to Date and Timestamp, in order to infer that datatype to them. 
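The key subtlety is that `DateFormat.parse` succeeding is not a reliable signal on its own, because it silently ignores trailing characters. A minimal standalone sketch of that behaviour (assuming a plain `yyyy-MM-dd` pattern; this is illustration, not code from the patch):

```
import java.text.SimpleDateFormat

object LenientDateParseSketch {
  def main(args: Array[String]): Unit = {
    val fmt = new SimpleDateFormat("yyyy-MM-dd")
    // parse(String) only needs the *beginning* of the text to match the pattern,
    // so both calls below return a java.util.Date for 2018-01-01:
    println(fmt.parse("2018-01-01"))
    println(fmt.parse("2018-01-01-04"))  // trailing "-04" is silently ignored
    // Casting "2018-01-01-04" to DateType, however, evaluates to null, which is why the
    // patch additionally requires the cast result to be non-null before inferring DateType.
  }
}
```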
## How was this patch tested? added UT Author: Marco Gaido Closes #20764 from gatorsmile/backport23436. --- .../datasources/PartitioningUtils.scala | 40 ++++++++++++++----- .../ParquetPartitionDiscoverySuite.scala | 14 +++++++ 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index 472bf82d3604d..379acb67f7c71 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -407,6 +407,34 @@ object PartitioningUtils { Literal(bigDecimal) } + val dateTry = Try { + // try and parse the date, if no exception occurs this is a candidate to be resolved as + // DateType + DateTimeUtils.getThreadLocalDateFormat.parse(raw) + // SPARK-23436: Casting the string to date may still return null if a bad Date is provided. + // This can happen since DateFormat.parse may not use the entire text of the given string: + // so if there are extra-characters after the date, it returns correctly. + // We need to check that we can cast the raw string since we later can use Cast to get + // the partition values with the right DataType (see + // org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex.inferPartitioning) + val dateValue = Cast(Literal(raw), DateType).eval() + // Disallow DateType if the cast returned null + require(dateValue != null) + Literal.create(dateValue, DateType) + } + + val timestampTry = Try { + val unescapedRaw = unescapePathName(raw) + // try and parse the date, if no exception occurs this is a candidate to be resolved as + // TimestampType + DateTimeUtils.getThreadLocalTimestampFormat(timeZone).parse(unescapedRaw) + // SPARK-23436: see comment for date + val timestampValue = Cast(Literal(unescapedRaw), TimestampType, Some(timeZone.getID)).eval() + // Disallow TimestampType if the cast returned null + require(timestampValue != null) + Literal.create(timestampValue, TimestampType) + } + if (typeInference) { // First tries integral types Try(Literal.create(Integer.parseInt(raw), IntegerType)) @@ -415,16 +443,8 @@ object PartitioningUtils { // Then falls back to fractional types .orElse(Try(Literal.create(JDouble.parseDouble(raw), DoubleType))) // Then falls back to date/timestamp types - .orElse(Try( - Literal.create( - DateTimeUtils.getThreadLocalTimestampFormat(timeZone) - .parse(unescapePathName(raw)).getTime * 1000L, - TimestampType))) - .orElse(Try( - Literal.create( - DateTimeUtils.millisToDays( - DateTimeUtils.getThreadLocalDateFormat.parse(raw).getTime), - DateType))) + .orElse(timestampTry) + .orElse(dateTry) // Then falls back to string .getOrElse { if (raw == DEFAULT_PARTITION_NAME) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index d4902641e335f..edb3da904d10d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -1120,4 +1120,18 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha Row(3, BigDecimal("2" * 30)) :: Nil) } } + + 
test("SPARK-23436: invalid Dates should be inferred as String in partition inference") { + withTempPath { path => + val data = Seq(("1", "2018-01", "2018-01-01-04", "test")) + .toDF("id", "date_month", "date_hour", "data") + + data.write.partitionBy("date_month", "date_hour").parquet(path.getAbsolutePath) + val input = spark.read.parquet(path.getAbsolutePath).select("id", + "date_month", "date_hour", "data") + + assert(input.schema.sameType(input.schema)) + checkAnswer(input, data) + } + } } From bc5ce047658272eb48d744e8d069c6cccdf37682 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 9 Mar 2018 10:36:38 -0800 Subject: [PATCH 340/566] [SPARK-23630][YARN] Allow user's hadoop conf customizations to take effect. This change restores functionality that was inadvertently removed as part of the fix for SPARK-22372. Also modified an existing unit test to make sure the feature works as intended. Author: Marcelo Vanzin Closes #20776 from vanzin/SPARK-23630. (cherry picked from commit 2c3673680e16f88f1d1cd73a3f7445ded5b3daa8) Signed-off-by: Marcelo Vanzin --- .../apache/spark/deploy/SparkHadoopUtil.scala | 11 +++++- .../org/apache/spark/deploy/yarn/Client.scala | 14 ++++---- .../spark/deploy/yarn/YarnClusterSuite.scala | 34 ++++++++++++++----- 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index e14f9845e6db6..177295fb7af0f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -111,7 +111,9 @@ class SparkHadoopUtil extends Logging { * subsystems. */ def newConfiguration(conf: SparkConf): Configuration = { - SparkHadoopUtil.newConfiguration(conf) + val hadoopConf = SparkHadoopUtil.newConfiguration(conf) + hadoopConf.addResource(SparkHadoopUtil.SPARK_HADOOP_CONF_FILE) + hadoopConf } /** @@ -435,6 +437,13 @@ object SparkHadoopUtil { */ private[spark] val UPDATE_INPUT_METRICS_INTERVAL_RECORDS = 1000 + /** + * Name of the file containing the gateway's Hadoop configuration, to be overlayed on top of the + * cluster's Hadoop config. It is up to the Spark code launching the application to create + * this file if it's desired. If the file doesn't exist, it will just be ignored. + */ + private[spark] val SPARK_HADOOP_CONF_FILE = "__spark_hadoop_conf__.xml" + def get: SparkHadoopUtil = instance /** diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 8cd3cd9746a3a..28087dee831d1 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -696,7 +696,13 @@ private[spark] class Client( } } - Seq("HADOOP_CONF_DIR", "YARN_CONF_DIR").foreach { envKey => + // SPARK-23630: during testing, Spark scripts filter out hadoop conf dirs so that user's + // environments do not interfere with tests. This allows a special env variable during + // tests so that custom conf dirs can be used by unit tests. 
+ val confDirs = Seq("HADOOP_CONF_DIR", "YARN_CONF_DIR") ++ + (if (Utils.isTesting) Seq("SPARK_TEST_HADOOP_CONF_DIR") else Nil) + + confDirs.foreach { envKey => sys.env.get(envKey).foreach { path => val dir = new File(path) if (dir.isDirectory()) { @@ -753,7 +759,7 @@ private[spark] class Client( // Save the YARN configuration into a separate file that will be overlayed on top of the // cluster's Hadoop conf. - confStream.putNextEntry(new ZipEntry(SPARK_HADOOP_CONF_FILE)) + confStream.putNextEntry(new ZipEntry(SparkHadoopUtil.SPARK_HADOOP_CONF_FILE)) hadoopConf.writeXml(confStream) confStream.closeEntry() @@ -1220,10 +1226,6 @@ private object Client extends Logging { // Name of the file in the conf archive containing Spark configuration. val SPARK_CONF_FILE = "__spark_conf__.properties" - // Name of the file containing the gateway's Hadoop configuration, to be overlayed on top of the - // cluster's Hadoop config. - val SPARK_HADOOP_CONF_FILE = "__spark_hadoop_conf__.xml" - // Subdirectory where the user's python files (not archives) will be placed. val LOCALIZED_PYTHON_DIR = "__pyfiles__" diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index 5003326b440bf..33d400a5b1b2e 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -114,12 +114,25 @@ class YarnClusterSuite extends BaseYarnClusterSuite { )) } - test("yarn-cluster should respect conf overrides in SparkHadoopUtil (SPARK-16414)") { + test("yarn-cluster should respect conf overrides in SparkHadoopUtil (SPARK-16414, SPARK-23630)") { + // Create a custom hadoop config file, to make sure it's contents are propagated to the driver. 
+ val customConf = Utils.createTempDir() + val coreSite = """ + | + | + | spark.test.key + | testvalue + | + | + |""".stripMargin + Files.write(coreSite, new File(customConf, "core-site.xml"), StandardCharsets.UTF_8) + val result = File.createTempFile("result", null, tempDir) val finalState = runSpark(false, mainClassName(YarnClusterDriverUseSparkHadoopUtilConf.getClass), - appArgs = Seq("key=value", result.getAbsolutePath()), - extraConf = Map("spark.hadoop.key" -> "value")) + appArgs = Seq("key=value", "spark.test.key=testvalue", result.getAbsolutePath()), + extraConf = Map("spark.hadoop.key" -> "value"), + extraEnv = Map("SPARK_TEST_HADOOP_CONF_DIR" -> customConf.getAbsolutePath())) checkResult(finalState, result) } @@ -319,13 +332,13 @@ private object YarnClusterDriverWithFailure extends Logging with Matchers { private object YarnClusterDriverUseSparkHadoopUtilConf extends Logging with Matchers { def main(args: Array[String]): Unit = { - if (args.length != 2) { + if (args.length < 2) { // scalastyle:off println System.err.println( s""" |Invalid command line: ${args.mkString(" ")} | - |Usage: YarnClusterDriverUseSparkHadoopUtilConf [hadoopConfKey=value] [result file] + |Usage: YarnClusterDriverUseSparkHadoopUtilConf [hadoopConfKey=value]+ [result file] """.stripMargin) // scalastyle:on println System.exit(1) @@ -335,11 +348,16 @@ private object YarnClusterDriverUseSparkHadoopUtilConf extends Logging with Matc .set("spark.extraListeners", classOf[SaveExecutorInfo].getName) .setAppName("yarn test using SparkHadoopUtil's conf")) - val kv = args(0).split("=") - val status = new File(args(1)) + val kvs = args.take(args.length - 1).map { kv => + val parsed = kv.split("=") + (parsed(0), parsed(1)) + } + val status = new File(args.last) var result = "failure" try { - SparkHadoopUtil.get.conf.get(kv(0)) should be (kv(1)) + kvs.foreach { case (k, v) => + SparkHadoopUtil.get.conf.get(k) should be (v) + } result = "success" } finally { Files.write(result, status, StandardCharsets.UTF_8) From 3ec25d5a803888e5e24a47a511e9d88c423c5310 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 9 Mar 2018 13:48:12 -0800 Subject: [PATCH 341/566] [SPARK-23628][SQL][BACKPORT-2.3] calculateParamLength should not return 1 + num of expressions ## What changes were proposed in this pull request? Backport of ea480990e726aed59750f1cea8d40adba56d991a to branch 2.3. ## How was this patch tested? added UT cc cloud-fan hvanhovell Author: Marco Gaido Closes #20783 from mgaido91/SPARK-23628_2.3. --- .../expressions/codegen/CodeGenerator.scala | 7 ++++--- .../expressions/CodeGenerationSuite.scala | 6 ++++++ .../sql/execution/WholeStageCodegenSuite.scala | 16 ++++++++-------- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 4dcbb702893da..a54af0309f667 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -1253,14 +1253,15 @@ class CodegenContext { */ def calculateParamLength(params: Seq[Expression]): Int = { def paramLengthForExpr(input: Expression): Int = { - // For a nullable expression, we need to pass in an extra boolean parameter. 
- (if (input.nullable) 1 else 0) + javaType(input.dataType) match { + val javaParamLength = javaType(input.dataType) match { case JAVA_LONG | JAVA_DOUBLE => 2 case _ => 1 } + // For a nullable expression, we need to pass in an extra boolean parameter. + (if (input.nullable) 1 else 0) + javaParamLength } // Initial value is 1 for `this`. - 1 + params.map(paramLengthForExpr(_)).sum + 1 + params.map(paramLengthForExpr).sum } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index 676ba3956ddc8..3958bff53aa9c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -436,4 +436,10 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { ctx.addImmutableStateIfNotExists("String", mutableState2) assert(ctx.inlinedMutableStates.length == 2) } + + test("SPARK-23628: calculateParamLength should compute properly the param length") { + val ctx = new CodegenContext + assert(ctx.calculateParamLength(Seq.range(0, 100).map(Literal(_))) == 101) + assert(ctx.calculateParamLength(Seq.range(0, 100).map(x => Literal(x.toLong))) == 201) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index ef16292a8e75c..0fb9dd2017a09 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec import org.apache.spark.sql.execution.joins.SortMergeJoinExec import org.apache.spark.sql.expressions.scalalang.typed -import org.apache.spark.sql.functions.{avg, broadcast, col, max} +import org.apache.spark.sql.functions.{avg, broadcast, col, lit, max} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{IntegerType, StringType, StructType} @@ -249,12 +249,12 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext { } test("Skip splitting consume function when parameter number exceeds JVM limit") { - import testImplicits._ - - Seq((255, false), (254, true)).foreach { case (columnNum, hasSplit) => + // since every field is nullable we have 2 params for each input column (one for the value + // and one for the isNull variable) + Seq((128, false), (127, true)).foreach { case (columnNum, hasSplit) => withTempPath { dir => val path = dir.getCanonicalPath - spark.range(10).select(Seq.tabulate(columnNum) {i => ('id + i).as(s"c$i")} : _*) + spark.range(10).select(Seq.tabulate(columnNum) {i => lit(i).as(s"c$i")} : _*) .write.mode(SaveMode.Overwrite).parquet(path) withSQLConf(SQLConf.WHOLESTAGE_MAX_NUM_FIELDS.key -> "255", @@ -263,10 +263,10 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext { val df = spark.read.parquet(path).selectExpr(projection: _*) val plan = df.queryExecution.executedPlan - val wholeStageCodeGenExec = plan.find(p => p match { - case wp: WholeStageCodegenExec => true + val wholeStageCodeGenExec = plan.find { + case _: WholeStageCodegenExec => true case _ => false - 
}) + } assert(wholeStageCodeGenExec.isDefined) val code = wholeStageCodeGenExec.get.asInstanceOf[WholeStageCodegenExec].doCodeGen()._2 assert(code.body.contains("project_doConsume") == hasSplit) From b083bd107d25bd3f7a4cdcf3aafa07b9895878b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=C5=9Awitakowski?= Date: Fri, 9 Mar 2018 14:29:31 -0800 Subject: [PATCH 342/566] [SPARK-23173][SQL] Avoid creating corrupt parquet files when loading data from JSON MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? The from_json() function accepts an additional parameter, where the user might specify the schema. The issue is that the specified schema might not be compatible with data. In particular, the JSON data might be missing data for fields declared as non-nullable in the schema. The from_json() function does not verify the data against such errors. When data with missing fields is sent to the parquet encoder, there is no verification either. The end results is a corrupt parquet file. To avoid corruptions, make sure that all fields in the user-specified schema are set to be nullable. Since this changes the behavior of a public function, we need to include it in release notes. The behavior can be reverted by setting `spark.sql.fromJsonForceNullableSchema=false` ## How was this patch tested? Added two new tests. Author: Michał Świtakowski Closes #20694 from mswit-databricks/SPARK-23173. (cherry picked from commit 2ca9bb083c515511d2bfee271fc3e0269aceb9d5) Signed-off-by: gatorsmile --- .../expressions/jsonExpressions.scala | 22 +++++++++----- .../apache/spark/sql/internal/SQLConf.scala | 8 +++++ .../expressions/JsonExpressionsSuite.scala | 30 ++++++++++++++++++- .../datasources/parquet/ParquetIOSuite.scala | 19 ++++++++++++ 4 files changed, 70 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 18b4fed597447..fdd672c416a03 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.json._ import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, BadRecordException, FailFastMode, GenericArrayData, MapData} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils @@ -515,10 +516,15 @@ case class JsonToStructs( child: Expression, timeZoneId: Option[String] = None) extends UnaryExpression with TimeZoneAwareExpression with CodegenFallback with ExpectsInputTypes { - override def nullable: Boolean = true - def this(schema: DataType, options: Map[String, String], child: Expression) = - this(schema, options, child, None) + val forceNullableSchema = SQLConf.get.getConf(SQLConf.FROM_JSON_FORCE_NULLABLE_SCHEMA) + + // The JSON input data might be missing certain fields. We force the nullability + // of the user-provided schema to avoid data corruptions. In particular, the parquet-mr encoder + // can generate incorrect files if values are missing in columns declared as non-nullable. 
+ val nullableSchema = if (forceNullableSchema) schema.asNullable else schema + + override def nullable: Boolean = true // Used in `FunctionRegistry` def this(child: Expression, schema: Expression) = @@ -535,22 +541,22 @@ case class JsonToStructs( child = child, timeZoneId = None) - override def checkInputDataTypes(): TypeCheckResult = schema match { + override def checkInputDataTypes(): TypeCheckResult = nullableSchema match { case _: StructType | ArrayType(_: StructType, _) => super.checkInputDataTypes() case _ => TypeCheckResult.TypeCheckFailure( - s"Input schema ${schema.simpleString} must be a struct or an array of structs.") + s"Input schema ${nullableSchema.simpleString} must be a struct or an array of structs.") } @transient - lazy val rowSchema = schema match { + lazy val rowSchema = nullableSchema match { case st: StructType => st case ArrayType(st: StructType, _) => st } // This converts parsed rows to the desired output by the given schema. @transient - lazy val converter = schema match { + lazy val converter = nullableSchema match { case _: StructType => (rows: Seq[InternalRow]) => if (rows.length == 1) rows.head else null case ArrayType(_: StructType, _) => @@ -563,7 +569,7 @@ case class JsonToStructs( rowSchema, new JSONOptions(options + ("mode" -> FailFastMode.name), timeZoneId.get)) - override def dataType: DataType = schema + override def dataType: DataType = nullableSchema override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = copy(timeZoneId = Option(timeZoneId)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 47fcf34b2cd3d..eebb4c754819d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -481,6 +481,14 @@ object SQLConf { .stringConf .createWithDefault("_corrupt_record") + val FROM_JSON_FORCE_NULLABLE_SCHEMA = buildConf("spark.sql.fromJsonForceNullableSchema") + .internal() + .doc("When true, force the output schema of the from_json() function to be nullable " + + "(including all the fields). 
Otherwise, the schema might not be compatible with" + + "actual data, which leads to curruptions.") + .booleanConf + .createWithDefault(true) + val BROADCAST_TIMEOUT = buildConf("spark.sql.broadcastTimeout") .doc("Timeout in seconds for the broadcast wait time in broadcast joins.") .timeConf(TimeUnit.SECONDS) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala index a0bbe02f92354..7812319756eae 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala @@ -22,11 +22,13 @@ import java.util.Calendar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors.TreeNodeException +import org.apache.spark.sql.catalyst.plans.PlanTestBase import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeTestUtils, DateTimeUtils, GenericArrayData, PermissiveMode} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { +class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with PlanTestBase { val json = """ |{"store":{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}], @@ -680,4 +682,30 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { ) } } + + test("from_json missing fields") { + for (forceJsonNullableSchema <- Seq(false, true)) { + withSQLConf(SQLConf.FROM_JSON_FORCE_NULLABLE_SCHEMA.key -> forceJsonNullableSchema.toString) { + val input = + """{ + | "a": 1, + | "c": "foo" + |} + |""".stripMargin + val jsonSchema = new StructType() + .add("a", LongType, nullable = false) + .add("b", StringType, nullable = false) + .add("c", StringType, nullable = false) + val output = InternalRow(1L, null, UTF8String.fromString("foo")) + checkEvaluation( + JsonToStructs(jsonSchema, Map.empty, Literal.create(input, StringType), gmtId), + output + ) + val schema = JsonToStructs(jsonSchema, Map.empty, Literal.create(input, StringType), gmtId) + .dataType + val schemaToCompare = if (forceJsonNullableSchema) jsonSchema.asNullable else jsonSchema + assert(schemaToCompare == schema) + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index f3ece5b15e26a..e4e0e6e68403e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -43,6 +43,7 @@ import org.apache.spark.sql.catalyst.{InternalRow, ScalaReflection} import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeRow} import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol +import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ @@ -771,6 +772,24 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { 
assert(option.compressionCodecClassName == "UNCOMPRESSED") } } + + test("SPARK-23173 Writing a file with data converted from JSON with and incorrect user schema") { + withTempPath { file => + val jsonData = + """{ + | "a": 1, + | "c": "foo" + |} + |""".stripMargin + val jsonSchema = new StructType() + .add("a", LongType, nullable = false) + .add("b", StringType, nullable = false) + .add("c", StringType, nullable = false) + spark.range(1).select(from_json(lit(jsonData), jsonSchema) as "input") + .write.parquet(file.getAbsolutePath) + checkAnswer(spark.read.parquet(file.getAbsolutePath), Seq(Row(Row(1, null, "foo")))) + } + } } class JobCommitFailureParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext) From 5bd306c3896f7967243f7f3be30e4f095b51e0fe Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Fri, 9 Mar 2018 15:41:19 -0800 Subject: [PATCH 343/566] [SPARK-23624][SQL] Revise doc of method pushFilters in Datasource V2 ## What changes were proposed in this pull request? Revise doc of method pushFilters in SupportsPushDownFilters/SupportsPushDownCatalystFilters In `FileSourceStrategy`, except `partitionKeyFilters`(the references of which is subset of partition keys), all filters needs to be evaluated after scanning. Otherwise, Spark will get wrong result from data sources like Orc/Parquet. This PR is to improve the doc. Author: Wang Gengliang Closes #20769 from gengliangwang/revise_pushdown_doc. (cherry picked from commit 10b0657b035641ce735055bba2c8459e71bc2400) Signed-off-by: gatorsmile --- .../sql/sources/v2/reader/SupportsPushDownCatalystFilters.java | 2 +- .../spark/sql/sources/v2/reader/SupportsPushDownFilters.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownCatalystFilters.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownCatalystFilters.java index 98224102374aa..290d614805ac7 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownCatalystFilters.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownCatalystFilters.java @@ -34,7 +34,7 @@ public interface SupportsPushDownCatalystFilters extends DataSourceReader { /** - * Pushes down filters, and returns unsupported filters. + * Pushes down filters, and returns filters that need to be evaluated after scanning. */ Expression[] pushCatalystFilters(Expression[] filters); diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java index f35c711b0387a..1cff024232a44 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java @@ -32,7 +32,7 @@ public interface SupportsPushDownFilters extends DataSourceReader { /** - * Pushes down filters, and returns unsupported filters. + * Pushes down filters, and returns filters that need to be evaluated after scanning. */ Filter[] pushFilters(Filter[] filters); From 265e61ee96d7dae41711df10b21c571c79f003b7 Mon Sep 17 00:00:00 2001 From: DylanGuedes Date: Sat, 10 Mar 2018 19:48:29 +0900 Subject: [PATCH 344/566] [PYTHON] Changes input variable to not conflict with built-in function Signed-off-by: DylanGuedes ## What changes were proposed in this pull request? 
Changes variable name conflict: [input is a built-in python function](https://stackoverflow.com/questions/20670732/is-input-a-keyword-in-python). ## How was this patch tested? I runned the example and it works fine. Author: DylanGuedes Closes #20775 from DylanGuedes/input_variable. (cherry picked from commit b6f837c9d3cb0f76f0a52df37e34aea8944f6867) Signed-off-by: hyukjinkwon --- examples/src/main/python/ml/dataframe_example.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/src/main/python/ml/dataframe_example.py b/examples/src/main/python/ml/dataframe_example.py index 109f901012c9c..ee162fa582b10 100644 --- a/examples/src/main/python/ml/dataframe_example.py +++ b/examples/src/main/python/ml/dataframe_example.py @@ -17,7 +17,7 @@ """ An example of how to use DataFrame for ML. Run with:: - bin/spark-submit examples/src/main/python/ml/dataframe_example.py + bin/spark-submit examples/src/main/python/ml/dataframe_example.py """ from __future__ import print_function @@ -35,18 +35,18 @@ print("Usage: dataframe_example.py ", file=sys.stderr) exit(-1) elif len(sys.argv) == 2: - input = sys.argv[1] + input_path = sys.argv[1] else: - input = "data/mllib/sample_libsvm_data.txt" + input_path = "data/mllib/sample_libsvm_data.txt" spark = SparkSession \ .builder \ .appName("DataFrameExample") \ .getOrCreate() - # Load input data - print("Loading LIBSVM file with UDT from " + input + ".") - df = spark.read.format("libsvm").load(input).cache() + # Load an input file + print("Loading LIBSVM file with UDT from " + input_path + ".") + df = spark.read.format("libsvm").load(input_path).cache() print("Schema from LIBSVM:") df.printSchema() print("Loaded training data as a DataFrame with " + From a8e357ada8a52b9ec239f41de7ac0c0225c76050 Mon Sep 17 00:00:00 2001 From: Xiayun Sun Date: Mon, 12 Mar 2018 22:13:28 +0900 Subject: [PATCH 345/566] [SPARK-23462][SQL] improve missing field error message in `StructType` ## What changes were proposed in this pull request? The error message ```s"""Field "$name" does not exist."""``` is thrown when looking up an unknown field in StructType. In the error message, we should also contain the information about which columns/fields exist in this struct. ## How was this patch tested? Added new unit tests. Note: I created a new `StructTypeSuite.scala` as I couldn't find an existing suite that's suitable to place these tests. I may be missing something so feel free to propose new locations. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Xiayun Sun Closes #20649 from xysun/SPARK-23462. 
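
For reference, a minimal sketch of the user-facing lookup that now produces the richer message (illustrative object name; not part of the change):

```
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}

object StructTypeErrorMessageSketch {
  def main(args: Array[String]): Unit = {
    val schema = new StructType()
      .add("a", IntegerType)
      .add("b", StringType)

    try {
      schema("c")   // unknown field
    } catch {
      case e: IllegalArgumentException =>
        // Before this change the message was just: Field "c" does not exist.
        // After this change it also lists the fields that do exist:
        //   Field "c" does not exist.
        //   Available fields: a, b
        println(e.getMessage)
    }
  }
}
```
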
(cherry picked from commit b304e07e0671faf96530f9d8f49c55a83b07fa15) Signed-off-by: hyukjinkwon --- .../apache/spark/sql/types/StructType.scala | 11 +++-- .../spark/sql/types/StructTypeSuite.scala | 40 +++++++++++++++++++ 2 files changed, 48 insertions(+), 3 deletions(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/types/StructTypeSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala index e3b0969283a84..68d5f52f251a4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala @@ -264,7 +264,9 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru */ def apply(name: String): StructField = { nameToField.getOrElse(name, - throw new IllegalArgumentException(s"""Field "$name" does not exist.""")) + throw new IllegalArgumentException( + s"""Field "$name" does not exist. + |Available fields: ${fieldNames.mkString(", ")}""".stripMargin)) } /** @@ -277,7 +279,8 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru val nonExistFields = names -- fieldNamesSet if (nonExistFields.nonEmpty) { throw new IllegalArgumentException( - s"Field ${nonExistFields.mkString(",")} does not exist.") + s"""Nonexistent field(s): ${nonExistFields.mkString(", ")}. + |Available fields: ${fieldNames.mkString(", ")}""".stripMargin) } // Preserve the original order of fields. StructType(fields.filter(f => names.contains(f.name))) @@ -290,7 +293,9 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru */ def fieldIndex(name: String): Int = { nameToIndex.getOrElse(name, - throw new IllegalArgumentException(s"""Field "$name" does not exist.""")) + throw new IllegalArgumentException( + s"""Field "$name" does not exist. + |Available fields: ${fieldNames.mkString(", ")}""".stripMargin)) } private[sql] def getFieldIndex(name: String): Option[Int] = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/StructTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/StructTypeSuite.scala new file mode 100644 index 0000000000000..c6ca8bb005429 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/StructTypeSuite.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.types + +import org.apache.spark.SparkFunSuite + +class StructTypeSuite extends SparkFunSuite { + + val s = StructType.fromDDL("a INT, b STRING") + + test("lookup a single missing field should output existing fields") { + val e = intercept[IllegalArgumentException](s("c")).getMessage + assert(e.contains("Available fields: a, b")) + } + + test("lookup a set of missing fields should output existing fields") { + val e = intercept[IllegalArgumentException](s(Set("a", "c"))).getMessage + assert(e.contains("Available fields: a, b")) + } + + test("lookup fieldIndex for missing field should output existing fields") { + val e = intercept[IllegalArgumentException](s.fieldIndex("c")).getMessage + assert(e.contains("Available fields: a, b")) + } +} From 33ba8db8d8c8bf388606a6f5e34b082469038205 Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Mon, 12 Mar 2018 21:47:38 -0700 Subject: [PATCH 346/566] [SPARK-23523][SQL][BACKPORT-2.3] Fix the incorrect result caused by the rule OptimizeMetadataOnlyQuery This PR is to backport https://github.com/apache/spark/pull/20684 and https://github.com/apache/spark/pull/20693 to Spark 2.3 branch --- ## What changes were proposed in this pull request? ```Scala val tablePath = new File(s"${path.getCanonicalPath}/cOl3=c/cOl1=a/cOl5=e") Seq(("a", "b", "c", "d", "e")).toDF("cOl1", "cOl2", "cOl3", "cOl4", "cOl5") .write.json(tablePath.getCanonicalPath) val df = spark.read.json(path.getCanonicalPath).select("CoL1", "CoL5", "CoL3").distinct() df.show() ``` It generates a wrong result. ``` [c,e,a] ``` We have a bug in the rule `OptimizeMetadataOnlyQuery `. We should respect the attribute order in the original leaf node. This PR is to fix it. ## How was this patch tested? Added a test case Author: Xingbo Jiang Author: gatorsmile Closes #20763 from gatorsmile/backport23523. --- .../plans/logical/LocalRelation.scala | 9 ++++---- .../execution/OptimizeMetadataOnlyQuery.scala | 12 ++++++++-- .../datasources/HadoopFsRelation.scala | 3 +++ .../OptimizeMetadataOnlyQuerySuite.scala | 22 +++++++++++++++++++ 4 files changed, 40 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala index d73d7e73f28d5..b05508db786ad 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala @@ -43,10 +43,11 @@ object LocalRelation { } } -case class LocalRelation(output: Seq[Attribute], - data: Seq[InternalRow] = Nil, - // Indicates whether this relation has data from a streaming source. - override val isStreaming: Boolean = false) +case class LocalRelation( + output: Seq[Attribute], + data: Seq[InternalRow] = Nil, + // Indicates whether this relation has data from a streaming source. + override val isStreaming: Boolean = false) extends LeafNode with analysis.MultiInstanceRelation { // A local relation must have resolved output. 
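
The core of the fix is to resolve each partition column through a case-insensitive name lookup while preserving the order of `partitionColumnNames`, instead of filtering `relation.output` (which silently keeps the output order and can pair the wrong attribute with each partition value). A standalone sketch of that lookup, using a stand-in case class in place of Catalyst's `Attribute` and `IllegalArgumentException` in place of `AnalysisException` (all names illustrative):

```
import java.util.Locale

object PartitionAttrLookupSketch {
  // Stand-in for Catalyst's Attribute: only the name matters for this sketch.
  case class Attr(name: String)

  /** Resolve each partition column to its attribute, preserving the requested order. */
  def getPartitionAttrs(partitionColumnNames: Seq[String], output: Seq[Attr]): Seq[Attr] = {
    val attrMap = output.map(a => a.name.toLowerCase(Locale.ROOT) -> a).toMap
    partitionColumnNames.map { colName =>
      attrMap.getOrElse(colName.toLowerCase(Locale.ROOT),
        throw new IllegalArgumentException(
          s"Unable to find the column `$colName` given [${output.map(_.name).mkString(", ")}]"))
    }
  }

  def main(args: Array[String]): Unit = {
    // Output order of the relation differs from the partition-column order in the query.
    val output = Seq(Attr("cOl1"), Attr("cOl5"), Attr("cOl3"))
    // The buggy version (filter by a lowercased Set) returned attributes in output order,
    // so a query projecting (CoL3, CoL1, CoL5) could be answered with misaligned columns.
    println(getPartitionAttrs(Seq("CoL3", "CoL1", "CoL5"), output))
    // -> List(Attr(cOl3), Attr(cOl1), Attr(cOl5))
  }
}
```
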
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala index 18f6f697bc857..dc4aff9f12580 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala @@ -17,6 +17,9 @@ package org.apache.spark.sql.execution +import java.util.Locale + +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.catalog.{HiveTableRelation, SessionCatalog} import org.apache.spark.sql.catalyst.expressions._ @@ -80,8 +83,13 @@ case class OptimizeMetadataOnlyQuery(catalog: SessionCatalog) extends Rule[Logic private def getPartitionAttrs( partitionColumnNames: Seq[String], relation: LogicalPlan): Seq[Attribute] = { - val partColumns = partitionColumnNames.map(_.toLowerCase).toSet - relation.output.filter(a => partColumns.contains(a.name.toLowerCase)) + val attrMap = relation.output.map(a => a.name.toLowerCase(Locale.ROOT) -> a).toMap + partitionColumnNames.map { colName => + attrMap.getOrElse(colName.toLowerCase(Locale.ROOT), + throw new AnalysisException(s"Unable to find the column `$colName` " + + s"given [${relation.output.map(_.name).mkString(", ")}]") + ) + } } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala index 6b34638529770..b2f73b7f8d1fc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala @@ -67,6 +67,9 @@ case class HadoopFsRelation( } } + // When data and partition schemas have overlapping columns, the output + // schema respects the order of the data schema for the overlapping columns, and it + // respects the data types of the partition schema. 
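
A concrete, standalone illustration of the rule stated in the comment above, with made-up column names and simplified, always-case-insensitive matching (the real code honors the case-sensitivity setting):

```
import org.apache.spark.sql.types._

object OverlappingSchemaSketch {
  def main(args: Array[String]): Unit = {
    val dataSchema = new StructType()
      .add("id", LongType)
      .add("day", StringType)      // also a partition column, where it is an integer
    val partitionSchema = new StructType()
      .add("day", IntegerType)

    // Overlapping column "day": keep the data schema's position (second field),
    // but take the partition schema's data type (IntegerType).
    val overlapped = partitionSchema.fields.map(f => f.name.toLowerCase -> f).toMap
    val combined = StructType(
      dataSchema.map(f => overlapped.getOrElse(f.name.toLowerCase, f)) ++
        partitionSchema.filterNot(f => overlapped.contains(f.name.toLowerCase)))

    println(combined.simpleString)   // struct<id:bigint,day:int>
  }
}
```
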
val schema: StructType = { StructType(dataSchema.map(f => overlappedPartCols.getOrElse(getColName(f), f)) ++ partitionSchema.filterNot(f => overlappedPartCols.contains(getColName(f)))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala index 78c1e5dae566d..a543eb8351656 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala @@ -17,9 +17,12 @@ package org.apache.spark.sql.execution +import java.io.File + import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.OPTIMIZER_METADATA_ONLY import org.apache.spark.sql.test.SharedSQLContext class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext { @@ -125,4 +128,23 @@ class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext { sql("SELECT COUNT(DISTINCT p) FROM t_1000").collect() } } + + test("Incorrect result caused by the rule OptimizeMetadataOnlyQuery") { + withSQLConf(OPTIMIZER_METADATA_ONLY.key -> "true") { + withTempPath { path => + val tablePath = new File(s"${path.getCanonicalPath}/cOl3=c/cOl1=a/cOl5=e") + Seq(("a", "b", "c", "d", "e")).toDF("cOl1", "cOl2", "cOl3", "cOl4", "cOl5") + .write.json(tablePath.getCanonicalPath) + + val df = spark.read.json(path.getCanonicalPath).select("CoL1", "CoL5", "CoL3").distinct() + checkAnswer(df, Row("a", "e", "c")) + + val localRelation = df.queryExecution.optimizedPlan.collectFirst { + case l: LocalRelation => l + } + assert(localRelation.nonEmpty, "expect to see a LocalRelation") + assert(localRelation.get.output.map(_.name) == Seq("cOl3", "cOl1", "cOl5")) + } + } + } } From f3efbfa4b973cdb8cf992e30540609d0006e0cfe Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 13 Mar 2018 23:04:16 +0100 Subject: [PATCH 347/566] [SPARK-23598][SQL] Make methods in BufferedRowIterator public to avoid runtime error for a large query ## What changes were proposed in this pull request? This PR fixes runtime error regarding a large query when a generated code has split classes. The issue is `append()`, `stopEarly()`, and other methods are not accessible from split classes that are not subclasses of `BufferedRowIterator`. This PR fixes this issue by making them `public`. Before applying the PR, we see the following exception by running the attached program with `CodeGenerator.GENERATED_CLASS_SIZE_THRESHOLD=-1`. ``` test("SPARK-23598") { // When set -1 to CodeGenerator.GENERATED_CLASS_SIZE_THRESHOLD, an exception is thrown val df_pet_age = Seq((8, "bat"), (15, "mouse"), (5, "horse")).toDF("age", "name") df_pet_age.groupBy("name").avg("age").show() } ``` Exception: ``` 19:40:52.591 WARN org.apache.hadoop.util.NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable 19:41:32.319 ERROR org.apache.spark.executor.Executor: Exception in task 0.0 in stage 0.0 (TID 0) java.lang.IllegalAccessError: tried to access method org.apache.spark.sql.execution.BufferedRowIterator.shouldStop()Z from class org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1$agg_NestedClass1 at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1$agg_NestedClass1.agg_doAggregateWithKeys$(generated.java:203) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(generated.java:160) at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11$$anon$1.hasNext(WholeStageCodegenExec.scala:616) at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53) at org.apache.spark.scheduler.Task.run(Task.scala:109) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) ... ``` Generated code (line 195 calles `stopEarly()`). ``` /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage1(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=1 /* 006 */ final class GeneratedIteratorForCodegenStage1 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean agg_initAgg; /* 010 */ private boolean agg_bufIsNull; /* 011 */ private double agg_bufValue; /* 012 */ private boolean agg_bufIsNull1; /* 013 */ private long agg_bufValue1; /* 014 */ private agg_FastHashMap agg_fastHashMap; /* 015 */ private org.apache.spark.unsafe.KVIterator agg_fastHashMapIter; /* 016 */ private org.apache.spark.unsafe.KVIterator agg_mapIter; /* 017 */ private org.apache.spark.sql.execution.UnsafeFixedWidthAggregationMap agg_hashMap; /* 018 */ private org.apache.spark.sql.execution.UnsafeKVExternalSorter agg_sorter; /* 019 */ private scala.collection.Iterator inputadapter_input; /* 020 */ private boolean agg_agg_isNull11; /* 021 */ private boolean agg_agg_isNull25; /* 022 */ private org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder[] agg_mutableStateArray1 = new org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder[2]; /* 023 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] agg_mutableStateArray2 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[2]; /* 024 */ private UnsafeRow[] agg_mutableStateArray = new UnsafeRow[2]; /* 025 */ /* 026 */ public GeneratedIteratorForCodegenStage1(Object[] references) { /* 027 */ this.references = references; /* 028 */ } /* 029 */ /* 030 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 031 */ partitionIndex = index; /* 032 */ this.inputs = inputs; /* 033 */ /* 034 */ agg_fastHashMap = new agg_FastHashMap(((org.apache.spark.sql.execution.aggregate.HashAggregateExec) 
references[0] /* plan */).getTaskMemoryManager(), ((org.apache.spark.sql.execution.aggregate.HashAggregateExec) references[0] /* plan */).getEmptyAggregationBuffer()); /* 035 */ agg_hashMap = ((org.apache.spark.sql.execution.aggregate.HashAggregateExec) references[0] /* plan */).createHashMap(); /* 036 */ inputadapter_input = inputs[0]; /* 037 */ agg_mutableStateArray[0] = new UnsafeRow(1); /* 038 */ agg_mutableStateArray1[0] = new org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder(agg_mutableStateArray[0], 32); /* 039 */ agg_mutableStateArray2[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(agg_mutableStateArray1[0], 1); /* 040 */ agg_mutableStateArray[1] = new UnsafeRow(3); /* 041 */ agg_mutableStateArray1[1] = new org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder(agg_mutableStateArray[1], 32); /* 042 */ agg_mutableStateArray2[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(agg_mutableStateArray1[1], 3); /* 043 */ /* 044 */ } /* 045 */ /* 046 */ public class agg_FastHashMap { /* 047 */ private org.apache.spark.sql.catalyst.expressions.RowBasedKeyValueBatch batch; /* 048 */ private int[] buckets; /* 049 */ private int capacity = 1 << 16; /* 050 */ private double loadFactor = 0.5; /* 051 */ private int numBuckets = (int) (capacity / loadFactor); /* 052 */ private int maxSteps = 2; /* 053 */ private int numRows = 0; /* 054 */ private org.apache.spark.sql.types.StructType keySchema = new org.apache.spark.sql.types.StructType().add(((java.lang.String) references[1] /* keyName */), org.apache.spark.sql.types.DataTypes.StringType); /* 055 */ private org.apache.spark.sql.types.StructType valueSchema = new org.apache.spark.sql.types.StructType().add(((java.lang.String) references[2] /* keyName */), org.apache.spark.sql.types.DataTypes.DoubleType) /* 056 */ .add(((java.lang.String) references[3] /* keyName */), org.apache.spark.sql.types.DataTypes.LongType); /* 057 */ private Object emptyVBase; /* 058 */ private long emptyVOff; /* 059 */ private int emptyVLen; /* 060 */ private boolean isBatchFull = false; /* 061 */ /* 062 */ public agg_FastHashMap( /* 063 */ org.apache.spark.memory.TaskMemoryManager taskMemoryManager, /* 064 */ InternalRow emptyAggregationBuffer) { /* 065 */ batch = org.apache.spark.sql.catalyst.expressions.RowBasedKeyValueBatch /* 066 */ .allocate(keySchema, valueSchema, taskMemoryManager, capacity); /* 067 */ /* 068 */ final UnsafeProjection valueProjection = UnsafeProjection.create(valueSchema); /* 069 */ final byte[] emptyBuffer = valueProjection.apply(emptyAggregationBuffer).getBytes(); /* 070 */ /* 071 */ emptyVBase = emptyBuffer; /* 072 */ emptyVOff = Platform.BYTE_ARRAY_OFFSET; /* 073 */ emptyVLen = emptyBuffer.length; /* 074 */ /* 075 */ buckets = new int[numBuckets]; /* 076 */ java.util.Arrays.fill(buckets, -1); /* 077 */ } /* 078 */ /* 079 */ public org.apache.spark.sql.catalyst.expressions.UnsafeRow findOrInsert(UTF8String agg_key) { /* 080 */ long h = hash(agg_key); /* 081 */ int step = 0; /* 082 */ int idx = (int) h & (numBuckets - 1); /* 083 */ while (step < maxSteps) { /* 084 */ // Return bucket index if it's either an empty slot or already contains the key /* 085 */ if (buckets[idx] == -1) { /* 086 */ if (numRows < capacity && !isBatchFull) { /* 087 */ // creating the unsafe for new entry /* 088 */ UnsafeRow agg_result = new UnsafeRow(1); /* 089 */ org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder agg_holder /* 090 */ = new 
org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder(agg_result, /* 091 */ 32); /* 092 */ org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter agg_rowWriter /* 093 */ = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter( /* 094 */ agg_holder, /* 095 */ 1); /* 096 */ agg_holder.reset(); //TODO: investigate if reset or zeroout are actually needed /* 097 */ agg_rowWriter.zeroOutNullBytes(); /* 098 */ agg_rowWriter.write(0, agg_key); /* 099 */ agg_result.setTotalSize(agg_holder.totalSize()); /* 100 */ Object kbase = agg_result.getBaseObject(); /* 101 */ long koff = agg_result.getBaseOffset(); /* 102 */ int klen = agg_result.getSizeInBytes(); /* 103 */ /* 104 */ UnsafeRow vRow /* 105 */ = batch.appendRow(kbase, koff, klen, emptyVBase, emptyVOff, emptyVLen); /* 106 */ if (vRow == null) { /* 107 */ isBatchFull = true; /* 108 */ } else { /* 109 */ buckets[idx] = numRows++; /* 110 */ } /* 111 */ return vRow; /* 112 */ } else { /* 113 */ // No more space /* 114 */ return null; /* 115 */ } /* 116 */ } else if (equals(idx, agg_key)) { /* 117 */ return batch.getValueRow(buckets[idx]); /* 118 */ } /* 119 */ idx = (idx + 1) & (numBuckets - 1); /* 120 */ step++; /* 121 */ } /* 122 */ // Didn't find it /* 123 */ return null; /* 124 */ } /* 125 */ /* 126 */ private boolean equals(int idx, UTF8String agg_key) { /* 127 */ UnsafeRow row = batch.getKeyRow(buckets[idx]); /* 128 */ return (row.getUTF8String(0).equals(agg_key)); /* 129 */ } /* 130 */ /* 131 */ private long hash(UTF8String agg_key) { /* 132 */ long agg_hash = 0; /* 133 */ /* 134 */ int agg_result = 0; /* 135 */ byte[] agg_bytes = agg_key.getBytes(); /* 136 */ for (int i = 0; i < agg_bytes.length; i++) { /* 137 */ int agg_hash1 = agg_bytes[i]; /* 138 */ agg_result = (agg_result ^ (0x9e3779b9)) + agg_hash1 + (agg_result << 6) + (agg_result >>> 2); /* 139 */ } /* 140 */ /* 141 */ agg_hash = (agg_hash ^ (0x9e3779b9)) + agg_result + (agg_hash << 6) + (agg_hash >>> 2); /* 142 */ /* 143 */ return agg_hash; /* 144 */ } /* 145 */ /* 146 */ public org.apache.spark.unsafe.KVIterator rowIterator() { /* 147 */ return batch.rowIterator(); /* 148 */ } /* 149 */ /* 150 */ public void close() { /* 151 */ batch.close(); /* 152 */ } /* 153 */ /* 154 */ } /* 155 */ /* 156 */ protected void processNext() throws java.io.IOException { /* 157 */ if (!agg_initAgg) { /* 158 */ agg_initAgg = true; /* 159 */ long wholestagecodegen_beforeAgg = System.nanoTime(); /* 160 */ agg_nestedClassInstance1.agg_doAggregateWithKeys(); /* 161 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[8] /* aggTime */).add((System.nanoTime() - wholestagecodegen_beforeAgg) / 1000000); /* 162 */ } /* 163 */ /* 164 */ // output the result /* 165 */ /* 166 */ while (agg_fastHashMapIter.next()) { /* 167 */ UnsafeRow agg_aggKey = (UnsafeRow) agg_fastHashMapIter.getKey(); /* 168 */ UnsafeRow agg_aggBuffer = (UnsafeRow) agg_fastHashMapIter.getValue(); /* 169 */ wholestagecodegen_nestedClassInstance.agg_doAggregateWithKeysOutput(agg_aggKey, agg_aggBuffer); /* 170 */ /* 171 */ if (shouldStop()) return; /* 172 */ } /* 173 */ agg_fastHashMap.close(); /* 174 */ /* 175 */ while (agg_mapIter.next()) { /* 176 */ UnsafeRow agg_aggKey = (UnsafeRow) agg_mapIter.getKey(); /* 177 */ UnsafeRow agg_aggBuffer = (UnsafeRow) agg_mapIter.getValue(); /* 178 */ wholestagecodegen_nestedClassInstance.agg_doAggregateWithKeysOutput(agg_aggKey, agg_aggBuffer); /* 179 */ /* 180 */ if (shouldStop()) return; /* 181 */ } /* 182 */ /* 183 */ agg_mapIter.close(); /* 184 */ if 
(agg_sorter == null) { /* 185 */ agg_hashMap.free(); /* 186 */ } /* 187 */ } /* 188 */ /* 189 */ private wholestagecodegen_NestedClass wholestagecodegen_nestedClassInstance = new wholestagecodegen_NestedClass(); /* 190 */ private agg_NestedClass1 agg_nestedClassInstance1 = new agg_NestedClass1(); /* 191 */ private agg_NestedClass agg_nestedClassInstance = new agg_NestedClass(); /* 192 */ /* 193 */ private class agg_NestedClass1 { /* 194 */ private void agg_doAggregateWithKeys() throws java.io.IOException { /* 195 */ while (inputadapter_input.hasNext() && !stopEarly()) { /* 196 */ InternalRow inputadapter_row = (InternalRow) inputadapter_input.next(); /* 197 */ int inputadapter_value = inputadapter_row.getInt(0); /* 198 */ boolean inputadapter_isNull1 = inputadapter_row.isNullAt(1); /* 199 */ UTF8String inputadapter_value1 = inputadapter_isNull1 ? /* 200 */ null : (inputadapter_row.getUTF8String(1)); /* 201 */ /* 202 */ agg_nestedClassInstance.agg_doConsume(inputadapter_row, inputadapter_value, inputadapter_value1, inputadapter_isNull1); /* 203 */ if (shouldStop()) return; /* 204 */ } /* 205 */ /* 206 */ agg_fastHashMapIter = agg_fastHashMap.rowIterator(); /* 207 */ agg_mapIter = ((org.apache.spark.sql.execution.aggregate.HashAggregateExec) references[0] /* plan */).finishAggregate(agg_hashMap, agg_sorter, ((org.apache.spark.sql.execution.metric.SQLMetric) references[4] /* peakMemory */), ((org.apache.spark.sql.execution.metric.SQLMetric) references[5] /* spillSize */), ((org.apache.spark.sql.execution.metric.SQLMetric) references[6] /* avgHashProbe */)); /* 208 */ /* 209 */ } /* 210 */ /* 211 */ } /* 212 */ /* 213 */ private class wholestagecodegen_NestedClass { /* 214 */ private void agg_doAggregateWithKeysOutput(UnsafeRow agg_keyTerm, UnsafeRow agg_bufferTerm) /* 215 */ throws java.io.IOException { /* 216 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[7] /* numOutputRows */).add(1); /* 217 */ /* 218 */ boolean agg_isNull35 = agg_keyTerm.isNullAt(0); /* 219 */ UTF8String agg_value37 = agg_isNull35 ? /* 220 */ null : (agg_keyTerm.getUTF8String(0)); /* 221 */ boolean agg_isNull36 = agg_bufferTerm.isNullAt(0); /* 222 */ double agg_value38 = agg_isNull36 ? /* 223 */ -1.0 : (agg_bufferTerm.getDouble(0)); /* 224 */ boolean agg_isNull37 = agg_bufferTerm.isNullAt(1); /* 225 */ long agg_value39 = agg_isNull37 ? 
/* 226 */ -1L : (agg_bufferTerm.getLong(1)); /* 227 */ /* 228 */ agg_mutableStateArray1[1].reset(); /* 229 */ /* 230 */ agg_mutableStateArray2[1].zeroOutNullBytes(); /* 231 */ /* 232 */ if (agg_isNull35) { /* 233 */ agg_mutableStateArray2[1].setNullAt(0); /* 234 */ } else { /* 235 */ agg_mutableStateArray2[1].write(0, agg_value37); /* 236 */ } /* 237 */ /* 238 */ if (agg_isNull36) { /* 239 */ agg_mutableStateArray2[1].setNullAt(1); /* 240 */ } else { /* 241 */ agg_mutableStateArray2[1].write(1, agg_value38); /* 242 */ } /* 243 */ /* 244 */ if (agg_isNull37) { /* 245 */ agg_mutableStateArray2[1].setNullAt(2); /* 246 */ } else { /* 247 */ agg_mutableStateArray2[1].write(2, agg_value39); /* 248 */ } /* 249 */ agg_mutableStateArray[1].setTotalSize(agg_mutableStateArray1[1].totalSize()); /* 250 */ append(agg_mutableStateArray[1]); /* 251 */ /* 252 */ } /* 253 */ /* 254 */ } /* 255 */ /* 256 */ private class agg_NestedClass { /* 257 */ private void agg_doConsume(InternalRow inputadapter_row, int agg_expr_0, UTF8String agg_expr_1, boolean agg_exprIsNull_1) throws java.io.IOException { /* 258 */ UnsafeRow agg_unsafeRowAggBuffer = null; /* 259 */ UnsafeRow agg_fastAggBuffer = null; /* 260 */ /* 261 */ if (true) { /* 262 */ if (!agg_exprIsNull_1) { /* 263 */ agg_fastAggBuffer = agg_fastHashMap.findOrInsert( /* 264 */ agg_expr_1); /* 265 */ } /* 266 */ } /* 267 */ // Cannot find the key in fast hash map, try regular hash map. /* 268 */ if (agg_fastAggBuffer == null) { /* 269 */ // generate grouping key /* 270 */ agg_mutableStateArray1[0].reset(); /* 271 */ /* 272 */ agg_mutableStateArray2[0].zeroOutNullBytes(); /* 273 */ /* 274 */ if (agg_exprIsNull_1) { /* 275 */ agg_mutableStateArray2[0].setNullAt(0); /* 276 */ } else { /* 277 */ agg_mutableStateArray2[0].write(0, agg_expr_1); /* 278 */ } /* 279 */ agg_mutableStateArray[0].setTotalSize(agg_mutableStateArray1[0].totalSize()); /* 280 */ int agg_value7 = 42; /* 281 */ /* 282 */ if (!agg_exprIsNull_1) { /* 283 */ agg_value7 = org.apache.spark.unsafe.hash.Murmur3_x86_32.hashUnsafeBytes(agg_expr_1.getBaseObject(), agg_expr_1.getBaseOffset(), agg_expr_1.numBytes(), agg_value7); /* 284 */ } /* 285 */ if (true) { /* 286 */ // try to get the buffer from hash map /* 287 */ agg_unsafeRowAggBuffer = /* 288 */ agg_hashMap.getAggregationBufferFromUnsafeRow(agg_mutableStateArray[0], agg_value7); /* 289 */ } /* 290 */ // Can't allocate buffer from the hash map. Spill the map and fallback to sort-based /* 291 */ // aggregation after processing all input rows. /* 292 */ if (agg_unsafeRowAggBuffer == null) { /* 293 */ if (agg_sorter == null) { /* 294 */ agg_sorter = agg_hashMap.destructAndCreateExternalSorter(); /* 295 */ } else { /* 296 */ agg_sorter.merge(agg_hashMap.destructAndCreateExternalSorter()); /* 297 */ } /* 298 */ /* 299 */ // the hash map had be spilled, it should have enough memory now, /* 300 */ // try to allocate buffer again. 
/* 301 */ agg_unsafeRowAggBuffer = agg_hashMap.getAggregationBufferFromUnsafeRow( /* 302 */ agg_mutableStateArray[0], agg_value7); /* 303 */ if (agg_unsafeRowAggBuffer == null) { /* 304 */ // failed to allocate the first page /* 305 */ throw new OutOfMemoryError("No enough memory for aggregation"); /* 306 */ } /* 307 */ } /* 308 */ /* 309 */ } /* 310 */ /* 311 */ if (agg_fastAggBuffer != null) { /* 312 */ // common sub-expressions /* 313 */ boolean agg_isNull21 = false; /* 314 */ long agg_value23 = -1L; /* 315 */ if (!false) { /* 316 */ agg_value23 = (long) agg_expr_0; /* 317 */ } /* 318 */ // evaluate aggregate function /* 319 */ boolean agg_isNull23 = true; /* 320 */ double agg_value25 = -1.0; /* 321 */ /* 322 */ boolean agg_isNull24 = agg_fastAggBuffer.isNullAt(0); /* 323 */ double agg_value26 = agg_isNull24 ? /* 324 */ -1.0 : (agg_fastAggBuffer.getDouble(0)); /* 325 */ if (!agg_isNull24) { /* 326 */ agg_agg_isNull25 = true; /* 327 */ double agg_value27 = -1.0; /* 328 */ do { /* 329 */ boolean agg_isNull26 = agg_isNull21; /* 330 */ double agg_value28 = -1.0; /* 331 */ if (!agg_isNull21) { /* 332 */ agg_value28 = (double) agg_value23; /* 333 */ } /* 334 */ if (!agg_isNull26) { /* 335 */ agg_agg_isNull25 = false; /* 336 */ agg_value27 = agg_value28; /* 337 */ continue; /* 338 */ } /* 339 */ /* 340 */ boolean agg_isNull27 = false; /* 341 */ double agg_value29 = -1.0; /* 342 */ if (!false) { /* 343 */ agg_value29 = (double) 0; /* 344 */ } /* 345 */ if (!agg_isNull27) { /* 346 */ agg_agg_isNull25 = false; /* 347 */ agg_value27 = agg_value29; /* 348 */ continue; /* 349 */ } /* 350 */ /* 351 */ } while (false); /* 352 */ /* 353 */ agg_isNull23 = false; // resultCode could change nullability. /* 354 */ agg_value25 = agg_value26 + agg_value27; /* 355 */ /* 356 */ } /* 357 */ boolean agg_isNull29 = false; /* 358 */ long agg_value31 = -1L; /* 359 */ if (!false && agg_isNull21) { /* 360 */ boolean agg_isNull31 = agg_fastAggBuffer.isNullAt(1); /* 361 */ long agg_value33 = agg_isNull31 ? /* 362 */ -1L : (agg_fastAggBuffer.getLong(1)); /* 363 */ agg_isNull29 = agg_isNull31; /* 364 */ agg_value31 = agg_value33; /* 365 */ } else { /* 366 */ boolean agg_isNull32 = true; /* 367 */ long agg_value34 = -1L; /* 368 */ /* 369 */ boolean agg_isNull33 = agg_fastAggBuffer.isNullAt(1); /* 370 */ long agg_value35 = agg_isNull33 ? /* 371 */ -1L : (agg_fastAggBuffer.getLong(1)); /* 372 */ if (!agg_isNull33) { /* 373 */ agg_isNull32 = false; // resultCode could change nullability. /* 374 */ agg_value34 = agg_value35 + 1L; /* 375 */ /* 376 */ } /* 377 */ agg_isNull29 = agg_isNull32; /* 378 */ agg_value31 = agg_value34; /* 379 */ } /* 380 */ // update fast row /* 381 */ if (!agg_isNull23) { /* 382 */ agg_fastAggBuffer.setDouble(0, agg_value25); /* 383 */ } else { /* 384 */ agg_fastAggBuffer.setNullAt(0); /* 385 */ } /* 386 */ /* 387 */ if (!agg_isNull29) { /* 388 */ agg_fastAggBuffer.setLong(1, agg_value31); /* 389 */ } else { /* 390 */ agg_fastAggBuffer.setNullAt(1); /* 391 */ } /* 392 */ } else { /* 393 */ // common sub-expressions /* 394 */ boolean agg_isNull7 = false; /* 395 */ long agg_value9 = -1L; /* 396 */ if (!false) { /* 397 */ agg_value9 = (long) agg_expr_0; /* 398 */ } /* 399 */ // evaluate aggregate function /* 400 */ boolean agg_isNull9 = true; /* 401 */ double agg_value11 = -1.0; /* 402 */ /* 403 */ boolean agg_isNull10 = agg_unsafeRowAggBuffer.isNullAt(0); /* 404 */ double agg_value12 = agg_isNull10 ? 
/* 405 */ -1.0 : (agg_unsafeRowAggBuffer.getDouble(0)); /* 406 */ if (!agg_isNull10) { /* 407 */ agg_agg_isNull11 = true; /* 408 */ double agg_value13 = -1.0; /* 409 */ do { /* 410 */ boolean agg_isNull12 = agg_isNull7; /* 411 */ double agg_value14 = -1.0; /* 412 */ if (!agg_isNull7) { /* 413 */ agg_value14 = (double) agg_value9; /* 414 */ } /* 415 */ if (!agg_isNull12) { /* 416 */ agg_agg_isNull11 = false; /* 417 */ agg_value13 = agg_value14; /* 418 */ continue; /* 419 */ } /* 420 */ /* 421 */ boolean agg_isNull13 = false; /* 422 */ double agg_value15 = -1.0; /* 423 */ if (!false) { /* 424 */ agg_value15 = (double) 0; /* 425 */ } /* 426 */ if (!agg_isNull13) { /* 427 */ agg_agg_isNull11 = false; /* 428 */ agg_value13 = agg_value15; /* 429 */ continue; /* 430 */ } /* 431 */ /* 432 */ } while (false); /* 433 */ /* 434 */ agg_isNull9 = false; // resultCode could change nullability. /* 435 */ agg_value11 = agg_value12 + agg_value13; /* 436 */ /* 437 */ } /* 438 */ boolean agg_isNull15 = false; /* 439 */ long agg_value17 = -1L; /* 440 */ if (!false && agg_isNull7) { /* 441 */ boolean agg_isNull17 = agg_unsafeRowAggBuffer.isNullAt(1); /* 442 */ long agg_value19 = agg_isNull17 ? /* 443 */ -1L : (agg_unsafeRowAggBuffer.getLong(1)); /* 444 */ agg_isNull15 = agg_isNull17; /* 445 */ agg_value17 = agg_value19; /* 446 */ } else { /* 447 */ boolean agg_isNull18 = true; /* 448 */ long agg_value20 = -1L; /* 449 */ /* 450 */ boolean agg_isNull19 = agg_unsafeRowAggBuffer.isNullAt(1); /* 451 */ long agg_value21 = agg_isNull19 ? /* 452 */ -1L : (agg_unsafeRowAggBuffer.getLong(1)); /* 453 */ if (!agg_isNull19) { /* 454 */ agg_isNull18 = false; // resultCode could change nullability. /* 455 */ agg_value20 = agg_value21 + 1L; /* 456 */ /* 457 */ } /* 458 */ agg_isNull15 = agg_isNull18; /* 459 */ agg_value17 = agg_value20; /* 460 */ } /* 461 */ // update unsafe row buffer /* 462 */ if (!agg_isNull9) { /* 463 */ agg_unsafeRowAggBuffer.setDouble(0, agg_value11); /* 464 */ } else { /* 465 */ agg_unsafeRowAggBuffer.setNullAt(0); /* 466 */ } /* 467 */ /* 468 */ if (!agg_isNull15) { /* 469 */ agg_unsafeRowAggBuffer.setLong(1, agg_value17); /* 470 */ } else { /* 471 */ agg_unsafeRowAggBuffer.setNullAt(1); /* 472 */ } /* 473 */ /* 474 */ } /* 475 */ /* 476 */ } /* 477 */ /* 478 */ } /* 479 */ /* 480 */ } ``` ## How was this patch tested? Added UT into `WholeStageCodegenSuite` Author: Kazuaki Ishizaki Closes #20779 from kiszk/SPARK-23598. (cherry picked from commit 1098933b0ac5cdb18101d3aebefa773c2ce05a50) Signed-off-by: Herman van Hovell --- .../spark/sql/execution/BufferedRowIterator.java | 12 ++++++++---- .../spark/sql/execution/WholeStageCodegenSuite.scala | 12 ++++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/BufferedRowIterator.java b/sql/core/src/main/java/org/apache/spark/sql/execution/BufferedRowIterator.java index 730a4ae8d5605..74c9c05992719 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/BufferedRowIterator.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/BufferedRowIterator.java @@ -62,10 +62,14 @@ public long durationMs() { */ public abstract void init(int index, Iterator[] iters); + /* + * Attributes of the following four methods are public. Thus, they can be also accessed from + * methods in inner classes. See SPARK-23598 + */ /** * Append a row to currentRows. 
*/ - protected void append(InternalRow row) { + public void append(InternalRow row) { currentRows.add(row); } @@ -75,7 +79,7 @@ protected void append(InternalRow row) { * If it returns true, the caller should exit the loop that [[InputAdapter]] generates. * This interface is mainly used to limit the number of input rows. */ - protected boolean stopEarly() { + public boolean stopEarly() { return false; } @@ -84,14 +88,14 @@ protected boolean stopEarly() { * * If it returns true, the caller should exit the loop (return from processNext()). */ - protected boolean shouldStop() { + public boolean shouldStop() { return !currentRows.isEmpty(); } /** * Increase the peak execution memory for current task. */ - protected void incPeakExecutionMemory(long size) { + public void incPeakExecutionMemory(long size) { TaskContext.get().taskMetrics().incPeakExecutionMemory(size); } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index 0fb9dd2017a09..4b40e4ef7571c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -32,6 +32,8 @@ import org.apache.spark.sql.types.{IntegerType, StringType, StructType} class WholeStageCodegenSuite extends QueryTest with SharedSQLContext { + import testImplicits._ + test("range/filter should be combined") { val df = spark.range(10).filter("id = 1").selectExpr("id + 1") val plan = df.queryExecution.executedPlan @@ -307,4 +309,14 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext { // a different query can result in codegen cache miss, that's by design } } + + test("SPARK-23598: Codegen working for lots of aggregation operations without runtime errors") { + withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + var df = Seq((8, "bat"), (15, "mouse"), (5, "horse")).toDF("age", "name") + for (i <- 0 until 70) { + df = df.groupBy("name").agg(avg("age").alias("age")) + } + assert(df.limit(1).collect() === Array(Row("bat", 8.0))) + } + } } From 0663b61193b37094b9d00c7f2cbb0268ad946e25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cattilapiros=E2=80=9D?= Date: Wed, 14 Mar 2018 18:36:31 -0700 Subject: [PATCH 348/566] [SPARK-22915][MLLIB] Streaming tests for spark.ml.feature, from N to Z MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What changes were proposed in this pull request? Adds structured streaming tests using testTransformer for these suites: - NGramSuite - NormalizerSuite - OneHotEncoderEstimatorSuite - OneHotEncoderSuite - PCASuite - PolynomialExpansionSuite - QuantileDiscretizerSuite - RFormulaSuite - SQLTransformerSuite - StandardScalerSuite - StopWordsRemoverSuite - StringIndexerSuite - TokenizerSuite - RegexTokenizerSuite - VectorAssemblerSuite - VectorIndexerSuite - VectorSizeHintSuite - VectorSlicerSuite - Word2VecSuite # How was this patch tested? They are unit test. Author: “attilapiros” Closes #20686 from attilapiros/SPARK-22915. (cherry picked from commit 279b3db8970809104c30941254e57e3d62da5041) Signed-off-by: Joseph K. 
Bradley --- .../apache/spark/ml/feature/NGramSuite.scala | 23 +- .../spark/ml/feature/NormalizerSuite.scala | 57 ++--- .../feature/OneHotEncoderEstimatorSuite.scala | 193 ++++++++--------- .../spark/ml/feature/OneHotEncoderSuite.scala | 124 ++++++----- .../apache/spark/ml/feature/PCASuite.scala | 14 +- .../ml/feature/PolynomialExpansionSuite.scala | 62 +++--- .../ml/feature/QuantileDiscretizerSuite.scala | 198 +++++++++-------- .../spark/ml/feature/RFormulaSuite.scala | 158 +++++++------- .../ml/feature/SQLTransformerSuite.scala | 35 +-- .../ml/feature/StandardScalerSuite.scala | 33 +-- .../ml/feature/StopWordsRemoverSuite.scala | 37 ++-- .../spark/ml/feature/StringIndexerSuite.scala | 204 +++++++++--------- .../spark/ml/feature/TokenizerSuite.scala | 30 +-- .../spark/ml/feature/VectorIndexerSuite.scala | 183 +++++++++------- .../ml/feature/VectorSizeHintSuite.scala | 88 +++++--- .../spark/ml/feature/VectorSlicerSuite.scala | 27 +-- .../spark/ml/feature/Word2VecSuite.scala | 28 +-- .../org/apache/spark/ml/util/MLTest.scala | 33 ++- 18 files changed, 809 insertions(+), 718 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala index d4975c0b4e20e..e5956ee9942aa 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala @@ -19,17 +19,15 @@ package org.apache.spark.ml.feature import scala.beans.BeanInfo -import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.util.DefaultReadWriteTest -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} +import org.apache.spark.sql.{DataFrame, Row} + @BeanInfo case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String]) -class NGramSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class NGramSuite extends MLTest with DefaultReadWriteTest { - import org.apache.spark.ml.feature.NGramSuite._ import testImplicits._ test("default behavior yields bigram features") { @@ -83,16 +81,11 @@ class NGramSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRe .setN(3) testDefaultReadWrite(t) } -} - -object NGramSuite extends SparkFunSuite { - def testNGram(t: NGram, dataset: Dataset[_]): Unit = { - t.transform(dataset) - .select("nGrams", "wantedNGrams") - .collect() - .foreach { case Row(actualNGrams, wantedNGrams) => + def testNGram(t: NGram, dataFrame: DataFrame): Unit = { + testTransformer[(Seq[String], Seq[String])](dataFrame, t, "nGrams", "wantedNGrams") { + case Row(actualNGrams : Seq[String], wantedNGrams: Seq[String]) => assert(actualNGrams === wantedNGrams) - } + } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/NormalizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/NormalizerSuite.scala index c75027fb4553d..eff57f1223af4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/NormalizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/NormalizerSuite.scala @@ -17,21 +17,17 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} -import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.ml.util.TestingUtils._ -import 
org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} -class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class NormalizerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @transient var data: Array[Vector] = _ - @transient var dataFrame: DataFrame = _ - @transient var normalizer: Normalizer = _ @transient var l1Normalized: Array[Vector] = _ @transient var l2Normalized: Array[Vector] = _ @@ -62,49 +58,40 @@ class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa Vectors.dense(0.897906166, 0.113419726, 0.42532397), Vectors.sparse(3, Seq()) ) - - dataFrame = data.map(NormalizerSuite.FeatureData).toSeq.toDF() - normalizer = new Normalizer() - .setInputCol("features") - .setOutputCol("normalized_features") - } - - def collectResult(result: DataFrame): Array[Vector] = { - result.select("normalized_features").collect().map { - case Row(features: Vector) => features - } } - def assertTypeOfVector(lhs: Array[Vector], rhs: Array[Vector]): Unit = { - assert((lhs, rhs).zipped.forall { + def assertTypeOfVector(lhs: Vector, rhs: Vector): Unit = { + assert((lhs, rhs) match { case (v1: DenseVector, v2: DenseVector) => true case (v1: SparseVector, v2: SparseVector) => true case _ => false }, "The vector type should be preserved after normalization.") } - def assertValues(lhs: Array[Vector], rhs: Array[Vector]): Unit = { - assert((lhs, rhs).zipped.forall { (vector1, vector2) => - vector1 ~== vector2 absTol 1E-5 - }, "The vector value is not correct after normalization.") + def assertValues(lhs: Vector, rhs: Vector): Unit = { + assert(lhs ~== rhs absTol 1E-5, "The vector value is not correct after normalization.") } test("Normalization with default parameter") { - val result = collectResult(normalizer.transform(dataFrame)) - - assertTypeOfVector(data, result) + val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized") + val dataFrame: DataFrame = data.zip(l2Normalized).seq.toDF("features", "expected") - assertValues(result, l2Normalized) + testTransformer[(Vector, Vector)](dataFrame, normalizer, "features", "normalized", "expected") { + case Row(features: Vector, normalized: Vector, expected: Vector) => + assertTypeOfVector(normalized, features) + assertValues(normalized, expected) + } } test("Normalization with setter") { - normalizer.setP(1) + val dataFrame: DataFrame = data.zip(l1Normalized).seq.toDF("features", "expected") + val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized").setP(1) - val result = collectResult(normalizer.transform(dataFrame)) - - assertTypeOfVector(data, result) - - assertValues(result, l1Normalized) + testTransformer[(Vector, Vector)](dataFrame, normalizer, "features", "normalized", "expected") { + case Row(features: Vector, normalized: Vector, expected: Vector) => + assertTypeOfVector(normalized, features) + assertValues(normalized, expected) + } } test("read/write") { @@ -115,7 +102,3 @@ class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa testDefaultReadWrite(t) } } - -private object NormalizerSuite { - case class FeatureData(features: Vector) -} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderEstimatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderEstimatorSuite.scala index 1d3f845586426..d549e13262273 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderEstimatorSuite.scala +++ 
b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderEstimatorSuite.scala @@ -17,18 +17,16 @@ package org.apache.spark.ml.feature -import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.DefaultReadWriteTest -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} +import org.apache.spark.sql.{Encoder, Row} +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.functions.col import org.apache.spark.sql.types._ -class OneHotEncoderEstimatorSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class OneHotEncoderEstimatorSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -57,13 +55,10 @@ class OneHotEncoderEstimatorSuite assert(encoder.getDropLast === true) encoder.setDropLast(false) assert(encoder.getDropLast === false) - val model = encoder.fit(df) - val encoded = model.transform(df) - encoded.select("output", "expected").rdd.map { r => - (r.getAs[Vector](0), r.getAs[Vector](1)) - }.collect().foreach { case (vec1, vec2) => - assert(vec1 === vec2) + testTransformer[(Double, Vector)](df, model, "output", "expected") { + case Row(output: Vector, expected: Vector) => + assert(output === expected) } } @@ -87,11 +82,9 @@ class OneHotEncoderEstimatorSuite .setOutputCols(Array("output")) val model = encoder.fit(df) - val encoded = model.transform(df) - encoded.select("output", "expected").rdd.map { r => - (r.getAs[Vector](0), r.getAs[Vector](1)) - }.collect().foreach { case (vec1, vec2) => - assert(vec1 === vec2) + testTransformer[(Double, Vector)](df, model, "output", "expected") { + case Row(output: Vector, expected: Vector) => + assert(output === expected) } } @@ -103,11 +96,12 @@ class OneHotEncoderEstimatorSuite .setInputCols(Array("size")) .setOutputCols(Array("encoded")) val model = encoder.fit(df) - val output = model.transform(df) - val group = AttributeGroup.fromStructField(output.schema("encoded")) - assert(group.size === 2) - assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("small").withIndex(0)) - assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("medium").withIndex(1)) + testTransformerByGlobalCheckFunc[(Double)](df, model, "encoded") { rows => + val group = AttributeGroup.fromStructField(rows.head.schema("encoded")) + assert(group.size === 2) + assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("small").withIndex(0)) + assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("medium").withIndex(1)) + } } test("input column without ML attribute") { @@ -116,11 +110,12 @@ class OneHotEncoderEstimatorSuite .setInputCols(Array("index")) .setOutputCols(Array("encoded")) val model = encoder.fit(df) - val output = model.transform(df) - val group = AttributeGroup.fromStructField(output.schema("encoded")) - assert(group.size === 2) - assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("0").withIndex(0)) - assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("1").withIndex(1)) + testTransformerByGlobalCheckFunc[(Double)](df, model, "encoded") { rows => + val group = AttributeGroup.fromStructField(rows.head.schema("encoded")) + assert(group.size === 2) + assert(group.getAttr(0) 
=== BinaryAttribute.defaultAttr.withName("0").withIndex(0)) + assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("1").withIndex(1)) + } } test("read/write") { @@ -151,29 +146,30 @@ class OneHotEncoderEstimatorSuite val df = spark.createDataFrame(sc.parallelize(data), schema) - val dfWithTypes = df - .withColumn("shortInput", df("input").cast(ShortType)) - .withColumn("longInput", df("input").cast(LongType)) - .withColumn("intInput", df("input").cast(IntegerType)) - .withColumn("floatInput", df("input").cast(FloatType)) - .withColumn("decimalInput", df("input").cast(DecimalType(10, 0))) - - val cols = Array("input", "shortInput", "longInput", "intInput", - "floatInput", "decimalInput") - for (col <- cols) { - val encoder = new OneHotEncoderEstimator() - .setInputCols(Array(col)) + class NumericTypeWithEncoder[A](val numericType: NumericType) + (implicit val encoder: Encoder[(A, Vector)]) + + val types = Seq( + new NumericTypeWithEncoder[Short](ShortType), + new NumericTypeWithEncoder[Long](LongType), + new NumericTypeWithEncoder[Int](IntegerType), + new NumericTypeWithEncoder[Float](FloatType), + new NumericTypeWithEncoder[Byte](ByteType), + new NumericTypeWithEncoder[Double](DoubleType), + new NumericTypeWithEncoder[Decimal](DecimalType(10, 0))(ExpressionEncoder())) + + for (t <- types) { + val dfWithTypes = df.select(col("input").cast(t.numericType), col("expected")) + val estimator = new OneHotEncoderEstimator() + .setInputCols(Array("input")) .setOutputCols(Array("output")) .setDropLast(false) - val model = encoder.fit(dfWithTypes) - val encoded = model.transform(dfWithTypes) - - encoded.select("output", "expected").rdd.map { r => - (r.getAs[Vector](0), r.getAs[Vector](1)) - }.collect().foreach { case (vec1, vec2) => - assert(vec1 === vec2) - } + val model = estimator.fit(dfWithTypes) + testTransformer(dfWithTypes, model, "output", "expected") { + case Row(output: Vector, expected: Vector) => + assert(output === expected) + }(t.encoder) } } @@ -202,12 +198,16 @@ class OneHotEncoderEstimatorSuite assert(encoder.getDropLast === false) val model = encoder.fit(df) - val encoded = model.transform(df) - encoded.select("output1", "expected1", "output2", "expected2").rdd.map { r => - (r.getAs[Vector](0), r.getAs[Vector](1), r.getAs[Vector](2), r.getAs[Vector](3)) - }.collect().foreach { case (vec1, vec2, vec3, vec4) => - assert(vec1 === vec2) - assert(vec3 === vec4) + testTransformer[(Double, Vector, Double, Vector)]( + df, + model, + "output1", + "output2", + "expected1", + "expected2") { + case Row(output1: Vector, output2: Vector, expected1: Vector, expected2: Vector) => + assert(output1 === expected1) + assert(output2 === expected2) } } @@ -233,12 +233,16 @@ class OneHotEncoderEstimatorSuite .setOutputCols(Array("output1", "output2")) val model = encoder.fit(df) - val encoded = model.transform(df) - encoded.select("output1", "expected1", "output2", "expected2").rdd.map { r => - (r.getAs[Vector](0), r.getAs[Vector](1), r.getAs[Vector](2), r.getAs[Vector](3)) - }.collect().foreach { case (vec1, vec2, vec3, vec4) => - assert(vec1 === vec2) - assert(vec3 === vec4) + testTransformer[(Double, Vector, Double, Vector)]( + df, + model, + "output1", + "output2", + "expected1", + "expected2") { + case Row(output1: Vector, output2: Vector, expected1: Vector, expected2: Vector) => + assert(output1 === expected1) + assert(output2 === expected2) } } @@ -253,10 +257,12 @@ class OneHotEncoderEstimatorSuite .setOutputCols(Array("encoded")) val model = encoder.fit(trainingDF) - val err = 
intercept[SparkException] { - model.transform(testDF).show - } - err.getMessage.contains("Unseen value: 3.0. To handle unseen values") + testTransformerByInterceptingException[(Int, Int)]( + testDF, + model, + expectedMessagePart = "Unseen value: 3.0. To handle unseen values", + firstResultCol = "encoded") + } test("Can't transform on negative input") { @@ -268,10 +274,11 @@ class OneHotEncoderEstimatorSuite .setOutputCols(Array("encoded")) val model = encoder.fit(trainingDF) - val err = intercept[SparkException] { - model.transform(testDF).collect() - } - err.getMessage.contains("Negative value: -1.0. Input can't be negative") + testTransformerByInterceptingException[(Int, Int)]( + testDF, + model, + expectedMessagePart = "Negative value: -1.0. Input can't be negative", + firstResultCol = "encoded") } test("Keep on invalid values: dropLast = false") { @@ -295,11 +302,9 @@ class OneHotEncoderEstimatorSuite .setDropLast(false) val model = encoder.fit(trainingDF) - val encoded = model.transform(testDF) - encoded.select("output", "expected").rdd.map { r => - (r.getAs[Vector](0), r.getAs[Vector](1)) - }.collect().foreach { case (vec1, vec2) => - assert(vec1 === vec2) + testTransformer[(Double, Vector)](testDF, model, "output", "expected") { + case Row(output: Vector, expected: Vector) => + assert(output === expected) } } @@ -324,11 +329,9 @@ class OneHotEncoderEstimatorSuite .setDropLast(true) val model = encoder.fit(trainingDF) - val encoded = model.transform(testDF) - encoded.select("output", "expected").rdd.map { r => - (r.getAs[Vector](0), r.getAs[Vector](1)) - }.collect().foreach { case (vec1, vec2) => - assert(vec1 === vec2) + testTransformer[(Double, Vector)](testDF, model, "output", "expected") { + case Row(output: Vector, expected: Vector) => + assert(output === expected) } } @@ -355,19 +358,15 @@ class OneHotEncoderEstimatorSuite val model = encoder.fit(df) model.setDropLast(false) - val encoded1 = model.transform(df) - encoded1.select("output", "expected1").rdd.map { r => - (r.getAs[Vector](0), r.getAs[Vector](1)) - }.collect().foreach { case (vec1, vec2) => - assert(vec1 === vec2) + testTransformer[(Double, Vector, Vector)](df, model, "output", "expected1") { + case Row(output: Vector, expected1: Vector) => + assert(output === expected1) } model.setDropLast(true) - val encoded2 = model.transform(df) - encoded2.select("output", "expected2").rdd.map { r => - (r.getAs[Vector](0), r.getAs[Vector](1)) - }.collect().foreach { case (vec1, vec2) => - assert(vec1 === vec2) + testTransformer[(Double, Vector, Vector)](df, model, "output", "expected2") { + case Row(output: Vector, expected2: Vector) => + assert(output === expected2) } } @@ -392,13 +391,14 @@ class OneHotEncoderEstimatorSuite val model = encoder.fit(trainingDF) model.setHandleInvalid("error") - val err = intercept[SparkException] { - model.transform(testDF).collect() - } - err.getMessage.contains("Unseen value: 3.0. To handle unseen values") + testTransformerByInterceptingException[(Double, Vector)]( + testDF, + model, + expectedMessagePart = "Unseen value: 3.0. 
To handle unseen values", + firstResultCol = "output") model.setHandleInvalid("keep") - model.transform(testDF).collect() + testTransformerByGlobalCheckFunc[(Double, Vector)](testDF, model, "output") { _ => } } test("Transforming on mismatched attributes") { @@ -413,9 +413,10 @@ class OneHotEncoderEstimatorSuite val testAttr = NominalAttribute.defaultAttr.withValues("tiny", "small", "medium", "large") val testDF = Seq(0.0, 1.0, 2.0, 3.0).map(Tuple1.apply).toDF("size") .select(col("size").as("size", testAttr.toMetadata())) - val err = intercept[Exception] { - model.transform(testDF).collect() - } - err.getMessage.contains("OneHotEncoderModel expected 2 categorical values") + testTransformerByInterceptingException[(Double)]( + testDF, + model, + expectedMessagePart = "OneHotEncoderModel expected 2 categorical values", + firstResultCol = "encoded") } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala index c44c6813a94be..41b32b2ffa096 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala @@ -17,18 +17,18 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute} import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.DefaultReadWriteTest -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.DataFrame +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} +import org.apache.spark.sql.{DataFrame, Encoder, Row} +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.functions.col import org.apache.spark.sql.types._ class OneHotEncoderSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -54,16 +54,19 @@ class OneHotEncoderSuite assert(encoder.getDropLast === true) encoder.setDropLast(false) assert(encoder.getDropLast === false) - val encoded = encoder.transform(transformed) - - val output = encoded.select("id", "labelVec").rdd.map { r => - val vec = r.getAs[Vector](1) - (r.getInt(0), vec(0), vec(1), vec(2)) - }.collect().toSet - // a -> 0, b -> 2, c -> 1 - val expected = Set((0, 1.0, 0.0, 0.0), (1, 0.0, 0.0, 1.0), (2, 0.0, 1.0, 0.0), - (3, 1.0, 0.0, 0.0), (4, 1.0, 0.0, 0.0), (5, 0.0, 1.0, 0.0)) - assert(output === expected) + val expected = Seq( + (0, Vectors.sparse(3, Seq((0, 1.0)))), + (1, Vectors.sparse(3, Seq((2, 1.0)))), + (2, Vectors.sparse(3, Seq((1, 1.0)))), + (3, Vectors.sparse(3, Seq((0, 1.0)))), + (4, Vectors.sparse(3, Seq((0, 1.0)))), + (5, Vectors.sparse(3, Seq((1, 1.0))))).toDF("id", "expected") + + val withExpected = transformed.join(expected, "id") + testTransformer[(Int, String, Double, Vector)](withExpected, encoder, "labelVec", "expected") { + case Row(output: Vector, expected: Vector) => + assert(output === expected) + } } test("OneHotEncoder dropLast = true") { @@ -71,16 +74,19 @@ class OneHotEncoderSuite val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") - val encoded = encoder.transform(transformed) - - val output = encoded.select("id", "labelVec").rdd.map { r => - val vec = r.getAs[Vector](1) - (r.getInt(0), vec(0), vec(1)) - 
}.collect().toSet - // a -> 0, b -> 2, c -> 1 - val expected = Set((0, 1.0, 0.0), (1, 0.0, 0.0), (2, 0.0, 1.0), - (3, 1.0, 0.0), (4, 1.0, 0.0), (5, 0.0, 1.0)) - assert(output === expected) + val expected = Seq( + (0, Vectors.sparse(2, Seq((0, 1.0)))), + (1, Vectors.sparse(2, Seq())), + (2, Vectors.sparse(2, Seq((1, 1.0)))), + (3, Vectors.sparse(2, Seq((0, 1.0)))), + (4, Vectors.sparse(2, Seq((0, 1.0)))), + (5, Vectors.sparse(2, Seq((1, 1.0))))).toDF("id", "expected") + + val withExpected = transformed.join(expected, "id") + testTransformer[(Int, String, Double, Vector)](withExpected, encoder, "labelVec", "expected") { + case Row(output: Vector, expected: Vector) => + assert(output === expected) + } } test("input column with ML attribute") { @@ -90,20 +96,22 @@ class OneHotEncoderSuite val encoder = new OneHotEncoder() .setInputCol("size") .setOutputCol("encoded") - val output = encoder.transform(df) - val group = AttributeGroup.fromStructField(output.schema("encoded")) - assert(group.size === 2) - assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("small").withIndex(0)) - assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("medium").withIndex(1)) + testTransformerByGlobalCheckFunc[(Double)](df, encoder, "encoded") { rows => + val group = AttributeGroup.fromStructField(rows.head.schema("encoded")) + assert(group.size === 2) + assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("small").withIndex(0)) + assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("medium").withIndex(1)) + } } + test("input column without ML attribute") { val df = Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply).toDF("index") val encoder = new OneHotEncoder() .setInputCol("index") .setOutputCol("encoded") - val output = encoder.transform(df) - val group = AttributeGroup.fromStructField(output.schema("encoded")) + val rows = encoder.transform(df).select("encoded").collect() + val group = AttributeGroup.fromStructField(rows.head.schema("encoded")) assert(group.size === 2) assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("0").withIndex(0)) assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("1").withIndex(1)) @@ -119,29 +127,41 @@ class OneHotEncoderSuite test("OneHotEncoder with varying types") { val df = stringIndexed() - val dfWithTypes = df - .withColumn("shortLabel", df("labelIndex").cast(ShortType)) - .withColumn("longLabel", df("labelIndex").cast(LongType)) - .withColumn("intLabel", df("labelIndex").cast(IntegerType)) - .withColumn("floatLabel", df("labelIndex").cast(FloatType)) - .withColumn("decimalLabel", df("labelIndex").cast(DecimalType(10, 0))) - val cols = Array("labelIndex", "shortLabel", "longLabel", "intLabel", - "floatLabel", "decimalLabel") - for (col <- cols) { + val attr = NominalAttribute.defaultAttr.withValues("small", "medium", "large") + val expected = Seq( + (0, Vectors.sparse(3, Seq((0, 1.0)))), + (1, Vectors.sparse(3, Seq((2, 1.0)))), + (2, Vectors.sparse(3, Seq((1, 1.0)))), + (3, Vectors.sparse(3, Seq((0, 1.0)))), + (4, Vectors.sparse(3, Seq((0, 1.0)))), + (5, Vectors.sparse(3, Seq((1, 1.0))))).toDF("id", "expected") + + val withExpected = df.join(expected, "id") + + class NumericTypeWithEncoder[A](val numericType: NumericType) + (implicit val encoder: Encoder[(A, Vector)]) + + val types = Seq( + new NumericTypeWithEncoder[Short](ShortType), + new NumericTypeWithEncoder[Long](LongType), + new NumericTypeWithEncoder[Int](IntegerType), + new NumericTypeWithEncoder[Float](FloatType), + new 
NumericTypeWithEncoder[Byte](ByteType), + new NumericTypeWithEncoder[Double](DoubleType), + new NumericTypeWithEncoder[Decimal](DecimalType(10, 0))(ExpressionEncoder())) + + for (t <- types) { + val dfWithTypes = withExpected.select(col("labelIndex") + .cast(t.numericType).as("labelIndex", attr.toMetadata()), col("expected")) val encoder = new OneHotEncoder() - .setInputCol(col) + .setInputCol("labelIndex") .setOutputCol("labelVec") .setDropLast(false) - val encoded = encoder.transform(dfWithTypes) - - val output = encoded.select("id", "labelVec").rdd.map { r => - val vec = r.getAs[Vector](1) - (r.getInt(0), vec(0), vec(1), vec(2)) - }.collect().toSet - // a -> 0, b -> 2, c -> 1 - val expected = Set((0, 1.0, 0.0, 0.0), (1, 0.0, 0.0, 1.0), (2, 0.0, 1.0, 0.0), - (3, 1.0, 0.0, 0.0), (4, 1.0, 0.0, 0.0), (5, 0.0, 1.0, 0.0)) - assert(output === expected) + + testTransformer(dfWithTypes, encoder, "labelVec", "expected") { + case Row(output: Vector, expected: Vector) => + assert(output === expected) + }(t.encoder) } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/PCASuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/PCASuite.scala index 3067a52a4df76..531b1d7c4d9f7 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/PCASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/PCASuite.scala @@ -17,17 +17,15 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix -import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row -class PCASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class PCASuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -62,10 +60,10 @@ class PCASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRead val pcaModel = pca.fit(df) MLTestingUtils.checkCopyAndUids(pca, pcaModel) - - pcaModel.transform(df).select("pca_features", "expected").collect().foreach { - case Row(x: Vector, y: Vector) => - assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") + testTransformer[(Vector, Vector)](df, pcaModel, "pca_features", "expected") { + case Row(result: Vector, expected: Vector) => + assert(result ~== expected absTol 1e-5, + "Transformed vector is different with expected vector.") } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala index e4b0ddf98bfad..0be7aa6c83f29 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala @@ -17,18 +17,13 @@ package org.apache.spark.ml.feature -import org.scalatest.exceptions.TestFailedException - -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.ml.util.TestingUtils._ 
-import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row -class PolynomialExpansionSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class PolynomialExpansionSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -60,6 +55,18 @@ class PolynomialExpansionSuite -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0), Vectors.sparse(19, Array.empty, Array.empty)) + def assertTypeOfVector(lhs: Vector, rhs: Vector): Unit = { + assert((lhs, rhs) match { + case (v1: DenseVector, v2: DenseVector) => true + case (v1: SparseVector, v2: SparseVector) => true + case _ => false + }, "The vector type should be preserved after polynomial expansion.") + } + + def assertValues(lhs: Vector, rhs: Vector): Unit = { + assert(lhs ~== rhs absTol 1e-1, "The vector value is not correct after polynomial expansion.") + } + test("Polynomial expansion with default parameter") { val df = data.zip(twoDegreeExpansion).toSeq.toDF("features", "expected") @@ -67,13 +74,10 @@ class PolynomialExpansionSuite .setInputCol("features") .setOutputCol("polyFeatures") - polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { - case Row(expanded: DenseVector, expected: DenseVector) => - assert(expanded ~== expected absTol 1e-1) - case Row(expanded: SparseVector, expected: SparseVector) => - assert(expanded ~== expected absTol 1e-1) - case _ => - throw new TestFailedException("Unmatched data types after polynomial expansion", 0) + testTransformer[(Vector, Vector)](df, polynomialExpansion, "polyFeatures", "expected") { + case Row(expanded: Vector, expected: Vector) => + assertTypeOfVector(expanded, expected) + assertValues(expanded, expected) } } @@ -85,13 +89,10 @@ class PolynomialExpansionSuite .setOutputCol("polyFeatures") .setDegree(3) - polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { - case Row(expanded: DenseVector, expected: DenseVector) => - assert(expanded ~== expected absTol 1e-1) - case Row(expanded: SparseVector, expected: SparseVector) => - assert(expanded ~== expected absTol 1e-1) - case _ => - throw new TestFailedException("Unmatched data types after polynomial expansion", 0) + testTransformer[(Vector, Vector)](df, polynomialExpansion, "polyFeatures", "expected") { + case Row(expanded: Vector, expected: Vector) => + assertTypeOfVector(expanded, expected) + assertValues(expanded, expected) } } @@ -103,11 +104,9 @@ class PolynomialExpansionSuite .setOutputCol("polyFeatures") .setDegree(1) - polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { + testTransformer[(Vector, Vector)](df, polynomialExpansion, "polyFeatures", "expected") { case Row(expanded: Vector, expected: Vector) => - assert(expanded ~== expected absTol 1e-1) - case _ => - throw new TestFailedException("Unmatched data types after polynomial expansion", 0) + assertValues(expanded, expected) } } @@ -133,12 +132,13 @@ class PolynomialExpansionSuite .setOutputCol("polyFeatures") for (i <- Seq(10, 11)) { - val transformed = t.setDegree(i) - .transform(df) - .select(s"expectedPoly${i}size", "polyFeatures") - .rdd.map { case Row(expected: Int, v: Vector) => expected == v.size } - - assert(transformed.collect.forall(identity)) + testTransformer[(Vector, Int, Int)]( + df, + t.setDegree(i), + s"expectedPoly${i}size", + "polyFeatures") { case Row(size: Int, expected: Vector) => + assert(size === expected.size) + } } } } diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala index 6c363799dd300..b009038bbd833 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala @@ -17,15 +17,11 @@ package org.apache.spark.ml.feature -import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.Pipeline -import org.apache.spark.ml.util.DefaultReadWriteTest -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.sql._ -import org.apache.spark.sql.functions.udf -class QuantileDiscretizerSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class QuantileDiscretizerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -40,19 +36,19 @@ class QuantileDiscretizerSuite .setInputCol("input") .setOutputCol("result") .setNumBuckets(numBuckets) - val result = discretizer.fit(df).transform(df) - - val observedNumBuckets = result.select("result").distinct.count - assert(observedNumBuckets === numBuckets, - "Observed number of buckets does not equal expected number of buckets.") + val model = discretizer.fit(df) - val relativeError = discretizer.getRelativeError - val isGoodBucket = udf { - (size: Int) => math.abs( size - (datasetSize / numBuckets)) <= (relativeError * datasetSize) + testTransformerByGlobalCheckFunc[(Double)](df, model, "result") { rows => + val result = rows.map { r => Tuple1(r.getDouble(0)) }.toDF("result") + val observedNumBuckets = result.select("result").distinct.count + assert(observedNumBuckets === numBuckets, + "Observed number of buckets does not equal expected number of buckets.") + val relativeError = discretizer.getRelativeError + val numGoodBuckets = result.groupBy("result").count + .filter(s"abs(count - ${datasetSize / numBuckets}) <= ${relativeError * datasetSize}").count + assert(numGoodBuckets === numBuckets, + "Bucket sizes are not within expected relative error tolerance.") } - val numGoodBuckets = result.groupBy("result").count.filter(isGoodBucket($"count")).count - assert(numGoodBuckets === numBuckets, - "Bucket sizes are not within expected relative error tolerance.") } test("Test on data with high proportion of duplicated values") { @@ -67,11 +63,14 @@ class QuantileDiscretizerSuite .setInputCol("input") .setOutputCol("result") .setNumBuckets(numBuckets) - val result = discretizer.fit(df).transform(df) - val observedNumBuckets = result.select("result").distinct.count - assert(observedNumBuckets == expectedNumBuckets, - s"Observed number of buckets are not correct." + - s" Expected $expectedNumBuckets but found $observedNumBuckets") + val model = discretizer.fit(df) + testTransformerByGlobalCheckFunc[(Double)](df, model, "result") { rows => + val result = rows.map { r => Tuple1(r.getDouble(0)) }.toDF("result") + val observedNumBuckets = result.select("result").distinct.count + assert(observedNumBuckets == expectedNumBuckets, + s"Observed number of buckets are not correct." 
+ + s" Expected $expectedNumBuckets but found $observedNumBuckets") + } } test("Test transform on data with NaN value") { @@ -90,17 +89,20 @@ class QuantileDiscretizerSuite withClue("QuantileDiscretizer with handleInvalid=error should throw exception for NaN values") { val dataFrame: DataFrame = validData.toSeq.toDF("input") - intercept[SparkException] { - discretizer.fit(dataFrame).transform(dataFrame).collect() - } + val model = discretizer.fit(dataFrame) + testTransformerByInterceptingException[(Double)]( + dataFrame, + model, + expectedMessagePart = "Bucketizer encountered NaN value.", + firstResultCol = "result") } List(("keep", expectedKeep), ("skip", expectedSkip)).foreach{ case(u, v) => discretizer.setHandleInvalid(u) val dataFrame: DataFrame = validData.zip(v).toSeq.toDF("input", "expected") - val result = discretizer.fit(dataFrame).transform(dataFrame) - result.select("result", "expected").collect().foreach { + val model = discretizer.fit(dataFrame) + testTransformer[(Double, Double)](dataFrame, model, "result", "expected") { case Row(x: Double, y: Double) => assert(x === y, s"The feature value is not correct after bucketing. Expected $y but found $x") @@ -119,14 +121,17 @@ class QuantileDiscretizerSuite .setOutputCol("result") .setNumBuckets(5) - val result = discretizer.fit(trainDF).transform(testDF) - val firstBucketSize = result.filter(result("result") === 0.0).count - val lastBucketSize = result.filter(result("result") === 4.0).count + val model = discretizer.fit(trainDF) + testTransformerByGlobalCheckFunc[(Double)](testDF, model, "result") { rows => + val result = rows.map { r => Tuple1(r.getDouble(0)) }.toDF("result") + val firstBucketSize = result.filter(result("result") === 0.0).count + val lastBucketSize = result.filter(result("result") === 4.0).count - assert(firstBucketSize === 30L, - s"Size of first bucket ${firstBucketSize} did not equal expected value of 30.") - assert(lastBucketSize === 31L, - s"Size of last bucket ${lastBucketSize} did not equal expected value of 31.") + assert(firstBucketSize === 30L, + s"Size of first bucket ${firstBucketSize} did not equal expected value of 30.") + assert(lastBucketSize === 31L, + s"Size of last bucket ${lastBucketSize} did not equal expected value of 31.") + } } test("read/write") { @@ -167,21 +172,24 @@ class QuantileDiscretizerSuite .setInputCols(Array("input1", "input2")) .setOutputCols(Array("result1", "result2")) .setNumBuckets(numBuckets) - val result = discretizer.fit(df).transform(df) - - val relativeError = discretizer.getRelativeError - val isGoodBucket = udf { - (size: Int) => math.abs( size - (datasetSize / numBuckets)) <= (relativeError * datasetSize) - } - - for (i <- 1 to 2) { - val observedNumBuckets = result.select("result" + i).distinct.count - assert(observedNumBuckets === numBuckets, - "Observed number of buckets does not equal expected number of buckets.") - - val numGoodBuckets = result.groupBy("result" + i).count.filter(isGoodBucket($"count")).count - assert(numGoodBuckets === numBuckets, - "Bucket sizes are not within expected relative error tolerance.") + val model = discretizer.fit(df) + testTransformerByGlobalCheckFunc[(Double, Double)](df, model, "result1", "result2") { rows => + val result = + rows.map { r => Tuple2(r.getDouble(0), r.getDouble(1)) }.toDF("result1", "result2") + val relativeError = discretizer.getRelativeError + for (i <- 1 to 2) { + val observedNumBuckets = result.select("result" + i).distinct.count + assert(observedNumBuckets === numBuckets, + "Observed number of buckets does not 
equal expected number of buckets.") + + val numGoodBuckets = result + .groupBy("result" + i) + .count + .filter(s"abs(count - ${datasetSize / numBuckets}) <= ${relativeError * datasetSize}") + .count + assert(numGoodBuckets === numBuckets, + "Bucket sizes are not within expected relative error tolerance.") + } } } @@ -198,12 +206,16 @@ class QuantileDiscretizerSuite .setInputCols(Array("input1", "input2")) .setOutputCols(Array("result1", "result2")) .setNumBuckets(numBuckets) - val result = discretizer.fit(df).transform(df) - for (i <- 1 to 2) { - val observedNumBuckets = result.select("result" + i).distinct.count - assert(observedNumBuckets == expectedNumBucket, - s"Observed number of buckets are not correct." + - s" Expected $expectedNumBucket but found ($observedNumBuckets") + val model = discretizer.fit(df) + testTransformerByGlobalCheckFunc[(Double, Double)](df, model, "result1", "result2") { rows => + val result = + rows.map { r => Tuple2(r.getDouble(0), r.getDouble(1)) }.toDF("result1", "result2") + for (i <- 1 to 2) { + val observedNumBuckets = result.select("result" + i).distinct.count + assert(observedNumBuckets == expectedNumBucket, + s"Observed number of buckets are not correct." + + s" Expected $expectedNumBucket but found ($observedNumBuckets") + } } } @@ -226,9 +238,12 @@ class QuantileDiscretizerSuite withClue("QuantileDiscretizer with handleInvalid=error should throw exception for NaN values") { val dataFrame: DataFrame = validData1.zip(validData2).toSeq.toDF("input1", "input2") - intercept[SparkException] { - discretizer.fit(dataFrame).transform(dataFrame).collect() - } + val model = discretizer.fit(dataFrame) + testTransformerByInterceptingException[(Double, Double)]( + dataFrame, + model, + expectedMessagePart = "Bucketizer encountered NaN value.", + firstResultCol = "result1") } List(("keep", expectedKeep1, expectedKeep2), ("skip", expectedSkip1, expectedSkip2)).foreach { @@ -237,8 +252,14 @@ class QuantileDiscretizerSuite val dataFrame: DataFrame = validData1.zip(validData2).zip(v).zip(w).map { case (((a, b), c), d) => (a, b, c, d) }.toSeq.toDF("input1", "input2", "expected1", "expected2") - val result = discretizer.fit(dataFrame).transform(dataFrame) - result.select("result1", "expected1", "result2", "expected2").collect().foreach { + val model = discretizer.fit(dataFrame) + testTransformer[(Double, Double, Double, Double)]( + dataFrame, + model, + "result1", + "expected1", + "result2", + "expected2") { case Row(x: Double, y: Double, z: Double, w: Double) => assert(x === y && w === z) } @@ -270,9 +291,16 @@ class QuantileDiscretizerSuite .setOutputCols(Array("result1", "result2", "result3")) .setNumBucketsArray(numBucketsArray) - discretizer.fit(df).transform(df). - select("result1", "expected1", "result2", "expected2", "result3", "expected3") - .collect().foreach { + val model = discretizer.fit(df) + testTransformer[(Double, Double, Double, Double, Double, Double)]( + df, + model, + "result1", + "expected1", + "result2", + "expected2", + "result3", + "expected3") { case Row(r1: Double, e1: Double, r2: Double, e2: Double, r3: Double, e3: Double) => assert(r1 === e1, s"The result value is not correct after bucketing. 
Expected $e1 but found $r1") @@ -324,20 +352,16 @@ class QuantileDiscretizerSuite .setStages(Array(discretizerForCol1, discretizerForCol2, discretizerForCol3)) .fit(df) - val resultForMultiCols = plForMultiCols.transform(df) - .select("result1", "result2", "result3") - .collect() - - val resultForSingleCol = plForSingleCol.transform(df) - .select("result1", "result2", "result3") - .collect() + val expected = plForSingleCol.transform(df).select("result1", "result2", "result3").collect() - resultForSingleCol.zip(resultForMultiCols).foreach { - case (rowForSingle, rowForMultiCols) => - assert(rowForSingle.getDouble(0) == rowForMultiCols.getDouble(0) && - rowForSingle.getDouble(1) == rowForMultiCols.getDouble(1) && - rowForSingle.getDouble(2) == rowForMultiCols.getDouble(2)) - } + testTransformerByGlobalCheckFunc[(Double, Double, Double)]( + df, + plForMultiCols, + "result1", + "result2", + "result3") { rows => + assert(rows === expected) + } } test("Multiple Columns: Comparing setting numBuckets with setting numBucketsArray " + @@ -364,18 +388,16 @@ class QuantileDiscretizerSuite .setOutputCols(Array("result1", "result2", "result3")) .setNumBucketsArray(Array(10, 10, 10)) - val result1 = discretizerSingleNumBuckets.fit(df).transform(df) - .select("result1", "result2", "result3") - .collect() - val result2 = discretizerNumBucketsArray.fit(df).transform(df) - .select("result1", "result2", "result3") - .collect() - - result1.zip(result2).foreach { - case (row1, row2) => - assert(row1.getDouble(0) == row2.getDouble(0) && - row1.getDouble(1) == row2.getDouble(1) && - row1.getDouble(2) == row2.getDouble(2)) + val model = discretizerSingleNumBuckets.fit(df) + val expected = model.transform(df).select("result1", "result2", "result3").collect() + + testTransformerByGlobalCheckFunc[(Double, Double, Double)]( + df, + discretizerNumBucketsArray.fit(df), + "result1", + "result2", + "result3") { rows => + assert(rows === expected) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala index bfe38d32dd77d..27d570f0b68ad 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala @@ -17,7 +17,6 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkException import org.apache.spark.ml.attribute._ import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite @@ -32,10 +31,20 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { def testRFormulaTransform[A: Encoder]( dataframe: DataFrame, formulaModel: RFormulaModel, - expected: DataFrame): Unit = { + expected: DataFrame, + expectedAttributes: AttributeGroup*): Unit = { + val resultSchema = formulaModel.transformSchema(dataframe.schema) + assert(resultSchema.json === expected.schema.json) + assert(resultSchema === expected.schema) val (first +: rest) = expected.schema.fieldNames.toSeq val expectedRows = expected.collect() testTransformerByGlobalCheckFunc[A](dataframe, formulaModel, first, rest: _*) { rows => + assert(rows.head.schema.toString() == resultSchema.toString()) + for (expectedAttributeGroup <- expectedAttributes) { + val attributeGroup = + AttributeGroup.fromStructField(rows.head.schema(expectedAttributeGroup.name)) + assert(attributeGroup === expectedAttributeGroup) + } assert(rows === expectedRows) } } @@ -49,15 +58,10 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { val 
original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2") val model = formula.fit(original) MLTestingUtils.checkCopyAndUids(formula, model) - val result = model.transform(original) - val resultSchema = model.transformSchema(original.schema) val expected = Seq( (0, 1.0, 3.0, Vectors.dense(1.0, 3.0), 0.0), (2, 2.0, 5.0, Vectors.dense(2.0, 5.0), 2.0) ).toDF("id", "v1", "v2", "features", "label") - // TODO(ekl) make schema comparisons ignore metadata, to avoid .toString - assert(result.schema.toString == resultSchema.toString) - assert(resultSchema == expected.schema) testRFormulaTransform[(Int, Double, Double)](original, model, expected) } @@ -73,9 +77,13 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { val formula = new RFormula().setFormula("y ~ x").setLabelCol("y") val original = Seq((0, 1.0), (2, 2.0)).toDF("x", "y") val model = formula.fit(original) + val expected = Seq( + (0, 1.0, Vectors.dense(0.0)), + (2, 2.0, Vectors.dense(2.0)) + ).toDF("x", "y", "features") val resultSchema = model.transformSchema(original.schema) assert(resultSchema.length == 3) - assert(resultSchema.toString == model.transform(original).schema.toString) + testRFormulaTransform[(Int, Double)](original, model, expected) } test("label column already exists but forceIndexLabel was set with true") { @@ -93,9 +101,11 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { intercept[IllegalArgumentException] { model.transformSchema(original.schema) } - intercept[IllegalArgumentException] { - model.transform(original) - } + testTransformerByInterceptingException[(Int, Boolean)]( + original, + model, + "Label column already exists and is not of type NumericType.", + "x") } test("allow missing label column for test datasets") { @@ -105,21 +115,22 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { val resultSchema = model.transformSchema(original.schema) assert(resultSchema.length == 3) assert(!resultSchema.exists(_.name == "label")) - assert(resultSchema.toString == model.transform(original).schema.toString) + val expected = Seq( + (0, 1.0, Vectors.dense(0.0)), + (2, 2.0, Vectors.dense(2.0)) + ).toDF("x", "_not_y", "features") + testRFormulaTransform[(Int, Double)](original, model, expected) } test("allow empty label") { val original = Seq((1, 2.0, 3.0), (4, 5.0, 6.0), (7, 8.0, 9.0)).toDF("id", "a", "b") val formula = new RFormula().setFormula("~ a + b") val model = formula.fit(original) - val result = model.transform(original) - val resultSchema = model.transformSchema(original.schema) val expected = Seq( (1, 2.0, 3.0, Vectors.dense(2.0, 3.0)), (4, 5.0, 6.0, Vectors.dense(5.0, 6.0)), (7, 8.0, 9.0, Vectors.dense(8.0, 9.0)) ).toDF("id", "a", "b", "features") - assert(result.schema.toString == resultSchema.toString) testRFormulaTransform[(Int, Double, Double)](original, model, expected) } @@ -128,15 +139,12 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "baz", 5)) .toDF("id", "a", "b") val model = formula.fit(original) - val result = model.transform(original) - val resultSchema = model.transformSchema(original.schema) val expected = Seq( (1, "foo", 4, Vectors.dense(0.0, 1.0, 4.0), 1.0), (2, "bar", 4, Vectors.dense(1.0, 0.0, 4.0), 2.0), (3, "bar", 5, Vectors.dense(1.0, 0.0, 5.0), 3.0), (4, "baz", 5, Vectors.dense(0.0, 0.0, 5.0), 4.0) ).toDF("id", "a", "b", "features", "label") - assert(result.schema.toString == resultSchema.toString) testRFormulaTransform[(Int, String, Int)](original, 
model, expected) } @@ -175,9 +183,6 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { var idx = 0 for (orderType <- StringIndexer.supportedStringOrderType) { val model = formula.setStringIndexerOrderType(orderType).fit(original) - val result = model.transform(original) - val resultSchema = model.transformSchema(original.schema) - assert(result.schema.toString == resultSchema.toString) testRFormulaTransform[(Int, String, Int)](original, model, expected(idx)) idx += 1 } @@ -218,9 +223,6 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { ).toDF("id", "a", "b", "features", "label") val model = formula.fit(original) - val result = model.transform(original) - val resultSchema = model.transformSchema(original.schema) - assert(result.schema.toString == resultSchema.toString) testRFormulaTransform[(Int, String, Int)](original, model, expected) } @@ -254,19 +256,6 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { val formula1 = new RFormula().setFormula("id ~ a + b + c - 1") .setStringIndexerOrderType(StringIndexer.alphabetDesc) val model1 = formula1.fit(original) - val result1 = model1.transform(original) - val resultSchema1 = model1.transformSchema(original.schema) - // Note the column order is different between R and Spark. - val expected1 = Seq( - (1, "foo", "zq", 4, Vectors.sparse(5, Array(0, 4), Array(1.0, 4.0)), 1.0), - (2, "bar", "zz", 4, Vectors.dense(0.0, 0.0, 1.0, 1.0, 4.0), 2.0), - (3, "bar", "zz", 5, Vectors.dense(0.0, 0.0, 1.0, 1.0, 5.0), 3.0), - (4, "baz", "zz", 5, Vectors.dense(0.0, 1.0, 0.0, 1.0, 5.0), 4.0) - ).toDF("id", "a", "b", "c", "features", "label") - assert(result1.schema.toString == resultSchema1.toString) - testRFormulaTransform[(Int, String, String, Int)](original, model1, expected1) - - val attrs1 = AttributeGroup.fromStructField(result1.schema("features")) val expectedAttrs1 = new AttributeGroup( "features", Array[Attribute]( @@ -275,14 +264,20 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { new BinaryAttribute(Some("a_bar"), Some(3)), new BinaryAttribute(Some("b_zz"), Some(4)), new NumericAttribute(Some("c"), Some(5)))) - assert(attrs1 === expectedAttrs1) + // Note the column order is different between R and Spark. + val expected1 = Seq( + (1, "foo", "zq", 4, Vectors.sparse(5, Array(0, 4), Array(1.0, 4.0)), 1.0), + (2, "bar", "zz", 4, Vectors.dense(0.0, 0.0, 1.0, 1.0, 4.0), 2.0), + (3, "bar", "zz", 5, Vectors.dense(0.0, 0.0, 1.0, 1.0, 5.0), 3.0), + (4, "baz", "zz", 5, Vectors.dense(0.0, 1.0, 0.0, 1.0, 5.0), 4.0) + ).toDF("id", "a", "b", "c", "features", "label") + + testRFormulaTransform[(Int, String, String, Int)](original, model1, expected1, expectedAttrs1) // There is no impact for string terms interaction. val formula2 = new RFormula().setFormula("id ~ a:b + c - 1") .setStringIndexerOrderType(StringIndexer.alphabetDesc) val model2 = formula2.fit(original) - val result2 = model2.transform(original) - val resultSchema2 = model2.transformSchema(original.schema) // Note the column order is different between R and Spark. 
val expected2 = Seq( (1, "foo", "zq", 4, Vectors.sparse(7, Array(1, 6), Array(1.0, 4.0)), 1.0), @@ -290,10 +285,6 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { (3, "bar", "zz", 5, Vectors.sparse(7, Array(4, 6), Array(1.0, 5.0)), 3.0), (4, "baz", "zz", 5, Vectors.sparse(7, Array(2, 6), Array(1.0, 5.0)), 4.0) ).toDF("id", "a", "b", "c", "features", "label") - assert(result2.schema.toString == resultSchema2.toString) - testRFormulaTransform[(Int, String, String, Int)](original, model2, expected2) - - val attrs2 = AttributeGroup.fromStructField(result2.schema("features")) val expectedAttrs2 = new AttributeGroup( "features", Array[Attribute]( @@ -304,7 +295,8 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { new NumericAttribute(Some("a_bar:b_zz"), Some(5)), new NumericAttribute(Some("a_bar:b_zq"), Some(6)), new NumericAttribute(Some("c"), Some(7)))) - assert(attrs2 === expectedAttrs2) + + testRFormulaTransform[(Int, String, String, Int)](original, model2, expected2, expectedAttrs2) } test("index string label") { @@ -313,13 +305,14 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { Seq(("male", "foo", 4), ("female", "bar", 4), ("female", "bar", 5), ("male", "baz", 5)) .toDF("id", "a", "b") val model = formula.fit(original) + val attr = NominalAttribute.defaultAttr val expected = Seq( ("male", "foo", 4, Vectors.dense(0.0, 1.0, 4.0), 1.0), ("female", "bar", 4, Vectors.dense(1.0, 0.0, 4.0), 0.0), ("female", "bar", 5, Vectors.dense(1.0, 0.0, 5.0), 0.0), ("male", "baz", 5, Vectors.dense(0.0, 0.0, 5.0), 1.0) ).toDF("id", "a", "b", "features", "label") - // assert(result.schema.toString == resultSchema.toString) + .select($"id", $"a", $"b", $"features", $"label".as("label", attr.toMetadata())) testRFormulaTransform[(String, String, Int)](original, model, expected) } @@ -329,13 +322,14 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { Seq((1.0, "foo", 4), (1.0, "bar", 4), (0.0, "bar", 5), (1.0, "baz", 5)) ).toDF("id", "a", "b") val model = formula.fit(original) - val expected = spark.createDataFrame( - Seq( + val attr = NominalAttribute.defaultAttr + val expected = Seq( (1.0, "foo", 4, Vectors.dense(0.0, 1.0, 4.0), 0.0), (1.0, "bar", 4, Vectors.dense(1.0, 0.0, 4.0), 0.0), (0.0, "bar", 5, Vectors.dense(1.0, 0.0, 5.0), 1.0), (1.0, "baz", 5, Vectors.dense(0.0, 0.0, 5.0), 0.0)) - ).toDF("id", "a", "b", "features", "label") + .toDF("id", "a", "b", "features", "label") + .select($"id", $"a", $"b", $"features", $"label".as("label", attr.toMetadata())) testRFormulaTransform[(Double, String, Int)](original, model, expected) } @@ -344,15 +338,20 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "baz", 5)) .toDF("id", "a", "b") val model = formula.fit(original) - val result = model.transform(original) - val attrs = AttributeGroup.fromStructField(result.schema("features")) + val expected = Seq( + (1, "foo", 4, Vectors.dense(0.0, 1.0, 4.0), 1.0), + (2, "bar", 4, Vectors.dense(1.0, 0.0, 4.0), 2.0), + (3, "bar", 5, Vectors.dense(1.0, 0.0, 5.0), 3.0), + (4, "baz", 5, Vectors.dense(0.0, 0.0, 5.0), 4.0)) + .toDF("id", "a", "b", "features", "label") val expectedAttrs = new AttributeGroup( "features", Array( new BinaryAttribute(Some("a_bar"), Some(1)), new BinaryAttribute(Some("a_foo"), Some(2)), new NumericAttribute(Some("b"), Some(3)))) - assert(attrs === expectedAttrs) + testRFormulaTransform[(Int, String, Int)](original, model, expected, expectedAttrs) + } test("vector 
attribute generation") { @@ -360,14 +359,19 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { val original = Seq((1, Vectors.dense(0.0, 1.0)), (2, Vectors.dense(1.0, 2.0))) .toDF("id", "vec") val model = formula.fit(original) - val result = model.transform(original) - val attrs = AttributeGroup.fromStructField(result.schema("features")) + val attrs = new AttributeGroup("vec", 2) + val expected = Seq( + (1, Vectors.dense(0.0, 1.0), Vectors.dense(0.0, 1.0), 1.0), + (2, Vectors.dense(1.0, 2.0), Vectors.dense(1.0, 2.0), 2.0)) + .toDF("id", "vec", "features", "label") + .select($"id", $"vec".as("vec", attrs.toMetadata()), $"features", $"label") val expectedAttrs = new AttributeGroup( "features", Array[Attribute]( new NumericAttribute(Some("vec_0"), Some(1)), new NumericAttribute(Some("vec_1"), Some(2)))) - assert(attrs === expectedAttrs) + + testRFormulaTransform[(Int, Vector)](original, model, expected, expectedAttrs) } test("vector attribute generation with unnamed input attrs") { @@ -381,31 +385,31 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { NumericAttribute.defaultAttr)).toMetadata() val original = base.select(base.col("id"), base.col("vec").as("vec2", metadata)) val model = formula.fit(original) - val result = model.transform(original) - val attrs = AttributeGroup.fromStructField(result.schema("features")) + val expected = Seq( + (1, Vectors.dense(0.0, 1.0), Vectors.dense(0.0, 1.0), 1.0), + (2, Vectors.dense(1.0, 2.0), Vectors.dense(1.0, 2.0), 2.0) + ).toDF("id", "vec2", "features", "label") + .select($"id", $"vec2".as("vec2", metadata), $"features", $"label") val expectedAttrs = new AttributeGroup( "features", Array[Attribute]( new NumericAttribute(Some("vec2_0"), Some(1)), new NumericAttribute(Some("vec2_1"), Some(2)))) - assert(attrs === expectedAttrs) + testRFormulaTransform[(Int, Vector)](original, model, expected, expectedAttrs) } test("numeric interaction") { val formula = new RFormula().setFormula("a ~ b:c:d") val original = Seq((1, 2, 4, 2), (2, 3, 4, 1)).toDF("a", "b", "c", "d") val model = formula.fit(original) - val result = model.transform(original) val expected = Seq( (1, 2, 4, 2, Vectors.dense(16.0), 1.0), (2, 3, 4, 1, Vectors.dense(12.0), 2.0) ).toDF("a", "b", "c", "d", "features", "label") - testRFormulaTransform[(Int, Int, Int, Int)](original, model, expected) - val attrs = AttributeGroup.fromStructField(result.schema("features")) val expectedAttrs = new AttributeGroup( "features", Array[Attribute](new NumericAttribute(Some("b:c:d"), Some(1)))) - assert(attrs === expectedAttrs) + testRFormulaTransform[(Int, Int, Int, Int)](original, model, expected, expectedAttrs) } test("factor numeric interaction") { @@ -414,7 +418,6 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "baz", 5), (4, "baz", 5), (4, "baz", 5)) .toDF("id", "a", "b") val model = formula.fit(original) - val result = model.transform(original) val expected = Seq( (1, "foo", 4, Vectors.dense(0.0, 0.0, 4.0), 1.0), (2, "bar", 4, Vectors.dense(0.0, 4.0, 0.0), 2.0), @@ -423,15 +426,13 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { (4, "baz", 5, Vectors.dense(5.0, 0.0, 0.0), 4.0), (4, "baz", 5, Vectors.dense(5.0, 0.0, 0.0), 4.0) ).toDF("id", "a", "b", "features", "label") - testRFormulaTransform[(Int, String, Int)](original, model, expected) - val attrs = AttributeGroup.fromStructField(result.schema("features")) val expectedAttrs = new AttributeGroup( "features", Array[Attribute]( new 
NumericAttribute(Some("a_baz:b"), Some(1)), new NumericAttribute(Some("a_bar:b"), Some(2)), new NumericAttribute(Some("a_foo:b"), Some(3)))) - assert(attrs === expectedAttrs) + testRFormulaTransform[(Int, String, Int)](original, model, expected, expectedAttrs) } test("factor factor interaction") { @@ -439,14 +440,12 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { val original = Seq((1, "foo", "zq"), (2, "bar", "zq"), (3, "bar", "zz")).toDF("id", "a", "b") val model = formula.fit(original) - val result = model.transform(original) val expected = Seq( (1, "foo", "zq", Vectors.dense(0.0, 0.0, 1.0, 0.0), 1.0), (2, "bar", "zq", Vectors.dense(1.0, 0.0, 0.0, 0.0), 2.0), (3, "bar", "zz", Vectors.dense(0.0, 1.0, 0.0, 0.0), 3.0) ).toDF("id", "a", "b", "features", "label") testRFormulaTransform[(Int, String, String)](original, model, expected) - val attrs = AttributeGroup.fromStructField(result.schema("features")) val expectedAttrs = new AttributeGroup( "features", Array[Attribute]( @@ -454,7 +453,7 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { new NumericAttribute(Some("a_bar:b_zz"), Some(2)), new NumericAttribute(Some("a_foo:b_zq"), Some(3)), new NumericAttribute(Some("a_foo:b_zz"), Some(4)))) - assert(attrs === expectedAttrs) + testRFormulaTransform[(Int, String, String)](original, model, expected, expectedAttrs) } test("read/write: RFormula") { @@ -517,9 +516,11 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { // Handle unseen features. val formula1 = new RFormula().setFormula("id ~ a + b") - intercept[SparkException] { - formula1.fit(df1).transform(df2).collect() - } + testTransformerByInterceptingException[(Int, String, String)]( + df2, + formula1.fit(df1), + "Unseen label:", + "features") val model1 = formula1.setHandleInvalid("skip").fit(df1) val model2 = formula1.setHandleInvalid("keep").fit(df1) @@ -538,21 +539,28 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { // Handle unseen labels. 
val formula2 = new RFormula().setFormula("b ~ a + id") - intercept[SparkException] { - formula2.fit(df1).transform(df2).collect() - } + testTransformerByInterceptingException[(Int, String, String)]( + df2, + formula2.fit(df1), + "Unseen label:", + "label") + val model3 = formula2.setHandleInvalid("skip").fit(df1) val model4 = formula2.setHandleInvalid("keep").fit(df1) + val attr = NominalAttribute.defaultAttr val expected3 = Seq( (1, "foo", "zq", Vectors.dense(0.0, 1.0), 0.0), (2, "bar", "zq", Vectors.dense(1.0, 2.0), 0.0) ).toDF("id", "a", "b", "features", "label") + .select($"id", $"a", $"b", $"features", $"label".as("label", attr.toMetadata())) + val expected4 = Seq( (1, "foo", "zq", Vectors.dense(0.0, 1.0, 1.0), 0.0), (2, "bar", "zq", Vectors.dense(1.0, 0.0, 2.0), 0.0), (3, "bar", "zy", Vectors.dense(1.0, 0.0, 3.0), 2.0) ).toDF("id", "a", "b", "features", "label") + .select($"id", $"a", $"b", $"features", $"label".as("label", attr.toMetadata())) testRFormulaTransform[(Int, String, String)](df2, model3, expected3) testRFormulaTransform[(Int, String, String)](df2, model4, expected4) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/SQLTransformerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/SQLTransformerSuite.scala index 673a146e619f2..cf09418d8e0a2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/SQLTransformerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/SQLTransformerSuite.scala @@ -17,15 +17,12 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.DefaultReadWriteTest -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.sql.types.{LongType, StructField, StructType} import org.apache.spark.storage.StorageLevel -class SQLTransformerSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class SQLTransformerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -37,14 +34,22 @@ class SQLTransformerSuite val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2") val sqlTrans = new SQLTransformer().setStatement( "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__") - val result = sqlTrans.transform(original) - val resultSchema = sqlTrans.transformSchema(original.schema) - val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0)) + val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0)) .toDF("id", "v1", "v2", "v3", "v4") - assert(result.schema.toString == resultSchema.toString) - assert(resultSchema == expected.schema) - assert(result.collect().toSeq == expected.collect().toSeq) - assert(original.sparkSession.catalog.listTables().count() == 0) + val resultSchema = sqlTrans.transformSchema(original.schema) + testTransformerByGlobalCheckFunc[(Int, Double, Double)]( + original, + sqlTrans, + "id", + "v1", + "v2", + "v3", + "v4") { rows => + assert(rows.head.schema.toString == resultSchema.toString) + assert(resultSchema == expected.schema) + assert(rows == expected.collect().toSeq) + assert(original.sparkSession.catalog.listTables().count() == 0) + } } test("read/write") { @@ -63,13 +68,13 @@ class SQLTransformerSuite } test("SPARK-22538: SQLTransformer should not unpersist given dataset") { - val df = spark.range(10) + val df = spark.range(10).toDF() df.cache() df.count() assert(df.storageLevel != StorageLevel.NONE) - new SQLTransformer() + 
val sqlTrans = new SQLTransformer() .setStatement("SELECT id + 1 AS id1 FROM __THIS__") - .transform(df) + testTransformerByGlobalCheckFunc[Long](df, sqlTrans, "id1") { _ => } assert(df.storageLevel != StorageLevel.NONE) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala index 350ba44baa1eb..c5c49d67194e4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala @@ -17,16 +17,13 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} -class StandardScalerSuite extends SparkFunSuite with MLlibTestSparkContext - with DefaultReadWriteTest { +class StandardScalerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -60,12 +57,10 @@ class StandardScalerSuite extends SparkFunSuite with MLlibTestSparkContext ) } - def assertResult(df: DataFrame): Unit = { - df.select("standardized_features", "expected").collect().foreach { - case Row(vector1: Vector, vector2: Vector) => - assert(vector1 ~== vector2 absTol 1E-5, - "The vector value is not correct after standardization.") - } + def assertResult: Row => Unit = { + case Row(vector1: Vector, vector2: Vector) => + assert(vector1 ~== vector2 absTol 1E-5, + "The vector value is not correct after standardization.") } test("params") { @@ -83,7 +78,8 @@ class StandardScalerSuite extends SparkFunSuite with MLlibTestSparkContext val standardScaler0 = standardScalerEst0.fit(df0) MLTestingUtils.checkCopyAndUids(standardScalerEst0, standardScaler0) - assertResult(standardScaler0.transform(df0)) + testTransformer[(Vector, Vector)](df0, standardScaler0, "standardized_features", "expected")( + assertResult) } test("Standardization with setter") { @@ -112,9 +108,12 @@ class StandardScalerSuite extends SparkFunSuite with MLlibTestSparkContext .setWithStd(false) .fit(df3) - assertResult(standardScaler1.transform(df1)) - assertResult(standardScaler2.transform(df2)) - assertResult(standardScaler3.transform(df3)) + testTransformer[(Vector, Vector)](df1, standardScaler1, "standardized_features", "expected")( + assertResult) + testTransformer[(Vector, Vector)](df2, standardScaler2, "standardized_features", "expected")( + assertResult) + testTransformer[(Vector, Vector)](df3, standardScaler3, "standardized_features", "expected")( + assertResult) } test("sparse data and withMean") { @@ -130,7 +129,8 @@ class StandardScalerSuite extends SparkFunSuite with MLlibTestSparkContext .setWithMean(true) .setWithStd(false) .fit(df) - assertResult(standardScaler.transform(df)) + testTransformer[(Vector, Vector)](df, standardScaler, "standardized_features", "expected")( + assertResult) } test("StandardScaler read/write") { @@ -149,4 +149,5 @@ class StandardScalerSuite extends SparkFunSuite with MLlibTestSparkContext assert(newInstance.std === instance.std) assert(newInstance.mean === instance.mean) } + } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index 5262b146b184e..21259a50916d2 100755 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -17,28 +17,20 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.util.DefaultReadWriteTest -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{Dataset, Row} - -object StopWordsRemoverSuite extends SparkFunSuite { - def testStopWordsRemover(t: StopWordsRemover, dataset: Dataset[_]): Unit = { - t.transform(dataset) - .select("filtered", "expected") - .collect() - .foreach { case Row(tokens, wantedTokens) => - assert(tokens === wantedTokens) - } - } -} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} +import org.apache.spark.sql.{DataFrame, Row} -class StopWordsRemoverSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class StopWordsRemoverSuite extends MLTest with DefaultReadWriteTest { - import StopWordsRemoverSuite._ import testImplicits._ + def testStopWordsRemover(t: StopWordsRemover, dataFrame: DataFrame): Unit = { + testTransformer[(Array[String], Array[String])](dataFrame, t, "filtered", "expected") { + case Row(tokens: Seq[_], wantedTokens: Seq[_]) => + assert(tokens === wantedTokens) + } + } + test("StopWordsRemover default") { val remover = new StopWordsRemover() .setInputCol("raw") @@ -151,9 +143,10 @@ class StopWordsRemoverSuite .setOutputCol(outputCol) val dataSet = Seq((Seq("The", "the", "swift"), Seq("swift"))).toDF("raw", outputCol) - val thrown = intercept[IllegalArgumentException] { - testStopWordsRemover(remover, dataSet) - } - assert(thrown.getMessage == s"requirement failed: Column $outputCol already exists.") + testTransformerByInterceptingException[(Array[String], Array[String])]( + dataSet, + remover, + s"requirement failed: Column $outputCol already exists.", + "expected") } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala index 775a04d3df050..df24367177011 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala @@ -17,17 +17,14 @@ package org.apache.spark.ml.feature -import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.attribute.{Attribute, NominalAttribute} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.sql.Row import org.apache.spark.sql.functions.col import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} -class StringIndexerSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class StringIndexerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -46,19 +43,23 @@ class StringIndexerSuite .setInputCol("label") .setOutputCol("labelIndex") val indexerModel = indexer.fit(df) - MLTestingUtils.checkCopyAndUids(indexer, indexerModel) - - val transformed = indexerModel.transform(df) - val attr = Attribute.fromStructField(transformed.schema("labelIndex")) - 
.asInstanceOf[NominalAttribute] - assert(attr.values.get === Array("a", "c", "b")) - val output = transformed.select("id", "labelIndex").rdd.map { r => - (r.getInt(0), r.getDouble(1)) - }.collect().toSet // a -> 0, b -> 2, c -> 1 - val expected = Set((0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)) - assert(output === expected) + val expected = Seq( + (0, 0.0), + (1, 2.0), + (2, 1.0), + (3, 0.0), + (4, 0.0), + (5, 1.0) + ).toDF("id", "labelIndex") + + testTransformerByGlobalCheckFunc[(Int, String)](df, indexerModel, "id", "labelIndex") { rows => + val attr = Attribute.fromStructField(rows.head.schema("labelIndex")) + .asInstanceOf[NominalAttribute] + assert(attr.values.get === Array("a", "c", "b")) + assert(rows.seq === expected.collect().toSeq) + } } test("StringIndexerUnseen") { @@ -70,36 +71,38 @@ class StringIndexerSuite .setInputCol("label") .setOutputCol("labelIndex") .fit(df) + // Verify we throw by default with unseen values - intercept[SparkException] { - indexer.transform(df2).collect() - } + testTransformerByInterceptingException[(Int, String)]( + df2, + indexer, + "Unseen label:", + "labelIndex") - indexer.setHandleInvalid("skip") // Verify that we skip the c record - val transformedSkip = indexer.transform(df2) - val attrSkip = Attribute.fromStructField(transformedSkip.schema("labelIndex")) - .asInstanceOf[NominalAttribute] - assert(attrSkip.values.get === Array("b", "a")) - val outputSkip = transformedSkip.select("id", "labelIndex").rdd.map { r => - (r.getInt(0), r.getDouble(1)) - }.collect().toSet // a -> 1, b -> 0 - val expectedSkip = Set((0, 1.0), (1, 0.0)) - assert(outputSkip === expectedSkip) + indexer.setHandleInvalid("skip") + + val expectedSkip = Seq((0, 1.0), (1, 0.0)).toDF() + testTransformerByGlobalCheckFunc[(Int, String)](df2, indexer, "id", "labelIndex") { rows => + val attrSkip = Attribute.fromStructField(rows.head.schema("labelIndex")) + .asInstanceOf[NominalAttribute] + assert(attrSkip.values.get === Array("b", "a")) + assert(rows.seq === expectedSkip.collect().toSeq) + } indexer.setHandleInvalid("keep") - // Verify that we keep the unseen records - val transformedKeep = indexer.transform(df2) - val attrKeep = Attribute.fromStructField(transformedKeep.schema("labelIndex")) - .asInstanceOf[NominalAttribute] - assert(attrKeep.values.get === Array("b", "a", "__unknown")) - val outputKeep = transformedKeep.select("id", "labelIndex").rdd.map { r => - (r.getInt(0), r.getDouble(1)) - }.collect().toSet + // a -> 1, b -> 0, c -> 2, d -> 3 - val expectedKeep = Set((0, 1.0), (1, 0.0), (2, 2.0), (3, 2.0)) - assert(outputKeep === expectedKeep) + val expectedKeep = Seq((0, 1.0), (1, 0.0), (2, 2.0), (3, 2.0)).toDF() + + // Verify that we keep the unseen records + testTransformerByGlobalCheckFunc[(Int, String)](df2, indexer, "id", "labelIndex") { rows => + val attrKeep = Attribute.fromStructField(rows.head.schema("labelIndex")) + .asInstanceOf[NominalAttribute] + assert(attrKeep.values.get === Array("b", "a", "__unknown")) + assert(rows === expectedKeep.collect().toSeq) + } } test("StringIndexer with a numeric input column") { @@ -109,16 +112,14 @@ class StringIndexerSuite .setInputCol("label") .setOutputCol("labelIndex") .fit(df) - val transformed = indexer.transform(df) - val attr = Attribute.fromStructField(transformed.schema("labelIndex")) - .asInstanceOf[NominalAttribute] - assert(attr.values.get === Array("100", "300", "200")) - val output = transformed.select("id", "labelIndex").rdd.map { r => - (r.getInt(0), r.getDouble(1)) - }.collect().toSet // 100 -> 0, 
200 -> 2, 300 -> 1 - val expected = Set((0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)) - assert(output === expected) + val expected = Seq((0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)).toDF() + testTransformerByGlobalCheckFunc[(Int, String)](df, indexer, "id", "labelIndex") { rows => + val attr = Attribute.fromStructField(rows.head.schema("labelIndex")) + .asInstanceOf[NominalAttribute] + assert(attr.values.get === Array("100", "300", "200")) + assert(rows === expected.collect().toSeq) + } } test("StringIndexer with NULLs") { @@ -133,37 +134,36 @@ class StringIndexerSuite withClue("StringIndexer should throw error when setHandleInvalid=error " + "when given NULL values") { - intercept[SparkException] { - indexer.setHandleInvalid("error") - indexer.fit(df).transform(df2).collect() - } + indexer.setHandleInvalid("error") + testTransformerByInterceptingException[(Int, String)]( + df2, + indexer.fit(df), + "StringIndexer encountered NULL value.", + "labelIndex") } indexer.setHandleInvalid("skip") - val transformedSkip = indexer.fit(df).transform(df2) - val attrSkip = Attribute - .fromStructField(transformedSkip.schema("labelIndex")) - .asInstanceOf[NominalAttribute] - assert(attrSkip.values.get === Array("b", "a")) - val outputSkip = transformedSkip.select("id", "labelIndex").rdd.map { r => - (r.getInt(0), r.getDouble(1)) - }.collect().toSet + val modelSkip = indexer.fit(df) // a -> 1, b -> 0 - val expectedSkip = Set((0, 1.0), (1, 0.0)) - assert(outputSkip === expectedSkip) + val expectedSkip = Seq((0, 1.0), (1, 0.0)).toDF() + testTransformerByGlobalCheckFunc[(Int, String)](df2, modelSkip, "id", "labelIndex") { rows => + val attrSkip = + Attribute.fromStructField(rows.head.schema("labelIndex")).asInstanceOf[NominalAttribute] + assert(attrSkip.values.get === Array("b", "a")) + assert(rows === expectedSkip.collect().toSeq) + } indexer.setHandleInvalid("keep") - val transformedKeep = indexer.fit(df).transform(df2) - val attrKeep = Attribute - .fromStructField(transformedKeep.schema("labelIndex")) - .asInstanceOf[NominalAttribute] - assert(attrKeep.values.get === Array("b", "a", "__unknown")) - val outputKeep = transformedKeep.select("id", "labelIndex").rdd.map { r => - (r.getInt(0), r.getDouble(1)) - }.collect().toSet // a -> 1, b -> 0, null -> 2 - val expectedKeep = Set((0, 1.0), (1, 0.0), (3, 2.0)) - assert(outputKeep === expectedKeep) + val expectedKeep = Seq((0, 1.0), (1, 0.0), (3, 2.0)).toDF() + val modelKeep = indexer.fit(df) + testTransformerByGlobalCheckFunc[(Int, String)](df2, modelKeep, "id", "labelIndex") { rows => + val attrKeep = Attribute + .fromStructField(rows.head.schema("labelIndex")) + .asInstanceOf[NominalAttribute] + assert(attrKeep.values.get === Array("b", "a", "__unknown")) + assert(rows === expectedKeep.collect().toSeq) + } } test("StringIndexerModel should keep silent if the input column does not exist.") { @@ -171,7 +171,9 @@ class StringIndexerSuite .setInputCol("label") .setOutputCol("labelIndex") val df = spark.range(0L, 10L).toDF() - assert(indexerModel.transform(df).collect().toSet === df.collect().toSet) + testTransformerByGlobalCheckFunc[Long](df, indexerModel, "id") { rows => + assert(rows.toSet === df.collect().toSet) + } } test("StringIndexerModel can't overwrite output column") { @@ -188,9 +190,12 @@ class StringIndexerSuite .setOutputCol("indexedInput") .fit(df) - intercept[IllegalArgumentException] { - indexer.setOutputCol("output").transform(df) - } + testTransformerByInterceptingException[(Int, String)]( + df, + 
indexer.setOutputCol("output"), + "Output column output already exists.", + "labelIndex") + } test("StringIndexer read/write") { @@ -223,7 +228,8 @@ class StringIndexerSuite .setInputCol("index") .setOutputCol("actual") .setLabels(labels) - idxToStr0.transform(df0).select("actual", "expected").collect().foreach { + + testTransformer[(Int, String)](df0, idxToStr0, "actual", "expected") { case Row(actual, expected) => assert(actual === expected) } @@ -234,7 +240,8 @@ class StringIndexerSuite val idxToStr1 = new IndexToString() .setInputCol("indexWithAttr") .setOutputCol("actual") - idxToStr1.transform(df1).select("actual", "expected").collect().foreach { + + testTransformer[(Int, String)](df1, idxToStr1, "actual", "expected") { case Row(actual, expected) => assert(actual === expected) } @@ -252,9 +259,10 @@ class StringIndexerSuite .setInputCol("labelIndex") .setOutputCol("sameLabel") .setLabels(indexer.labels) - idx2str.transform(transformed).select("label", "sameLabel").collect().foreach { - case Row(a: String, b: String) => - assert(a === b) + + testTransformer[(Int, String, Double)](transformed, idx2str, "sameLabel", "label") { + case Row(sameLabel, label) => + assert(sameLabel === label) } } @@ -286,10 +294,11 @@ class StringIndexerSuite .setInputCol("label") .setOutputCol("labelIndex") .fit(df) - val transformed = indexer.transform(df) - val attrs = - NominalAttribute.decodeStructField(transformed.schema("labelIndex"), preserveName = true) - assert(attrs.name.nonEmpty && attrs.name.get === "labelIndex") + testTransformerByGlobalCheckFunc[(Int, String)](df, indexer, "labelIndex") { rows => + val attrs = + NominalAttribute.decodeStructField(rows.head.schema("labelIndex"), preserveName = true) + assert(attrs.name.nonEmpty && attrs.name.get === "labelIndex") + } } test("StringIndexer order types") { @@ -299,18 +308,17 @@ class StringIndexerSuite .setInputCol("label") .setOutputCol("labelIndex") - val expected = Seq(Set((0, 0.0), (1, 0.0), (2, 2.0), (3, 1.0), (4, 1.0), (5, 0.0)), - Set((0, 2.0), (1, 2.0), (2, 0.0), (3, 1.0), (4, 1.0), (5, 2.0)), - Set((0, 1.0), (1, 1.0), (2, 0.0), (3, 2.0), (4, 2.0), (5, 1.0)), - Set((0, 1.0), (1, 1.0), (2, 2.0), (3, 0.0), (4, 0.0), (5, 1.0))) + val expected = Seq(Seq((0, 0.0), (1, 0.0), (2, 2.0), (3, 1.0), (4, 1.0), (5, 0.0)), + Seq((0, 2.0), (1, 2.0), (2, 0.0), (3, 1.0), (4, 1.0), (5, 2.0)), + Seq((0, 1.0), (1, 1.0), (2, 0.0), (3, 2.0), (4, 2.0), (5, 1.0)), + Seq((0, 1.0), (1, 1.0), (2, 2.0), (3, 0.0), (4, 0.0), (5, 1.0))) var idx = 0 for (orderType <- StringIndexer.supportedStringOrderType) { - val transformed = indexer.setStringOrderType(orderType).fit(df).transform(df) - val output = transformed.select("id", "labelIndex").rdd.map { r => - (r.getInt(0), r.getDouble(1)) - }.collect().toSet - assert(output === expected(idx)) + val model = indexer.setStringOrderType(orderType).fit(df) + testTransformerByGlobalCheckFunc[(Int, String)](df, model, "id", "labelIndex") { rows => + assert(rows === expected(idx).toDF().collect().toSeq) + } idx += 1 } } @@ -328,7 +336,11 @@ class StringIndexerSuite .setOutputCol("CITYIndexed") .fit(dfNoBristol) - val dfWithIndex = model.transform(dfNoBristol) - assert(dfWithIndex.filter($"CITYIndexed" === 1.0).count == 1) + testTransformerByGlobalCheckFunc[(String, String, String)]( + dfNoBristol, + model, + "CITYIndexed") { rows => + assert(rows.toList.count(_.getDouble(0) == 1.0) === 1) + } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala index c895659a2d8be..be59b0af2c78e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala @@ -19,16 +19,14 @@ package org.apache.spark.ml.feature import scala.beans.BeanInfo -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.DefaultReadWriteTest -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} +import org.apache.spark.sql.{DataFrame, Row} @BeanInfo case class TokenizerTestData(rawText: String, wantedTokens: Array[String]) -class TokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class TokenizerSuite extends MLTest with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new Tokenizer) @@ -42,12 +40,17 @@ class TokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defau } } -class RegexTokenizerSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class RegexTokenizerSuite extends MLTest with DefaultReadWriteTest { - import org.apache.spark.ml.feature.RegexTokenizerSuite._ import testImplicits._ + def testRegexTokenizer(t: RegexTokenizer, dataframe: DataFrame): Unit = { + testTransformer[(String, Seq[String])](dataframe, t, "tokens", "wantedTokens") { + case Row(tokens, wantedTokens) => + assert(tokens === wantedTokens) + } + } + test("params") { ParamsSuite.checkParams(new RegexTokenizer) } @@ -105,14 +108,3 @@ class RegexTokenizerSuite } } -object RegexTokenizerSuite extends SparkFunSuite { - - def testRegexTokenizer(t: RegexTokenizer, dataset: Dataset[_]): Unit = { - t.transform(dataset) - .select("tokens", "wantedTokens") - .collect() - .foreach { case Row(tokens, wantedTokens) => - assert(tokens === wantedTokens) - } - } -} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala index 69a7b75e32eb7..e5675e31bbecf 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala @@ -19,18 +19,16 @@ package org.apache.spark.ml.feature import scala.beans.{BeanInfo, BeanProperty} -import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.ml.attribute._ import org.apache.spark.ml.linalg.{SparseVector, Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.{DataFrame, Row} -class VectorIndexerSuite extends SparkFunSuite with MLlibTestSparkContext - with DefaultReadWriteTest with Logging { +class VectorIndexerSuite extends MLTest with DefaultReadWriteTest with Logging { import testImplicits._ import VectorIndexerSuite.FeatureData @@ -128,18 +126,27 @@ class VectorIndexerSuite extends SparkFunSuite with MLlibTestSparkContext MLTestingUtils.checkCopyAndUids(vectorIndexer, model) - model.transform(densePoints1) // should work - 
model.transform(sparsePoints1) // should work + testTransformer[FeatureData](densePoints1, model, "indexed") { _ => } + testTransformer[FeatureData](sparsePoints1, model, "indexed") { _ => } + // If the data is local Dataset, it throws AssertionError directly. - intercept[AssertionError] { - model.transform(densePoints2).collect() - logInfo("Did not throw error when fit, transform were called on vectors of different lengths") + withClue("Did not throw error when fit, transform were called on " + + "vectors of different lengths") { + testTransformerByInterceptingException[FeatureData]( + densePoints2, + model, + "VectorIndexerModel expected vector of length 3 but found length 4", + "indexed") } // If the data is distributed Dataset, it throws SparkException // which is the wrapper of AssertionError. - intercept[SparkException] { - model.transform(densePoints2.repartition(2)).collect() - logInfo("Did not throw error when fit, transform were called on vectors of different lengths") + withClue("Did not throw error when fit, transform were called " + + "on vectors of different lengths") { + testTransformerByInterceptingException[FeatureData]( + densePoints2.repartition(2), + model, + "VectorIndexerModel expected vector of length 3 but found length 4", + "indexed") } intercept[SparkException] { vectorIndexer.fit(badPoints) @@ -178,46 +185,48 @@ class VectorIndexerSuite extends SparkFunSuite with MLlibTestSparkContext val categoryMaps = model.categoryMaps // Chose correct categorical features assert(categoryMaps.keys.toSet === categoricalFeatures) - val transformed = model.transform(data).select("indexed") - val indexedRDD: RDD[Vector] = transformed.rdd.map(_.getAs[Vector](0)) - val featureAttrs = AttributeGroup.fromStructField(transformed.schema("indexed")) - assert(featureAttrs.name === "indexed") - assert(featureAttrs.attributes.get.length === model.numFeatures) - categoricalFeatures.foreach { feature: Int => - val origValueSet = collectedData.map(_(feature)).toSet - val targetValueIndexSet = Range(0, origValueSet.size).toSet - val catMap = categoryMaps(feature) - assert(catMap.keys.toSet === origValueSet) // Correct categories - assert(catMap.values.toSet === targetValueIndexSet) // Correct category indices - if (origValueSet.contains(0.0)) { - assert(catMap(0.0) === 0) // value 0 gets index 0 - } - // Check transformed data - assert(indexedRDD.map(_(feature)).collect().toSet === targetValueIndexSet) - // Check metadata - val featureAttr = featureAttrs(feature) - assert(featureAttr.index.get === feature) - featureAttr match { - case attr: BinaryAttribute => - assert(attr.values.get === origValueSet.toArray.sorted.map(_.toString)) - case attr: NominalAttribute => - assert(attr.values.get === origValueSet.toArray.sorted.map(_.toString)) - assert(attr.isOrdinal.get === false) - case _ => - throw new RuntimeException(errMsg + s". Categorical feature $feature failed" + - s" metadata check. 
Found feature attribute: $featureAttr.") + testTransformerByGlobalCheckFunc[FeatureData](data, model, "indexed") { rows => + val transformed = rows.map { r => Tuple1(r.getAs[Vector](0)) }.toDF("indexed") + val indexedRDD: RDD[Vector] = transformed.rdd.map(_.getAs[Vector](0)) + val featureAttrs = AttributeGroup.fromStructField(rows.head.schema("indexed")) + assert(featureAttrs.name === "indexed") + assert(featureAttrs.attributes.get.length === model.numFeatures) + categoricalFeatures.foreach { feature: Int => + val origValueSet = collectedData.map(_(feature)).toSet + val targetValueIndexSet = Range(0, origValueSet.size).toSet + val catMap = categoryMaps(feature) + assert(catMap.keys.toSet === origValueSet) // Correct categories + assert(catMap.values.toSet === targetValueIndexSet) // Correct category indices + if (origValueSet.contains(0.0)) { + assert(catMap(0.0) === 0) // value 0 gets index 0 + } + // Check transformed data + assert(indexedRDD.map(_(feature)).collect().toSet === targetValueIndexSet) + // Check metadata + val featureAttr = featureAttrs(feature) + assert(featureAttr.index.get === feature) + featureAttr match { + case attr: BinaryAttribute => + assert(attr.values.get === origValueSet.toArray.sorted.map(_.toString)) + case attr: NominalAttribute => + assert(attr.values.get === origValueSet.toArray.sorted.map(_.toString)) + assert(attr.isOrdinal.get === false) + case _ => + throw new RuntimeException(errMsg + s". Categorical feature $feature failed" + + s" metadata check. Found feature attribute: $featureAttr.") + } } - } - // Check numerical feature metadata. - Range(0, model.numFeatures).filter(feature => !categoricalFeatures.contains(feature)) - .foreach { feature: Int => - val featureAttr = featureAttrs(feature) - featureAttr match { - case attr: NumericAttribute => - assert(featureAttr.index.get === feature) - case _ => - throw new RuntimeException(errMsg + s". Numerical feature $feature failed" + - s" metadata check. Found feature attribute: $featureAttr.") + // Check numerical feature metadata. + Range(0, model.numFeatures).filter(feature => !categoricalFeatures.contains(feature)) + .foreach { feature: Int => + val featureAttr = featureAttrs(feature) + featureAttr match { + case attr: NumericAttribute => + assert(featureAttr.index.get === feature) + case _ => + throw new RuntimeException(errMsg + s". Numerical feature $feature failed" + + s" metadata check. 
Found feature attribute: $featureAttr.") + } } } } catch { @@ -236,25 +245,32 @@ class VectorIndexerSuite extends SparkFunSuite with MLlibTestSparkContext (sparsePoints1, sparsePoints1TestInvalid))) { val vectorIndexer = getIndexer.setMaxCategories(4).setHandleInvalid("error") val model = vectorIndexer.fit(points) - intercept[SparkException] { - model.transform(pointsTestInvalid).collect() - } + testTransformerByInterceptingException[FeatureData]( + pointsTestInvalid, + model, + "VectorIndexer encountered invalid value", + "indexed") val vectorIndexer1 = getIndexer.setMaxCategories(4).setHandleInvalid("skip") val model1 = vectorIndexer1.fit(points) - val invalidTransformed1 = model1.transform(pointsTestInvalid).select("indexed") - .collect().map(_(0)) - val transformed1 = model1.transform(points).select("indexed").collect().map(_(0)) - assert(transformed1 === invalidTransformed1) - + val expected = Seq( + Vectors.dense(1.0, 2.0, 0.0), + Vectors.dense(0.0, 1.0, 2.0), + Vectors.dense(0.0, 0.0, 1.0), + Vectors.dense(1.0, 3.0, 2.0)) + testTransformerByGlobalCheckFunc[FeatureData](pointsTestInvalid, model1, "indexed") { rows => + assert(rows.map(_(0)) == expected) + } + testTransformerByGlobalCheckFunc[FeatureData](points, model1, "indexed") { rows => + assert(rows.map(_(0)) == expected) + } val vectorIndexer2 = getIndexer.setMaxCategories(4).setHandleInvalid("keep") val model2 = vectorIndexer2.fit(points) - val invalidTransformed2 = model2.transform(pointsTestInvalid).select("indexed") - .collect().map(_(0)) - assert(invalidTransformed2 === transformed1 ++ Array( - Vectors.dense(2.0, 2.0, 0.0), - Vectors.dense(0.0, 4.0, 2.0), - Vectors.dense(1.0, 3.0, 3.0)) - ) + testTransformerByGlobalCheckFunc[FeatureData](pointsTestInvalid, model2, "indexed") { rows => + assert(rows.map(_(0)) == expected ++ Array( + Vectors.dense(2.0, 2.0, 0.0), + Vectors dense(0.0, 4.0, 2.0), + Vectors.dense(1.0, 3.0, 3.0))) + } } } @@ -263,12 +279,12 @@ class VectorIndexerSuite extends SparkFunSuite with MLlibTestSparkContext val points = data.collect().map(_.getAs[Vector](0)) val vectorIndexer = getIndexer.setMaxCategories(maxCategories) val model = vectorIndexer.fit(data) - val indexedPoints = - model.transform(data).select("indexed").rdd.map(_.getAs[Vector](0)).collect() - points.zip(indexedPoints).foreach { - case (orig: SparseVector, indexed: SparseVector) => - assert(orig.indices.length == indexed.indices.length) - case _ => throw new UnknownError("Unit test has a bug in it.") // should never happen + testTransformerByGlobalCheckFunc[FeatureData](data, model, "indexed") { rows => + points.zip(rows.map(_(0))).foreach { + case (orig: SparseVector, indexed: SparseVector) => + assert(orig.indices.length == indexed.indices.length) + case _ => throw new UnknownError("Unit test has a bug in it.") // should never happen + } } } checkSparsity(sparsePoints1, maxCategories = 2) @@ -286,17 +302,18 @@ class VectorIndexerSuite extends SparkFunSuite with MLlibTestSparkContext val vectorIndexer = getIndexer.setMaxCategories(2) val model = vectorIndexer.fit(densePoints1WithMeta) // Check that ML metadata are preserved. 
- val indexedPoints = model.transform(densePoints1WithMeta) - val transAttributes: Array[Attribute] = - AttributeGroup.fromStructField(indexedPoints.schema("indexed")).attributes.get - featureAttributes.zip(transAttributes).foreach { case (orig, trans) => - assert(orig.name === trans.name) - (orig, trans) match { - case (orig: NumericAttribute, trans: NumericAttribute) => - assert(orig.max.nonEmpty && orig.max === trans.max) - case _ => + testTransformerByGlobalCheckFunc[FeatureData](densePoints1WithMeta, model, "indexed") { rows => + val transAttributes: Array[Attribute] = + AttributeGroup.fromStructField(rows.head.schema("indexed")).attributes.get + featureAttributes.zip(transAttributes).foreach { case (orig, trans) => + assert(orig.name === trans.name) + (orig, trans) match { + case (orig: NumericAttribute, trans: NumericAttribute) => + assert(orig.max.nonEmpty && orig.max === trans.max) + case _ => // do nothing // TODO: Once input features marked as categorical are handled correctly, check that here. + } } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorSizeHintSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorSizeHintSuite.scala index f6c9a76599fae..d89d10b320d84 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorSizeHintSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorSizeHintSuite.scala @@ -17,17 +17,15 @@ package org.apache.spark.ml.feature -import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.{Vector, Vectors} -import org.apache.spark.ml.util.DefaultReadWriteTest -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.streaming.StreamTest class VectorSizeHintSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -40,16 +38,23 @@ class VectorSizeHintSuite val data = Seq((Vectors.dense(1, 2), 0)).toDF("vector", "intValue") val noSizeTransformer = new VectorSizeHint().setInputCol("vector") - intercept[NoSuchElementException] (noSizeTransformer.transform(data)) + testTransformerByInterceptingException[(Vector, Int)]( + data, + noSizeTransformer, + "Failed to find a default value for size", + "vector") intercept[NoSuchElementException] (noSizeTransformer.transformSchema(data.schema)) val noInputColTransformer = new VectorSizeHint().setSize(2) - intercept[NoSuchElementException] (noInputColTransformer.transform(data)) + testTransformerByInterceptingException[(Vector, Int)]( + data, + noInputColTransformer, + "Failed to find a default value for inputCol", + "vector") intercept[NoSuchElementException] (noInputColTransformer.transformSchema(data.schema)) } test("Adding size to column of vectors.") { - val size = 3 val vectorColName = "vector" val denseVector = Vectors.dense(1, 2, 3) @@ -66,12 +71,15 @@ class VectorSizeHintSuite .setInputCol(vectorColName) .setSize(size) .setHandleInvalid(handleInvalid) - val withSize = transformer.transform(dataFrame) - assert( - AttributeGroup.fromStructField(withSize.schema(vectorColName)).size == size, - "Transformer did not add expected size data.") - val numRows = withSize.collect().length - assert(numRows === data.length, s"Expecting ${data.length} rows, got $numRows.") + 
testTransformerByGlobalCheckFunc[Tuple1[Vector]](dataFrame, transformer, vectorColName) { + rows => { + assert( + AttributeGroup.fromStructField(rows.head.schema(vectorColName)).size == size, + "Transformer did not add expected size data.") + val numRows = rows.length + assert(numRows === data.length, s"Expecting ${data.length} rows, got $numRows.") + } + } } } @@ -93,14 +101,16 @@ class VectorSizeHintSuite .setInputCol(vectorColName) .setSize(size) .setHandleInvalid(handleInvalid) - val withSize = transformer.transform(dataFrameWithMetadata) - - val newGroup = AttributeGroup.fromStructField(withSize.schema(vectorColName)) - assert(newGroup.size === size, "Column has incorrect size metadata.") - assert( - newGroup.attributes.get === group.attributes.get, - "VectorSizeHint did not preserve attributes.") - withSize.collect + testTransformerByGlobalCheckFunc[(Int, Int, Int, Vector)]( + dataFrameWithMetadata, + transformer, + vectorColName) { rows => + val newGroup = AttributeGroup.fromStructField(rows.head.schema(vectorColName)) + assert(newGroup.size === size, "Column has incorrect size metadata.") + assert( + newGroup.attributes.get === group.attributes.get, + "VectorSizeHint did not preserve attributes.") + } } } @@ -120,7 +130,11 @@ class VectorSizeHintSuite .setInputCol(vectorColName) .setSize(size) .setHandleInvalid(handleInvalid) - intercept[IllegalArgumentException](transformer.transform(dataFrameWithMetadata)) + testTransformerByInterceptingException[(Int, Int, Int, Vector)]( + dataFrameWithMetadata, + transformer, + "Trying to set size of vectors in `vector` to 4 but size already set to 3.", + vectorColName) } } @@ -136,18 +150,36 @@ class VectorSizeHintSuite .setHandleInvalid("error") .setSize(3) - intercept[SparkException](sizeHint.transform(dataWithNull).collect()) - intercept[SparkException](sizeHint.transform(dataWithShort).collect()) + testTransformerByInterceptingException[Tuple1[Vector]]( + dataWithNull, + sizeHint, + "Got null vector in VectorSizeHint", + "vector") + + testTransformerByInterceptingException[Tuple1[Vector]]( + dataWithShort, + sizeHint, + "VectorSizeHint Expecting a vector of size 3 but got 1", + "vector") sizeHint.setHandleInvalid("skip") - assert(sizeHint.transform(dataWithNull).count() === 1) - assert(sizeHint.transform(dataWithShort).count() === 1) + testTransformerByGlobalCheckFunc[Tuple1[Vector]](dataWithNull, sizeHint, "vector") { rows => + assert(rows.length === 1) + } + testTransformerByGlobalCheckFunc[Tuple1[Vector]](dataWithShort, sizeHint, "vector") { rows => + assert(rows.length === 1) + } sizeHint.setHandleInvalid("optimistic") - assert(sizeHint.transform(dataWithNull).count() === 2) - assert(sizeHint.transform(dataWithShort).count() === 2) + testTransformerByGlobalCheckFunc[Tuple1[Vector]](dataWithNull, sizeHint, "vector") { rows => + assert(rows.length === 2) + } + testTransformerByGlobalCheckFunc[Tuple1[Vector]](dataWithShort, sizeHint, "vector") { rows => + assert(rows.length === 2) + } } + test("read/write") { val sizeHint = new VectorSizeHint() .setInputCol("myInputCol") diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorSlicerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorSlicerSuite.scala index 1746ce53107c4..3d90f9d9ac764 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorSlicerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorSlicerSuite.scala @@ -17,16 +17,16 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite import 
org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.DefaultReadWriteTest -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} +import org.apache.spark.sql.Row import org.apache.spark.sql.types.{StructField, StructType} -class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class VectorSlicerSuite extends MLTest with DefaultReadWriteTest { + + import testImplicits._ test("params") { val slicer = new VectorSlicer().setInputCol("feature") @@ -84,12 +84,12 @@ class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with De val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result") - def validateResults(df: DataFrame): Unit = { - df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) => + def validateResults(rows: Seq[Row]): Unit = { + rows.foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 === vec2) } - val resultMetadata = AttributeGroup.fromStructField(df.schema("result")) - val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected")) + val resultMetadata = AttributeGroup.fromStructField(rows.head.schema("result")) + val expectedMetadata = AttributeGroup.fromStructField(rows.head.schema("expected")) assert(resultMetadata.numAttributes === expectedMetadata.numAttributes) resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) => assert(a === b) @@ -97,13 +97,16 @@ class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with De } vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty) - validateResults(vectorSlicer.transform(df)) + testTransformerByGlobalCheckFunc[(Vector, Vector)](df, vectorSlicer, "result", "expected")( + validateResults) vectorSlicer.setIndices(Array(1)).setNames(Array("f4")) - validateResults(vectorSlicer.transform(df)) + testTransformerByGlobalCheckFunc[(Vector, Vector)](df, vectorSlicer, "result", "expected")( + validateResults) vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4")) - validateResults(vectorSlicer.transform(df)) + testTransformerByGlobalCheckFunc[(Vector, Vector)](df, vectorSlicer, "result", "expected")( + validateResults) } test("read/write") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala index 10682ba176aca..b59c4e7967338 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala @@ -17,17 +17,17 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.feature.{Word2VecModel => OldWord2VecModel} -import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row import org.apache.spark.util.Utils -class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class 
Word2VecSuite extends MLTest with DefaultReadWriteTest { + + import testImplicits._ test("params") { ParamsSuite.checkParams(new Word2Vec) @@ -36,10 +36,6 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul } test("Word2Vec") { - - val spark = this.spark - import spark.implicits._ - val sentence = "a b " * 100 + "a c " * 10 val numOfWords = sentence.split(" ").size val doc = sc.parallelize(Seq(sentence, sentence)).map(line => line.split(" ")) @@ -70,17 +66,13 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul // These expectations are just magic values, characterizing the current // behavior. The test needs to be updated to be more general, see SPARK-11502 val magicExp = Vectors.dense(0.30153007534417237, -0.6833061711354689, 0.5116530778733167) - model.transform(docDF).select("result", "expected").collect().foreach { + testTransformer[(Seq[String], Vector)](docDF, model, "result", "expected") { case Row(vector1: Vector, vector2: Vector) => assert(vector1 ~== magicExp absTol 1E-5, "Transformed vector is different with expected.") } } test("getVectors") { - - val spark = this.spark - import spark.implicits._ - val sentence = "a b " * 100 + "a c " * 10 val doc = sc.parallelize(Seq(sentence, sentence)).map(line => line.split(" ")) @@ -119,9 +111,6 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul test("findSynonyms") { - val spark = this.spark - import spark.implicits._ - val sentence = "a b " * 100 + "a c " * 10 val doc = sc.parallelize(Seq(sentence, sentence)).map(line => line.split(" ")) val docDF = doc.zip(doc).toDF("text", "alsotext") @@ -154,9 +143,6 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul test("window size") { - val spark = this.spark - import spark.implicits._ - val sentence = "a q s t q s t b b b s t m s t m q " * 100 + "a c " * 10 val doc = sc.parallelize(Seq(sentence, sentence)).map(line => line.split(" ")) val docDF = doc.zip(doc).toDF("text", "alsotext") @@ -227,8 +213,6 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul } test("Word2Vec works with input that is non-nullable (NGram)") { - val spark = this.spark - import spark.implicits._ val sentence = "a q s t q s t b b b s t m s t m q " val docDF = sc.parallelize(Seq(sentence, sentence)).map(_.split(" ")).toDF("text") @@ -243,7 +227,7 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul .fit(ngramDF) // Just test that this transformation succeeds - model.transform(ngramDF).collect() + testTransformerByGlobalCheckFunc[(Seq[String], Seq[String])](ngramDF, model, "result") { _ => } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/MLTest.scala b/mllib/src/test/scala/org/apache/spark/ml/util/MLTest.scala index 17678aa611a48..795fd0e2ac0e4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/util/MLTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/util/MLTest.scala @@ -22,9 +22,10 @@ import java.io.File import org.scalatest.Suite import org.apache.spark.SparkContext -import org.apache.spark.ml.{PipelineModel, Transformer} +import org.apache.spark.ml.Transformer import org.apache.spark.sql.{DataFrame, Encoder, Row} import org.apache.spark.sql.execution.streaming.MemoryStream +import org.apache.spark.sql.functions.col import org.apache.spark.sql.streaming.StreamTest import org.apache.spark.sql.test.TestSparkSession import org.apache.spark.util.Utils @@ -62,8 +63,10 @@ trait MLTest extends StreamTest with 
TempDirectory { self: Suite => val columnNames = dataframe.schema.fieldNames val stream = MemoryStream[A] - val streamDF = stream.toDS().toDF(columnNames: _*) - + val columnsWithMetadata = dataframe.schema.map { structField => + col(structField.name).as(structField.name, structField.metadata) + } + val streamDF = stream.toDS().toDF(columnNames: _*).select(columnsWithMetadata: _*) val data = dataframe.as[A].collect() val streamOutput = transformer.transform(streamDF) @@ -108,5 +111,29 @@ trait MLTest extends StreamTest with TempDirectory { self: Suite => otherResultCols: _*)(globalCheckFunction) testTransformerOnDF(dataframe, transformer, firstResultCol, otherResultCols: _*)(globalCheckFunction) + } + + def testTransformerByInterceptingException[A : Encoder]( + dataframe: DataFrame, + transformer: Transformer, + expectedMessagePart : String, + firstResultCol: String) { + + def hasExpectedMessage(exception: Throwable): Boolean = + exception.getMessage.contains(expectedMessagePart) || + (exception.getCause != null && exception.getCause.getMessage.contains(expectedMessagePart)) + + withClue(s"""Expected message part "${expectedMessagePart}" is not found in DF test.""") { + val exceptionOnDf = intercept[Throwable] { + testTransformerOnDF(dataframe, transformer, firstResultCol)(_ => Unit) + } + assert(hasExpectedMessage(exceptionOnDf)) + } + withClue(s"""Expected message part "${expectedMessagePart}" is not found in stream test.""") { + val exceptionOnStreamData = intercept[Throwable] { + testTransformerOnStreamData(dataframe, transformer, firstResultCol)(_ => Unit) + } + assert(hasExpectedMessage(exceptionOnStreamData)) + } } } From a9d0784e6733666a0608e8236322f1dc380e96b7 Mon Sep 17 00:00:00 2001 From: smallory Date: Thu, 15 Mar 2018 11:58:54 +0900 Subject: [PATCH 349/566] [SPARK-23642][DOCS] AccumulatorV2 subclass isZero scaladoc fix Added/corrected scaladoc for isZero on the DoubleAccumulator, CollectionAccumulator, and LongAccumulator subclasses of AccumulatorV2, particularly noting where there are requirements in addition to having a value of zero in order to return true. ## What changes were proposed in this pull request? Three scaladoc comments are updated in AccumulatorV2.scala No changes outside of comment blocks were made. ## How was this patch tested? Running "sbt unidoc", fixing style errors found, and reviewing the resulting local scaladoc in firefox. Author: smallory Closes #20790 from smallory/patch-1. (cherry picked from commit 4f5bad615b47d743b8932aea1071652293981604) Signed-off-by: hyukjinkwon --- .../main/scala/org/apache/spark/util/AccumulatorV2.scala | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala index f4a736d6d439a..0f84ea9752cf5 100644 --- a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala +++ b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala @@ -290,7 +290,8 @@ class LongAccumulator extends AccumulatorV2[jl.Long, jl.Long] { private var _count = 0L /** - * Adds v to the accumulator, i.e. increment sum by v and count by 1. + * Returns false if this accumulator has had any values added to it or the sum is non-zero. 
+ * * @since 2.0.0 */ override def isZero: Boolean = _sum == 0L && _count == 0 @@ -368,6 +369,9 @@ class DoubleAccumulator extends AccumulatorV2[jl.Double, jl.Double] { private var _sum = 0.0 private var _count = 0L + /** + * Returns false if this accumulator has had any values added to it or the sum is non-zero. + */ override def isZero: Boolean = _sum == 0.0 && _count == 0 override def copy(): DoubleAccumulator = { @@ -441,6 +445,9 @@ class DoubleAccumulator extends AccumulatorV2[jl.Double, jl.Double] { class CollectionAccumulator[T] extends AccumulatorV2[T, java.util.List[T]] { private val _list: java.util.List[T] = Collections.synchronizedList(new ArrayList[T]()) + /** + * Returns false if this accumulator instance has any values in it. + */ override def isZero: Boolean = _list.isEmpty override def copyAndReset(): CollectionAccumulator[T] = new CollectionAccumulator From 72c13ed844d6be6510ce2c5e3526c234d1d5e10f Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Thu, 15 Mar 2018 10:55:33 -0700 Subject: [PATCH 350/566] [SPARK-23695][PYTHON] Fix the error message for Kinesis streaming tests ## What changes were proposed in this pull request? This PR proposes to fix the error message for Kinesis in PySpark when its jar is missing but explicitly enabled. ```bash ENABLE_KINESIS_TESTS=1 SPARK_TESTING=1 bin/pyspark pyspark.streaming.tests ``` Before: ``` Skipped test_flume_stream (enable by setting environment variable ENABLE_FLUME_TESTS=1Skipped test_kafka_stream (enable by setting environment variable ENABLE_KAFKA_0_8_TESTS=1Traceback (most recent call last): File "/usr/local/Cellar/python/2.7.14_3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 174, in _run_module_as_main "__main__", fname, loader, pkg_name) File "/usr/local/Cellar/python/2.7.14_3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code exec code in run_globals File "/.../spark/python/pyspark/streaming/tests.py", line 1572, in % kinesis_asl_assembly_dir) + NameError: name 'kinesis_asl_assembly_dir' is not defined ``` After: ``` Skipped test_flume_stream (enable by setting environment variable ENABLE_FLUME_TESTS=1Skipped test_kafka_stream (enable by setting environment variable ENABLE_KAFKA_0_8_TESTS=1Traceback (most recent call last): File "/usr/local/Cellar/python/2.7.14_3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 174, in _run_module_as_main "__main__", fname, loader, pkg_name) File "/usr/local/Cellar/python/2.7.14_3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code exec code in run_globals File "/.../spark/python/pyspark/streaming/tests.py", line 1576, in "You need to build Spark with 'build/sbt -Pkinesis-asl " Exception: Failed to find Spark Streaming Kinesis assembly jar in /.../spark/external/kinesis-asl-assembly. You need to build Spark with 'build/sbt -Pkinesis-asl assembly/package streaming-kinesis-asl-assembly/assembly'or 'build/mvn -Pkinesis-asl package' before running this test. ``` ## How was this patch tested? Manually tested. Author: hyukjinkwon Closes #20834 from HyukjinKwon/minor-variable. 
(cherry picked from commit 56e8f48a43eb51e8582db2461a585b13a771a00a) Signed-off-by: Takuya UESHIN --- python/pyspark/streaming/tests.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 5b86c1cb2c390..ca28c9bf3bee3 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -1503,10 +1503,13 @@ def search_flume_assembly_jar(): return jars[0] -def search_kinesis_asl_assembly_jar(): +def _kinesis_asl_assembly_dir(): SPARK_HOME = os.environ["SPARK_HOME"] - kinesis_asl_assembly_dir = os.path.join(SPARK_HOME, "external/kinesis-asl-assembly") - jars = search_jar(kinesis_asl_assembly_dir, "spark-streaming-kinesis-asl-assembly") + return os.path.join(SPARK_HOME, "external/kinesis-asl-assembly") + + +def search_kinesis_asl_assembly_jar(): + jars = search_jar(_kinesis_asl_assembly_dir(), "spark-streaming-kinesis-asl-assembly") if not jars: return None elif len(jars) > 1: @@ -1569,7 +1572,7 @@ def search_kinesis_asl_assembly_jar(): else: raise Exception( ("Failed to find Spark Streaming Kinesis assembly jar in %s. " - % kinesis_asl_assembly_dir) + + % _kinesis_asl_assembly_dir()) + "You need to build Spark with 'build/sbt -Pkinesis-asl " "assembly/package streaming-kinesis-asl-assembly/assembly'" "or 'build/mvn -Pkinesis-asl package' before running this test.") From 2e1e274ed9d7a30656555e71c68e7de34a336a8a Mon Sep 17 00:00:00 2001 From: Sahil Takiar Date: Thu, 15 Mar 2018 17:04:39 -0700 Subject: [PATCH 351/566] [SPARK-23658][LAUNCHER] InProcessAppHandle uses the wrong class in getLogger ## What changes were proposed in this pull request? Changed `Logger` in `InProcessAppHandle` to use `InProcessAppHandle` instead of `ChildProcAppHandle` Author: Sahil Takiar Closes #20815 from sahilTakiar/master. (cherry picked from commit 7618896e855579f111dd92cd76794a5672a087e5) Signed-off-by: Marcelo Vanzin --- .../main/java/org/apache/spark/launcher/InProcessAppHandle.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java index 4b740d3fad20e..15fbca0facef2 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java @@ -25,7 +25,7 @@ class InProcessAppHandle extends AbstractAppHandle { private static final String THREAD_NAME_FMT = "spark-app-%d: '%s'"; - private static final Logger LOG = Logger.getLogger(ChildProcAppHandle.class.getName()); + private static final Logger LOG = Logger.getLogger(InProcessAppHandle.class.getName()); private static final AtomicLong THREAD_IDS = new AtomicLong(); // Avoid really long thread names. From 52a52d5d26fc1650e788eec62ce478c76f627470 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 15 Mar 2018 17:12:01 -0700 Subject: [PATCH 352/566] [SPARK-23671][CORE] Fix condition to enable the SHS thread pool. Author: Marcelo Vanzin Closes #20814 from vanzin/SPARK-23671. 
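Context for the hunk below: the test-mode check in `FsHistoryProvider` was inverted, so a running history server built its replay executor from the same-thread branch while only tests received the fixed-size pool. The fix flips the condition; sketched here with the names that appear in the hunk (this mirrors the patched code, not a full listing):

```scala
// Intended selection in FsHistoryProvider: a real SHS fetches and parses event
// logs on a dedicated daemon pool, while tests replay inline on the calling thread.
private val replayExecutor: ExecutorService =
  if (!Utils.isTesting) {
    ThreadUtils.newDaemonFixedThreadPool(NUM_PROCESSING_THREADS, "log-replay-executor")
  } else {
    MoreExecutors.sameThreadExecutor()
  }
```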
(cherry picked from commit 18f8575e0166c6997569358d45bdae2cf45bf624) Signed-off-by: Marcelo Vanzin --- .../org/apache/spark/deploy/history/FsHistoryProvider.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index f9d0b5ee4e23e..ace6d9e00c838 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -173,7 +173,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) * Fixed size thread pool to fetch and parse log files. */ private val replayExecutor: ExecutorService = { - if (Utils.isTesting) { + if (!Utils.isTesting) { ThreadUtils.newDaemonFixedThreadPool(NUM_PROCESSING_THREADS, "log-replay-executor") } else { MoreExecutors.sameThreadExecutor() From 99f5c0bc7a6c77917b4ccd498724b8ccc0c21473 Mon Sep 17 00:00:00 2001 From: Ye Zhou Date: Thu, 15 Mar 2018 17:15:53 -0700 Subject: [PATCH 353/566] [SPARK-23608][CORE][WEBUI] Add synchronization in SHS between attachSparkUI and detachSparkUI functions to avoid concurrent modification issue to Jetty Handlers Jetty handlers are dynamically attached/detached while SHS is running. But the attach and detach operations might be taking place at the same time due to the async in load/clear in Guava Cache. ## What changes were proposed in this pull request? Add synchronization between attachSparkUI and detachSparkUI in SHS. ## How was this patch tested? With this patch, the jetty handlers missing issue never happens again in our production cluster SHS. Author: Ye Zhou Closes #20744 from zhouyejoe/SPARK-23608. (cherry picked from commit 3675af7247e841e9a689666dc20891ba55c612b3) Signed-off-by: Marcelo Vanzin --- .../apache/spark/deploy/history/HistoryServer.scala | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index 0ec4afad0308c..611fa563a7cd9 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -150,14 +150,18 @@ class HistoryServer( ui: SparkUI, completed: Boolean) { assert(serverInfo.isDefined, "HistoryServer must be bound before attaching SparkUIs") - ui.getHandlers.foreach(attachHandler) - addFilters(ui.getHandlers, conf) + handlers.synchronized { + ui.getHandlers.foreach(attachHandler) + addFilters(ui.getHandlers, conf) + } } /** Detach a reconstructed UI from this server. Only valid after bind(). 
*/ override def detachSparkUI(appId: String, attemptId: Option[String], ui: SparkUI): Unit = { assert(serverInfo.isDefined, "HistoryServer must be bound before detaching SparkUIs") - ui.getHandlers.foreach(detachHandler) + handlers.synchronized { + ui.getHandlers.foreach(detachHandler) + } provider.onUIDetached(appId, attemptId, ui) } From d9e1f7040092aa4aeab9bdb82f0c28a292c90609 Mon Sep 17 00:00:00 2001 From: myroslavlisniak Date: Thu, 15 Mar 2018 17:20:17 -0700 Subject: [PATCH 354/566] [SPARK-23670][SQL] Fix memory leak on SparkPlanGraphWrapper Clean up SparkPlanGraphWrapper objects from InMemoryStore together with cleaning up SQLExecutionUIData existing unit test was extended to check also SparkPlanGraphWrapper object count vanzin Author: myroslavlisniak Closes #20813 from myroslavlisniak/master. (cherry picked from commit c2632edebd978716dbfa7874a2fc0a8f5a4a9951) Signed-off-by: Marcelo Vanzin --- .../apache/spark/sql/execution/ui/SQLAppStatusListener.scala | 5 ++++- .../apache/spark/sql/execution/ui/SQLAppStatusStore.scala | 4 ++++ .../spark/sql/execution/ui/SQLAppStatusListenerSuite.scala | 1 + 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index 53fb9a0cc21cf..71e9f93c4566e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -334,7 +334,10 @@ class SQLAppStatusListener( val view = kvstore.view(classOf[SQLExecutionUIData]).index("completionTime").first(0L) val toDelete = KVUtils.viewToSeq(view, countToDelete.toInt)(_.completionTime.isDefined) - toDelete.foreach { e => kvstore.delete(e.getClass(), e.executionId) } + toDelete.foreach { e => + kvstore.delete(e.getClass(), e.executionId) + kvstore.delete(classOf[SparkPlanGraphWrapper], e.executionId) + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala index 9a76584717f42..241001a857c8f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala @@ -54,6 +54,10 @@ class SQLAppStatusStore( store.count(classOf[SQLExecutionUIData]) } + def planGraphCount(): Long = { + store.count(classOf[SparkPlanGraphWrapper]) + } + def executionMetrics(executionId: Long): Map[Long, String] = { def metricsFromStore(): Option[Map[Long, String]] = { val exec = store.read(classOf[SQLExecutionUIData], executionId) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala index 85face3994fd4..f3f08839c1d3a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala @@ -611,6 +611,7 @@ class SQLAppStatusListenerMemoryLeakSuite extends SparkFunSuite { sc.listenerBus.waitUntilEmpty(10000) val statusStore = spark.sharedState.statusStore assert(statusStore.executionsCount() <= 50) + assert(statusStore.planGraphCount() <= 50) // No live data should be left behind after all executions end. 
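// The planGraphCount bound above holds because cleanupExecutions (see the
// SQLAppStatusListener hunk in this patch) now deletes the plan graph together
// with the execution it belongs to, roughly:
//
//   toDelete.foreach { e =>
//     kvstore.delete(e.getClass(), e.executionId)                    // SQLExecutionUIData
//     kvstore.delete(classOf[SparkPlanGraphWrapper], e.executionId)  // its plan graph
//   }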
assert(statusStore.listener.get.noLiveData()) } From 21b6de4592bf8c69db9135f409912b6a62f70894 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 16 Mar 2018 09:36:30 -0700 Subject: [PATCH 355/566] [SPARK-23553][TESTS] Tests should not assume the default value of `spark.sql.sources.default` ## What changes were proposed in this pull request? Currently, some tests have an assumption that `spark.sql.sources.default=parquet`. In fact, that is a correct assumption, but that assumption makes it difficult to test new data source format. This PR aims to - Improve test suites more robust and makes it easy to test new data sources in the future. - Test new native ORC data source with the full existing Apache Spark test coverage. As an example, the PR uses `spark.sql.sources.default=orc` during reviews. The value should be `parquet` when this PR is accepted. ## How was this patch tested? Pass the Jenkins with updated tests. Author: Dongjoon Hyun Closes #20705 from dongjoon-hyun/SPARK-23553. (cherry picked from commit 5414abca4fec6a68174c34d22d071c20027e959d) Signed-off-by: gatorsmile --- python/pyspark/sql/readwriter.py | 4 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 9 +-- .../columnar/InMemoryColumnarQuerySuite.scala | 5 +- .../sql/execution/command/DDLSuite.scala | 11 ++- .../ParquetPartitionDiscoverySuite.scala | 10 +++ .../sql/test/DataFrameReaderWriterSuite.scala | 3 +- .../sql/hive/MetastoreDataSourcesSuite.scala | 72 +++++++++---------- .../PartitionProviderCompatibilitySuite.scala | 6 +- .../hive/PartitionedTablePerfStatsSuite.scala | 2 +- .../sql/hive/execution/HiveDDLSuite.scala | 11 +-- .../sql/hive/execution/SQLQuerySuite.scala | 19 ++--- 11 files changed, 81 insertions(+), 71 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 9d05ac7cb39be..e70aa9ef4aa97 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -147,8 +147,8 @@ def load(self, path=None, format=None, schema=None, **options): or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``). :param options: all other string options - >>> df = spark.read.load('python/test_support/sql/parquet_partitioned', opt1=True, - ... opt2=1, opt3='str') + >>> df = spark.read.format("parquet").load('python/test_support/sql/parquet_partitioned', + ... 
opt1=True, opt2=1, opt3='str') >>> df.dtypes [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')] diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 97d03b2cd496e..ebebf62ca4cfa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2148,7 +2148,8 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { test("data source table created in InMemoryCatalog should be able to read/write") { withTable("tbl") { - sql("CREATE TABLE tbl(i INT, j STRING) USING parquet") + val provider = spark.sessionState.conf.defaultDataSourceName + sql(s"CREATE TABLE tbl(i INT, j STRING) USING $provider") checkAnswer(sql("SELECT i, j FROM tbl"), Nil) Seq(1 -> "a", 2 -> "b").toDF("i", "j").write.mode("overwrite").insertInto("tbl") @@ -2472,9 +2473,9 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { test("SPARK-16975: Column-partition path starting '_' should be handled correctly") { withTempDir { dir => - val parquetDir = new File(dir, "parquet").getCanonicalPath - spark.range(10).withColumn("_col", $"id").write.partitionBy("_col").save(parquetDir) - spark.read.parquet(parquetDir) + val dataDir = new File(dir, "data").getCanonicalPath + spark.range(10).withColumn("_col", $"id").write.partitionBy("_col").save(dataDir) + spark.read.load(dataDir) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index dc1766fb9a785..26b63e8e8490f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -487,7 +487,10 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-22673: InMemoryRelation should utilize existing stats of the plan to be cached") { - withSQLConf("spark.sql.cbo.enabled" -> "true") { + // This test case depends on the size of parquet in statistics. + withSQLConf( + SQLConf.CBO_ENABLED.key -> "true", + SQLConf.DEFAULT_DATA_SOURCE_NAME.key -> "parquet") { withTempPath { workDir => withTable("table1") { val workDirPath = workDir.getAbsolutePath diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 1426df50d95c0..d360367d0c52b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -154,10 +154,15 @@ class InMemoryCatalogedDDLSuite extends DDLSuite with SharedSQLContext with Befo Seq(4 -> "d").toDF("i", "j").write.saveAsTable("t1") val e = intercept[AnalysisException] { - Seq(5 -> "e").toDF("i", "j").write.mode("append").format("json").saveAsTable("t1") + val format = if (spark.sessionState.conf.defaultDataSourceName.equalsIgnoreCase("json")) { + "orc" + } else { + "json" + } + Seq(5 -> "e").toDF("i", "j").write.mode("append").format(format).saveAsTable("t1") } - assert(e.message.contains("The format of the existing table default.t1 is " + - "`ParquetFileFormat`. 
It doesn't match the specified format `JsonFileFormat`.")) + assert(e.message.contains("The format of the existing table default.t1 is ")) + assert(e.message.contains("It doesn't match the specified format")) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index edb3da904d10d..e887c9734a8b8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -57,6 +57,16 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha val timeZone = TimeZone.getDefault() val timeZoneId = timeZone.getID + protected override def beforeAll(): Unit = { + super.beforeAll() + spark.conf.set(SQLConf.DEFAULT_DATA_SOURCE_NAME.key, "parquet") + } + + protected override def afterAll(): Unit = { + spark.conf.unset(SQLConf.DEFAULT_DATA_SOURCE_NAME.key) + super.afterAll() + } + test("column type inference") { def check(raw: String, literal: Literal, timeZone: TimeZone = timeZone): Unit = { assert(inferPartitionColumnValue(raw, true, timeZone) === literal) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala index 8c9bb7d56a35f..b3147b0a11478 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala @@ -563,7 +563,8 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be "and a same-name temp view exist") { withTable("same_name") { withTempView("same_name") { - sql("CREATE TABLE same_name(id LONG) USING parquet") + val format = spark.sessionState.conf.defaultDataSourceName + sql(s"CREATE TABLE same_name(id LONG) USING $format") spark.range(10).createTempView("same_name") spark.range(20).write.mode(SaveMode.Append).saveAsTable("same_name") checkAnswer(spark.table("same_name"), spark.range(10).toDF()) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index fade143a1755e..d7e2b575da08e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -591,7 +591,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv } test("Pre insert nullability check (ArrayType)") { - withTable("arrayInParquet") { + withTable("array") { { val df = (Tuple1(Seq(Int.box(1), null: Integer)) :: Nil).toDF("a") val expectedSchema = @@ -604,9 +604,8 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv assert(df.schema === expectedSchema) df.write - .format("parquet") .mode(SaveMode.Overwrite) - .saveAsTable("arrayInParquet") + .saveAsTable("array") } { @@ -621,25 +620,24 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv assert(df.schema === expectedSchema) df.write - .format("parquet") .mode(SaveMode.Append) - .insertInto("arrayInParquet") + .insertInto("array") } (Tuple1(Seq(4, 5)) :: Nil).toDF("a") .write 
.mode(SaveMode.Append) - .saveAsTable("arrayInParquet") // This one internally calls df2.insertInto. + .saveAsTable("array") // This one internally calls df2.insertInto. (Tuple1(Seq(Int.box(6), null: Integer)) :: Nil).toDF("a") .write .mode(SaveMode.Append) - .saveAsTable("arrayInParquet") + .saveAsTable("array") - sparkSession.catalog.refreshTable("arrayInParquet") + sparkSession.catalog.refreshTable("array") checkAnswer( - sql("SELECT a FROM arrayInParquet"), + sql("SELECT a FROM array"), Row(ArrayBuffer(1, null)) :: Row(ArrayBuffer(2, 3)) :: Row(ArrayBuffer(4, 5)) :: @@ -648,7 +646,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv } test("Pre insert nullability check (MapType)") { - withTable("mapInParquet") { + withTable("map") { { val df = (Tuple1(Map(1 -> (null: Integer))) :: Nil).toDF("a") val expectedSchema = @@ -661,9 +659,8 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv assert(df.schema === expectedSchema) df.write - .format("parquet") .mode(SaveMode.Overwrite) - .saveAsTable("mapInParquet") + .saveAsTable("map") } { @@ -678,27 +675,24 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv assert(df.schema === expectedSchema) df.write - .format("parquet") .mode(SaveMode.Append) - .insertInto("mapInParquet") + .insertInto("map") } (Tuple1(Map(4 -> 5)) :: Nil).toDF("a") .write - .format("parquet") .mode(SaveMode.Append) - .saveAsTable("mapInParquet") // This one internally calls df2.insertInto. + .saveAsTable("map") // This one internally calls df2.insertInto. (Tuple1(Map(6 -> null.asInstanceOf[Integer])) :: Nil).toDF("a") .write - .format("parquet") .mode(SaveMode.Append) - .saveAsTable("mapInParquet") + .saveAsTable("map") - sparkSession.catalog.refreshTable("mapInParquet") + sparkSession.catalog.refreshTable("map") checkAnswer( - sql("SELECT a FROM mapInParquet"), + sql("SELECT a FROM map"), Row(Map(1 -> null)) :: Row(Map(2 -> 3)) :: Row(Map(4 -> 5)) :: @@ -852,52 +846,52 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv (from to to).map(i => i -> s"str$i").toDF("c1", "c2") } - withTable("insertParquet") { - createDF(0, 9).write.format("parquet").saveAsTable("insertParquet") + withTable("t") { + createDF(0, 9).write.saveAsTable("t") checkAnswer( - sql("SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 > 5"), + sql("SELECT p.c1, p.c2 FROM t p WHERE p.c1 > 5"), (6 to 9).map(i => Row(i, s"str$i"))) intercept[AnalysisException] { - createDF(10, 19).write.format("parquet").saveAsTable("insertParquet") + createDF(10, 19).write.saveAsTable("t") } - createDF(10, 19).write.mode(SaveMode.Append).format("parquet").saveAsTable("insertParquet") + createDF(10, 19).write.mode(SaveMode.Append).saveAsTable("t") checkAnswer( - sql("SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 > 5"), + sql("SELECT p.c1, p.c2 FROM t p WHERE p.c1 > 5"), (6 to 19).map(i => Row(i, s"str$i"))) - createDF(20, 29).write.mode(SaveMode.Append).format("parquet").saveAsTable("insertParquet") + createDF(20, 29).write.mode(SaveMode.Append).saveAsTable("t") checkAnswer( - sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 25"), + sql("SELECT p.c1, c2 FROM t p WHERE p.c1 > 5 AND p.c1 < 25"), (6 to 24).map(i => Row(i, s"str$i"))) intercept[AnalysisException] { - createDF(30, 39).write.saveAsTable("insertParquet") + createDF(30, 39).write.saveAsTable("t") } - createDF(30, 39).write.mode(SaveMode.Append).saveAsTable("insertParquet") + createDF(30, 
39).write.mode(SaveMode.Append).saveAsTable("t") checkAnswer( - sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 35"), + sql("SELECT p.c1, c2 FROM t p WHERE p.c1 > 5 AND p.c1 < 35"), (6 to 34).map(i => Row(i, s"str$i"))) - createDF(40, 49).write.mode(SaveMode.Append).insertInto("insertParquet") + createDF(40, 49).write.mode(SaveMode.Append).insertInto("t") checkAnswer( - sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 45"), + sql("SELECT p.c1, c2 FROM t p WHERE p.c1 > 5 AND p.c1 < 45"), (6 to 44).map(i => Row(i, s"str$i"))) - createDF(50, 59).write.mode(SaveMode.Overwrite).saveAsTable("insertParquet") + createDF(50, 59).write.mode(SaveMode.Overwrite).saveAsTable("t") checkAnswer( - sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 51 AND p.c1 < 55"), + sql("SELECT p.c1, c2 FROM t p WHERE p.c1 > 51 AND p.c1 < 55"), (52 to 54).map(i => Row(i, s"str$i"))) - createDF(60, 69).write.mode(SaveMode.Ignore).saveAsTable("insertParquet") + createDF(60, 69).write.mode(SaveMode.Ignore).saveAsTable("t") checkAnswer( - sql("SELECT p.c1, c2 FROM insertParquet p"), + sql("SELECT p.c1, c2 FROM t p"), (50 to 59).map(i => Row(i, s"str$i"))) - createDF(70, 79).write.mode(SaveMode.Overwrite).insertInto("insertParquet") + createDF(70, 79).write.mode(SaveMode.Overwrite).insertInto("t") checkAnswer( - sql("SELECT p.c1, c2 FROM insertParquet p"), + sql("SELECT p.c1, c2 FROM t p"), (70 to 79).map(i => Row(i, s"str$i"))) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala index 9440a17677ebf..80afc9d8f44bc 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala @@ -37,11 +37,11 @@ class PartitionProviderCompatibilitySuite spark.range(5).selectExpr("id as fieldOne", "id as partCol").write .partitionBy("partCol") .mode("overwrite") - .parquet(dir.getAbsolutePath) + .save(dir.getAbsolutePath) spark.sql(s""" |create table $tableName (fieldOne long, partCol int) - |using parquet + |using ${spark.sessionState.conf.defaultDataSourceName} |options (path "${dir.toURI}") |partitioned by (partCol)""".stripMargin) } @@ -358,7 +358,7 @@ class PartitionProviderCompatibilitySuite try { spark.sql(s""" |create table test (id long, P1 int, P2 int) - |using parquet + |using ${spark.sessionState.conf.defaultDataSourceName} |options (path "${base.toURI}") |partitioned by (P1, P2)""".stripMargin) spark.sql(s"alter table test add partition (P1=0, P2=0) location '${a.toURI}'") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala index 54d3962a46b4d..1a86c604d5da3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala @@ -417,7 +417,7 @@ class PartitionedTablePerfStatsSuite import spark.implicits._ Seq(1).toDF("a").write.mode("overwrite").save(dir.getAbsolutePath) HiveCatalogMetrics.reset() - spark.read.parquet(dir.getAbsolutePath) + spark.read.load(dir.getAbsolutePath) assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 1) assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 1) } diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 65be244418670..db76ec9d084cb 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -1658,8 +1658,8 @@ class HiveDDLSuite Seq(5 -> "e").toDF("i", "j") .write.format("hive").mode("append").saveAsTable("t1") } - assert(e.message.contains("The format of the existing table default.t1 is " + - "`ParquetFileFormat`. It doesn't match the specified format `HiveFileFormat`.")) + assert(e.message.contains("The format of the existing table default.t1 is ")) + assert(e.message.contains("It doesn't match the specified format `HiveFileFormat`.")) } } @@ -1709,11 +1709,12 @@ class HiveDDLSuite spark.sessionState.catalog.getTableMetadata(TableIdentifier(tblName)).schema.map(_.name) } + val provider = spark.sessionState.conf.defaultDataSourceName withTable("t", "t1", "t2", "t3", "t4", "t5", "t6") { - sql("CREATE TABLE t(a int, b int, c int, d int) USING parquet PARTITIONED BY (d, b)") + sql(s"CREATE TABLE t(a int, b int, c int, d int) USING $provider PARTITIONED BY (d, b)") assert(getTableColumns("t") == Seq("a", "c", "d", "b")) - sql("CREATE TABLE t1 USING parquet PARTITIONED BY (d, b) AS SELECT 1 a, 1 b, 1 c, 1 d") + sql(s"CREATE TABLE t1 USING $provider PARTITIONED BY (d, b) AS SELECT 1 a, 1 b, 1 c, 1 d") assert(getTableColumns("t1") == Seq("a", "c", "d", "b")) Seq((1, 1, 1, 1)).toDF("a", "b", "c", "d").write.partitionBy("d", "b").saveAsTable("t2") @@ -1723,7 +1724,7 @@ class HiveDDLSuite val dataPath = new File(new File(path, "d=1"), "b=1").getCanonicalPath Seq(1 -> 1).toDF("a", "c").write.save(dataPath) - sql(s"CREATE TABLE t3 USING parquet LOCATION '${path.toURI}'") + sql(s"CREATE TABLE t3 USING $provider LOCATION '${path.toURI}'") assert(getTableColumns("t3") == Seq("a", "c", "d", "b")) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 33bcae91fdaf4..5e14f5397e9e6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -516,24 +516,19 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("CTAS with default fileformat") { val table = "ctas1" val ctas = s"CREATE TABLE IF NOT EXISTS $table SELECT key k, value FROM src" - withSQLConf(SQLConf.CONVERT_CTAS.key -> "true") { - withSQLConf("hive.default.fileformat" -> "textfile") { + Seq("orc", "parquet").foreach { dataSourceFormat => + withSQLConf( + SQLConf.CONVERT_CTAS.key -> "true", + SQLConf.DEFAULT_DATA_SOURCE_NAME.key -> dataSourceFormat, + "hive.default.fileformat" -> "textfile") { withTable(table) { sql(ctas) - // We should use parquet here as that is the default datasource fileformat. The default - // datasource file format is controlled by `spark.sql.sources.default` configuration. + // The default datasource file format is controlled by `spark.sql.sources.default`. // This testcase verifies that setting `hive.default.fileformat` has no impact on // the target table's fileformat in case of CTAS. 
- assert(sessionState.conf.defaultDataSourceName === "parquet") - checkRelation(tableName = table, isDataSourceTable = true, format = "parquet") + checkRelation(tableName = table, isDataSourceTable = true, format = dataSourceFormat) } } - withSQLConf("spark.sql.sources.default" -> "orc") { - withTable(table) { - sql(ctas) - checkRelation(tableName = table, isDataSourceTable = true, format = "orc") - } - } } } From 6937571ab8818a62ec2457a373eb3f6f618985e1 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Sat, 17 Mar 2018 16:24:51 -0700 Subject: [PATCH 356/566] [SPARK-23623][SS] Avoid concurrent use of cached consumers in CachedKafkaConsumer (branch-2.3) This is a backport of #20767 to branch 2.3 ## What changes were proposed in this pull request? CacheKafkaConsumer in the project `kafka-0-10-sql` is designed to maintain a pool of KafkaConsumers that can be reused. However, it was built with the assumption there will be only one task using trying to read the same Kafka TopicPartition at the same time. Hence, the cache was keyed by the TopicPartition a consumer is supposed to read. And any cases where this assumption may not be true, we have SparkPlan flag to disable the use of a cache. So it was up to the planner to correctly identify when it was not safe to use the cache and set the flag accordingly. Fundamentally, this is the wrong way to approach the problem. It is HARD for a high-level planner to reason about the low-level execution model, whether there will be multiple tasks in the same query trying to read the same partition. Case in point, 2.3.0 introduced stream-stream joins, and you can build a streaming self-join query on Kafka. It's pretty non-trivial to figure out how this leads to two tasks reading the same partition twice, possibly concurrently. And due to the non-triviality, it is hard to figure this out in the planner and set the flag to avoid the cache / consumer pool. And this can inadvertently lead to ConcurrentModificationException ,or worse, silent reading of incorrect data. Here is a better way to design this. The planner shouldnt have to understand these low-level optimizations. Rather the consumer pool should be smart enough avoid concurrent use of a cached consumer. Currently, it tries to do so but incorrectly (the flag inuse is not checked when returning a cached consumer, see [this](https://github.com/apache/spark/blob/master/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/CachedKafkaConsumer.scala#L403)). If there is another request for the same partition as a currently in-use consumer, the pool should automatically return a fresh consumer that should be closed when the task is done. Then the planner does not have to have a flag to avoid reuses. This PR is a step towards that goal. It does the following. - There are effectively two kinds of consumer that may be generated - Cached consumer - this should be returned to the pool at task end - Non-cached consumer - this should be closed at task end - A trait called KafkaConsumer is introduced to hide this difference from the users of the consumer so that the client code does not have to reason about whether to stop and release. They simply called `val consumer = KafkaConsumer.acquire` and then `consumer.release()`. - If there is request for a consumer that is in-use, then a new consumer is generated. - If there is a concurrent attempt of the same task, then a new consumer is generated, and the existing cached consumer is marked for close upon release. 
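As a rough usage sketch (illustration only, not code from this patch): with the pool handling caching decisions, the read path reduces to a single acquire/use/release pattern. The `acquire`, `get`, `getAvailableOffsetRange` and `release` calls below mirror the API introduced here, but the topic name, partition, empty params map, poll timeout and `useCache` flag are placeholders.

```scala
import java.{util => ju}
import org.apache.kafka.common.TopicPartition

// Placeholder setup -- in real code these come from the source/RDD partition.
val topicPartition = new TopicPartition("some-topic", 0)
val kafkaParams = new ju.HashMap[String, Object]()  // would hold bootstrap servers, group id, ...

val consumer = KafkaDataConsumer.acquire(topicPartition, kafkaParams, useCache = true)
try {
  val range = consumer.getAvailableOffsetRange()
  var offset = range.earliest
  while (offset < range.latest) {
    // Returns the record at `offset`, the next available record before `range.latest`,
    // or null, depending on failOnDataLoss.
    val record = consumer.get(offset, range.latest, pollTimeoutMs = 10000, failOnDataLoss = false)
    if (record == null) offset = range.latest else offset = record.offset + 1
  }
} finally {
  // A cached consumer goes back to the pool; a non-cached one is closed.
  consumer.release()
}
```

This is essentially what `KafkaSourceRDD` and `KafkaContinuousDataReader` do in the diff below, without having to know whether the consumer they were handed is cached.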
- In addition, I renamed the classes because CachedKafkaConsumer is a misnomer given that what it returns may or may not be cached. This PR does not remove the planner flag to avoid reuse to make this patch safe enough for merging in branch-2.3. This can be done later in master-only. ## How was this patch tested? A new stress test that verifies it is safe to concurrently get consumers for the same partition from the consumer pool. Author: Tathagata Das Closes #20848 from tdas/SPARK-23623-2.3. --- .../sql/kafka010/KafkaContinuousReader.scala | 4 +- ...Consumer.scala => KafkaDataConsumer.scala} | 242 ++++++++++++------ .../spark/sql/kafka010/KafkaSourceRDD.scala | 23 +- .../kafka010/CachedKafkaConsumerSuite.scala | 34 --- .../sql/kafka010/KafkaDataConsumerSuite.scala | 124 +++++++++ 5 files changed, 292 insertions(+), 135 deletions(-) rename external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/{CachedKafkaConsumer.scala => KafkaDataConsumer.scala} (66%) delete mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/CachedKafkaConsumerSuite.scala create mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDataConsumerSuite.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala index a269a50d81c38..a2a4c8349d968 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala @@ -189,7 +189,7 @@ class KafkaContinuousDataReader( failOnDataLoss: Boolean) extends ContinuousDataReader[UnsafeRow] { private val topic = topicPartition.topic private val kafkaPartition = topicPartition.partition - private val consumer = CachedKafkaConsumer.createUncached(topic, kafkaPartition, kafkaParams) + private val consumer = KafkaDataConsumer.acquire(topicPartition, kafkaParams, useCache = false) private val sharedRow = new UnsafeRow(7) private val bufferHolder = new BufferHolder(sharedRow) @@ -255,6 +255,6 @@ class KafkaContinuousDataReader( } override def close(): Unit = { - consumer.close() + consumer.release() } } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/CachedKafkaConsumer.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaDataConsumer.scala similarity index 66% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/CachedKafkaConsumer.scala rename to external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaDataConsumer.scala index 90ed7b1fba2f8..dcf2f6359896c 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/CachedKafkaConsumer.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaDataConsumer.scala @@ -27,30 +27,73 @@ import org.apache.kafka.common.TopicPartition import org.apache.spark.{SparkEnv, SparkException, TaskContext} import org.apache.spark.internal.Logging +import org.apache.spark.sql.kafka010.KafkaDataConsumer.AvailableOffsetRange import org.apache.spark.sql.kafka010.KafkaSource._ import org.apache.spark.util.UninterruptibleThread +private[kafka010] sealed trait KafkaDataConsumer { + /** + * Get the record for the given offset if available. 
Otherwise it will either throw error + * (if failOnDataLoss = true), or return the next available offset within [offset, untilOffset), + * or null. + * + * @param offset the offset to fetch. + * @param untilOffset the max offset to fetch. Exclusive. + * @param pollTimeoutMs timeout in milliseconds to poll data from Kafka. + * @param failOnDataLoss When `failOnDataLoss` is `true`, this method will either return record at + * offset if available, or throw exception.when `failOnDataLoss` is `false`, + * this method will either return record at offset if available, or return + * the next earliest available record less than untilOffset, or null. It + * will not throw any exception. + */ + def get( + offset: Long, + untilOffset: Long, + pollTimeoutMs: Long, + failOnDataLoss: Boolean): ConsumerRecord[Array[Byte], Array[Byte]] = { + internalConsumer.get(offset, untilOffset, pollTimeoutMs, failOnDataLoss) + } + + /** + * Return the available offset range of the current partition. It's a pair of the earliest offset + * and the latest offset. + */ + def getAvailableOffsetRange(): AvailableOffsetRange = internalConsumer.getAvailableOffsetRange() + + /** + * Release this consumer from being further used. Depending on its implementation, + * this consumer will be either finalized, or reset for reuse later. + */ + def release(): Unit + + /** Reference to the internal implementation that this wrapper delegates to */ + protected def internalConsumer: InternalKafkaConsumer +} + /** - * Consumer of single topicpartition, intended for cached reuse. - * Underlying consumer is not threadsafe, so neither is this, - * but processing the same topicpartition and group id in multiple threads is usually bad anyway. + * A wrapper around Kafka's KafkaConsumer that throws error when data loss is detected. + * This is not for direct use outside this file. 
*/ -private[kafka010] case class CachedKafkaConsumer private( +private[kafka010] case class InternalKafkaConsumer( topicPartition: TopicPartition, kafkaParams: ju.Map[String, Object]) extends Logging { - import CachedKafkaConsumer._ + import InternalKafkaConsumer._ private val groupId = kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG).asInstanceOf[String] - private var consumer = createConsumer + @volatile private var consumer = createConsumer /** indicates whether this consumer is in use or not */ - private var inuse = true + @volatile var inUse = true + + /** indicate whether this consumer is going to be stopped in the next release */ + @volatile var markedForClose = false /** Iterator to the already fetch data */ - private var fetchedData = ju.Collections.emptyIterator[ConsumerRecord[Array[Byte], Array[Byte]]] - private var nextOffsetInFetchedData = UNKNOWN_OFFSET + @volatile private var fetchedData = + ju.Collections.emptyIterator[ConsumerRecord[Array[Byte], Array[Byte]]] + @volatile private var nextOffsetInFetchedData = UNKNOWN_OFFSET /** Create a KafkaConsumer to fetch records for `topicPartition` */ private def createConsumer: KafkaConsumer[Array[Byte], Array[Byte]] = { @@ -61,8 +104,6 @@ private[kafka010] case class CachedKafkaConsumer private( c } - case class AvailableOffsetRange(earliest: Long, latest: Long) - private def runUninterruptiblyIfPossible[T](body: => T): T = Thread.currentThread match { case ut: UninterruptibleThread => ut.runUninterruptibly(body) @@ -313,21 +354,51 @@ private[kafka010] case class CachedKafkaConsumer private( } } -private[kafka010] object CachedKafkaConsumer extends Logging { - private val UNKNOWN_OFFSET = -2L +private[kafka010] object KafkaDataConsumer extends Logging { + + case class AvailableOffsetRange(earliest: Long, latest: Long) + + private case class CachedKafkaDataConsumer(internalConsumer: InternalKafkaConsumer) + extends KafkaDataConsumer { + assert(internalConsumer.inUse) // make sure this has been set to true + override def release(): Unit = { KafkaDataConsumer.release(internalConsumer) } + } + + private case class NonCachedKafkaDataConsumer(internalConsumer: InternalKafkaConsumer) + extends KafkaDataConsumer { + override def release(): Unit = { internalConsumer.close() } + } - private case class CacheKey(groupId: String, topicPartition: TopicPartition) + private case class CacheKey(groupId: String, topicPartition: TopicPartition) { + def this(topicPartition: TopicPartition, kafkaParams: ju.Map[String, Object]) = + this(kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG).asInstanceOf[String], topicPartition) + } + // This cache has the following important properties. + // - We make a best-effort attempt to maintain the max size of the cache as configured capacity. + // The capacity is not guaranteed to be maintained, especially when there are more active + // tasks simultaneously using consumers than the capacity. 
private lazy val cache = { val conf = SparkEnv.get.conf val capacity = conf.getInt("spark.sql.kafkaConsumerCache.capacity", 64) - new ju.LinkedHashMap[CacheKey, CachedKafkaConsumer](capacity, 0.75f, true) { + new ju.LinkedHashMap[CacheKey, InternalKafkaConsumer](capacity, 0.75f, true) { override def removeEldestEntry( - entry: ju.Map.Entry[CacheKey, CachedKafkaConsumer]): Boolean = { - if (entry.getValue.inuse == false && this.size > capacity) { - logWarning(s"KafkaConsumer cache hitting max capacity of $capacity, " + - s"removing consumer for ${entry.getKey}") + entry: ju.Map.Entry[CacheKey, InternalKafkaConsumer]): Boolean = { + + // Try to remove the least-used entry if its currently not in use. + // + // If you cannot remove it, then the cache will keep growing. In the worst case, + // the cache will grow to the max number of concurrent tasks that can run in the executor, + // (that is, number of tasks slots) after which it will never reduce. This is unlikely to + // be a serious problem because an executor with more than 64 (default) tasks slots is + // likely running on a beefy machine that can handle a large number of simultaneously + // active consumers. + + if (entry.getValue.inUse == false && this.size > capacity) { + logWarning( + s"KafkaConsumer cache hitting max capacity of $capacity, " + + s"removing consumer for ${entry.getKey}") try { entry.getValue.close() } catch { @@ -342,80 +413,87 @@ private[kafka010] object CachedKafkaConsumer extends Logging { } } - def releaseKafkaConsumer( - topic: String, - partition: Int, - kafkaParams: ju.Map[String, Object]): Unit = { - val groupId = kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG).asInstanceOf[String] - val topicPartition = new TopicPartition(topic, partition) - val key = CacheKey(groupId, topicPartition) - - synchronized { - val consumer = cache.get(key) - if (consumer != null) { - consumer.inuse = false - } else { - logWarning(s"Attempting to release consumer that does not exist") - } - } - } - /** - * Removes (and closes) the Kafka Consumer for the given topic, partition and group id. + * Get a cached consumer for groupId, assigned to topic and partition. + * If matching consumer doesn't already exist, will be created using kafkaParams. + * The returned consumer must be released explicitly using [[KafkaDataConsumer.release()]]. + * + * Note: This method guarantees that the consumer returned is not currently in use by any one + * else. Within this guarantee, this method will make a best effort attempt to re-use consumers by + * caching them and tracking when they are in use. */ - def removeKafkaConsumer( - topic: String, - partition: Int, - kafkaParams: ju.Map[String, Object]): Unit = { - val groupId = kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG).asInstanceOf[String] - val topicPartition = new TopicPartition(topic, partition) - val key = CacheKey(groupId, topicPartition) + def acquire( + topicPartition: TopicPartition, + kafkaParams: ju.Map[String, Object], + useCache: Boolean): KafkaDataConsumer = synchronized { + val key = new CacheKey(topicPartition, kafkaParams) + val existingInternalConsumer = cache.get(key) - synchronized { - val removedConsumer = cache.remove(key) - if (removedConsumer != null) { - removedConsumer.close() + lazy val newInternalConsumer = new InternalKafkaConsumer(topicPartition, kafkaParams) + + if (TaskContext.get != null && TaskContext.get.attemptNumber >= 1) { + // If this is reattempt at running the task, then invalidate cached consumer if any and + // start with a new one. 
+ if (existingInternalConsumer != null) { + // Consumer exists in cache. If its in use, mark it for closing later, or close it now. + if (existingInternalConsumer.inUse) { + existingInternalConsumer.markedForClose = true + } else { + existingInternalConsumer.close() + } } + cache.remove(key) // Invalidate the cache in any case + NonCachedKafkaDataConsumer(newInternalConsumer) + + } else if (!useCache) { + // If planner asks to not reuse consumers, then do not use it, return a new consumer + NonCachedKafkaDataConsumer(newInternalConsumer) + + } else if (existingInternalConsumer == null) { + // If consumer is not already cached, then put a new in the cache and return it + cache.put(key, newInternalConsumer) + newInternalConsumer.inUse = true + CachedKafkaDataConsumer(newInternalConsumer) + + } else if (existingInternalConsumer.inUse) { + // If consumer is already cached but is currently in use, then return a new consumer + NonCachedKafkaDataConsumer(newInternalConsumer) + + } else { + // If consumer is already cached and is currently not in use, then return that consumer + existingInternalConsumer.inUse = true + CachedKafkaDataConsumer(existingInternalConsumer) } } - /** - * Get a cached consumer for groupId, assigned to topic and partition. - * If matching consumer doesn't already exist, will be created using kafkaParams. - */ - def getOrCreate( - topic: String, - partition: Int, - kafkaParams: ju.Map[String, Object]): CachedKafkaConsumer = synchronized { - val groupId = kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG).asInstanceOf[String] - val topicPartition = new TopicPartition(topic, partition) - val key = CacheKey(groupId, topicPartition) - - // If this is reattempt at running the task, then invalidate cache and start with - // a new consumer - if (TaskContext.get != null && TaskContext.get.attemptNumber >= 1) { - removeKafkaConsumer(topic, partition, kafkaParams) - val consumer = new CachedKafkaConsumer(topicPartition, kafkaParams) - consumer.inuse = true - cache.put(key, consumer) - consumer - } else { - if (!cache.containsKey(key)) { - cache.put(key, new CachedKafkaConsumer(topicPartition, kafkaParams)) + private def release(intConsumer: InternalKafkaConsumer): Unit = { + synchronized { + + // Clear the consumer from the cache if this is indeed the consumer present in the cache + val key = new CacheKey(intConsumer.topicPartition, intConsumer.kafkaParams) + val cachedIntConsumer = cache.get(key) + if (intConsumer.eq(cachedIntConsumer)) { + // The released consumer is the same object as the cached one. + if (intConsumer.markedForClose) { + intConsumer.close() + cache.remove(key) + } else { + intConsumer.inUse = false + } + } else { + // The released consumer is either not the same one as in the cache, or not in the cache + // at all. This may happen if the cache was invalidate while this consumer was being used. + // Just close this consumer. + intConsumer.close() + logInfo(s"Released a supposedly cached consumer that was not found in the cache") } - val consumer = cache.get(key) - consumer.inuse = true - consumer } } +} - /** Create an [[CachedKafkaConsumer]] but don't put it into cache. 
*/ - def createUncached( - topic: String, - partition: Int, - kafkaParams: ju.Map[String, Object]): CachedKafkaConsumer = { - new CachedKafkaConsumer(new TopicPartition(topic, partition), kafkaParams) - } +private[kafka010] object InternalKafkaConsumer extends Logging { + + private val UNKNOWN_OFFSET = -2L private def reportDataLoss0( failOnDataLoss: Boolean, diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala index 66b3409c0cd04..498e344ea39f4 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala @@ -52,7 +52,7 @@ private[kafka010] case class KafkaSourceRDDPartition( * An RDD that reads data from Kafka based on offset ranges across multiple partitions. * Additionally, it allows preferred locations to be set for each topic + partition, so that * the [[KafkaSource]] can ensure the same executor always reads the same topic + partition - * and cached KafkaConsumers (see [[CachedKafkaConsumer]] can be used read data efficiently. + * and cached KafkaConsumers (see [[KafkaDataConsumer]] can be used read data efficiently. * * @param sc the [[SparkContext]] * @param executorKafkaParams Kafka configuration for creating KafkaConsumer on the executors @@ -126,14 +126,9 @@ private[kafka010] class KafkaSourceRDD( val sourcePartition = thePart.asInstanceOf[KafkaSourceRDDPartition] val topic = sourcePartition.offsetRange.topic val kafkaPartition = sourcePartition.offsetRange.partition - val consumer = - if (!reuseKafkaConsumer) { - // If we can't reuse CachedKafkaConsumers, creating a new CachedKafkaConsumer. As here we - // uses `assign`, we don't need to worry about the "group.id" conflicts. - CachedKafkaConsumer.createUncached(topic, kafkaPartition, executorKafkaParams) - } else { - CachedKafkaConsumer.getOrCreate(topic, kafkaPartition, executorKafkaParams) - } + val consumer = KafkaDataConsumer.acquire( + sourcePartition.offsetRange.topicPartition, executorKafkaParams, reuseKafkaConsumer) + val range = resolveRange(consumer, sourcePartition.offsetRange) assert( range.fromOffset <= range.untilOffset, @@ -167,13 +162,7 @@ private[kafka010] class KafkaSourceRDD( } override protected def close(): Unit = { - if (!reuseKafkaConsumer) { - // Don't forget to close non-reuse KafkaConsumers. You may take down your cluster! 
- consumer.close() - } else { - // Indicate that we're no longer using this consumer - CachedKafkaConsumer.releaseKafkaConsumer(topic, kafkaPartition, executorKafkaParams) - } + consumer.release() } } // Release consumer, either by removing it or indicating we're no longer using it @@ -184,7 +173,7 @@ private[kafka010] class KafkaSourceRDD( } } - private def resolveRange(consumer: CachedKafkaConsumer, range: KafkaSourceRDDOffsetRange) = { + private def resolveRange(consumer: KafkaDataConsumer, range: KafkaSourceRDDOffsetRange) = { if (range.fromOffset < 0 || range.untilOffset < 0) { // Late bind the offset range val availableOffsetRange = consumer.getAvailableOffsetRange() diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/CachedKafkaConsumerSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/CachedKafkaConsumerSuite.scala deleted file mode 100644 index 7aa7dd096c07b..0000000000000 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/CachedKafkaConsumerSuite.scala +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.kafka010 - -import org.scalatest.PrivateMethodTester - -import org.apache.spark.sql.test.SharedSQLContext - -class CachedKafkaConsumerSuite extends SharedSQLContext with PrivateMethodTester { - - test("SPARK-19886: Report error cause correctly in reportDataLoss") { - val cause = new Exception("D'oh!") - val reportDataLoss = PrivateMethod[Unit]('reportDataLoss0) - val e = intercept[IllegalStateException] { - CachedKafkaConsumer.invokePrivate(reportDataLoss(true, "message", cause)) - } - assert(e.getCause === cause) - } -} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDataConsumerSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDataConsumerSuite.scala new file mode 100644 index 0000000000000..0d0fb9c3ab5af --- /dev/null +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDataConsumerSuite.scala @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import java.util.concurrent.{Executors, TimeUnit} + +import scala.collection.JavaConverters._ +import scala.concurrent.{ExecutionContext, Future} +import scala.concurrent.duration.Duration +import scala.util.Random + +import org.apache.kafka.clients.consumer.ConsumerConfig +import org.apache.kafka.common.TopicPartition +import org.apache.kafka.common.serialization.ByteArrayDeserializer +import org.scalatest.PrivateMethodTester + +import org.apache.spark.{TaskContext, TaskContextImpl} +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.util.ThreadUtils + +class KafkaDataConsumerSuite extends SharedSQLContext with PrivateMethodTester { + + protected var testUtils: KafkaTestUtils = _ + + override def beforeAll(): Unit = { + super.beforeAll() + testUtils = new KafkaTestUtils(Map[String, Object]()) + testUtils.setup() + } + + override def afterAll(): Unit = { + if (testUtils != null) { + testUtils.teardown() + testUtils = null + } + super.afterAll() + } + + test("SPARK-19886: Report error cause correctly in reportDataLoss") { + val cause = new Exception("D'oh!") + val reportDataLoss = PrivateMethod[Unit]('reportDataLoss0) + val e = intercept[IllegalStateException] { + InternalKafkaConsumer.invokePrivate(reportDataLoss(true, "message", cause)) + } + assert(e.getCause === cause) + } + + test("SPARK-23623: concurrent use of KafkaDataConsumer") { + val topic = "topic" + Random.nextInt() + val data = (1 to 1000).map(_.toString) + testUtils.createTopic(topic, 1) + testUtils.sendMessages(topic, data.toArray) + val topicPartition = new TopicPartition(topic, 0) + + import ConsumerConfig._ + val kafkaParams = Map[String, Object]( + GROUP_ID_CONFIG -> "groupId", + BOOTSTRAP_SERVERS_CONFIG -> testUtils.brokerAddress, + KEY_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer].getName, + VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer].getName, + AUTO_OFFSET_RESET_CONFIG -> "earliest", + ENABLE_AUTO_COMMIT_CONFIG -> "false" + ) + + val numThreads = 100 + val numConsumerUsages = 500 + + @volatile var error: Throwable = null + + def consume(i: Int): Unit = { + val useCache = Random.nextBoolean + val taskContext = if (Random.nextBoolean) { + new TaskContextImpl(0, 0, 0, 0, attemptNumber = Random.nextInt(2), null, null, null) + } else { + null + } + TaskContext.setTaskContext(taskContext) + val consumer = KafkaDataConsumer.acquire( + topicPartition, kafkaParams.asJava, useCache) + try { + val range = consumer.getAvailableOffsetRange() + val rcvd = range.earliest until range.latest map { offset => + val bytes = consumer.get(offset, Long.MaxValue, 10000, failOnDataLoss = false).value() + new String(bytes) + } + assert(rcvd == data) + } catch { + case e: Throwable => + error = e + throw e + } finally { + consumer.release() + } + } + + val threadpool = Executors.newFixedThreadPool(numThreads) + try { + val futures = (1 to numConsumerUsages).map { i => + threadpool.submit(new Runnable { + override def run(): Unit = { consume(i) } + }) + } + futures.foreach(_.get(1, TimeUnit.MINUTES)) + assert(error == null) + 
} finally { + threadpool.shutdown() + } + } +} From 80e79430ff33dca9a3641bf0a882d79b343a3f25 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sun, 18 Mar 2018 20:24:14 +0900 Subject: [PATCH 357/566] [SPARK-23706][PYTHON] spark.conf.get(value, default=None) should produce None in PySpark Scala: ``` scala> spark.conf.get("hey", null) res1: String = null ``` ``` scala> spark.conf.get("spark.sql.sources.partitionOverwriteMode", null) res2: String = null ``` Python: **Before** ``` >>> spark.conf.get("hey", None) ... py4j.protocol.Py4JJavaError: An error occurred while calling o30.get. : java.util.NoSuchElementException: hey ... ``` ``` >>> spark.conf.get("spark.sql.sources.partitionOverwriteMode", None) u'STATIC' ``` **After** ``` >>> spark.conf.get("hey", None) is None True ``` ``` >>> spark.conf.get("spark.sql.sources.partitionOverwriteMode", None) is None True ``` *Note that this PR preserves the case below: ``` >>> spark.conf.get("spark.sql.sources.partitionOverwriteMode") u'STATIC' ``` Manually tested and unit tests were added. Author: hyukjinkwon Closes #20841 from HyukjinKwon/spark-conf-get. (cherry picked from commit 61487b308b0169e3108c2ad31674a0c80b8ac5f3) Signed-off-by: hyukjinkwon --- python/pyspark/sql/conf.py | 11 +++++++---- python/pyspark/sql/context.py | 8 ++++---- python/pyspark/sql/tests.py | 11 +++++++++++ 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/python/pyspark/sql/conf.py b/python/pyspark/sql/conf.py index 792c420ca6386..d62ab91ef43f1 100644 --- a/python/pyspark/sql/conf.py +++ b/python/pyspark/sql/conf.py @@ -15,7 +15,9 @@ # limitations under the License. # -from pyspark import since +import sys + +from pyspark import since, _NoValue from pyspark.rdd import ignore_unicode_prefix @@ -37,15 +39,16 @@ def set(self, key, value): @ignore_unicode_prefix @since(2.0) - def get(self, key, default=None): + def get(self, key, default=_NoValue): """Returns the value of Spark runtime configuration property for the given key, assuming it is set. """ self._checkType(key, "key") - if default is None: + if default is _NoValue: return self._jconf.get(key) else: - self._checkType(default, "default") + if default is not None: + self._checkType(default, "default") return self._jconf.get(key, default) @ignore_unicode_prefix diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index cc1cd1a5842d9..673415f864343 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -22,7 +22,7 @@ if sys.version >= '3': basestring = unicode = str -from pyspark import since +from pyspark import since, _NoValue from pyspark.rdd import ignore_unicode_prefix from pyspark.sql.session import _monkey_patch_RDD, SparkSession from pyspark.sql.dataframe import DataFrame @@ -124,11 +124,11 @@ def setConf(self, key, value): @ignore_unicode_prefix @since(1.3) - def getConf(self, key, defaultValue=None): + def getConf(self, key, defaultValue=_NoValue): """Returns the value of Spark SQL configuration property for the given key. - If the key is not set and defaultValue is not None, return - defaultValue. If the key is not set and defaultValue is None, return + If the key is not set and defaultValue is set, return + defaultValue. If the key is not set and defaultValue is not set, return the system default value. 
>>> sqlContext.getConf("spark.sql.shuffle.partitions") diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 4c51f715b58b8..82c5500fe1220 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -2452,6 +2452,17 @@ def test_conf(self): spark.conf.unset("bogo") self.assertEqual(spark.conf.get("bogo", "colombia"), "colombia") + self.assertEqual(spark.conf.get("hyukjin", None), None) + + # This returns 'STATIC' because it's the default value of + # 'spark.sql.sources.partitionOverwriteMode', and `defaultValue` in + # `spark.conf.get` is unset. + self.assertEqual(spark.conf.get("spark.sql.sources.partitionOverwriteMode"), "STATIC") + + # This returns None because 'spark.sql.sources.partitionOverwriteMode' is unset, but + # `defaultValue` in `spark.conf.get` is set to None. + self.assertEqual(spark.conf.get("spark.sql.sources.partitionOverwriteMode", None), None) + def test_current_database(self): spark = self.spark spark.catalog._reset() From 920493949eba77befd67e32f9e6ede5d594bcd56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cattilapiros=E2=80=9D?= Date: Mon, 19 Mar 2018 10:42:12 -0700 Subject: [PATCH 358/566] [SPARK-23728][BRANCH-2.3] Fix ML tests with expected exceptions running streaming tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? The testTransformerByInterceptingException failed to catch the expected message on 2.3 during streaming tests as the feature generated message is not at the direct caused by exception but even one level deeper. ## How was this patch tested? Running the unit tests. Author: “attilapiros” Closes #20852 from attilapiros/SPARK-23728. --- .../test/scala/org/apache/spark/ml/util/MLTest.scala | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/MLTest.scala b/mllib/src/test/scala/org/apache/spark/ml/util/MLTest.scala index 795fd0e2ac0e4..23e05acd40099 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/util/MLTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/util/MLTest.scala @@ -119,9 +119,15 @@ trait MLTest extends StreamTest with TempDirectory { self: Suite => expectedMessagePart : String, firstResultCol: String) { + def hasExpectedMessageDirectly(exception: Throwable): Boolean = + exception.getMessage.contains(expectedMessagePart) + def hasExpectedMessage(exception: Throwable): Boolean = - exception.getMessage.contains(expectedMessagePart) || - (exception.getCause != null && exception.getCause.getMessage.contains(expectedMessagePart)) + hasExpectedMessageDirectly(exception) || ( + exception.getCause != null && ( + hasExpectedMessageDirectly(exception.getCause) || ( + exception.getCause.getCause != null && + hasExpectedMessageDirectly(exception.getCause.getCause)))) withClue(s"""Expected message part "${expectedMessagePart}" is not found in DF test.""") { val exceptionOnDf = intercept[Throwable] { From 5c1c03d080d58611f7ac6e265a7432b2ee76e880 Mon Sep 17 00:00:00 2001 From: Gabor Somogyi Date: Mon, 19 Mar 2018 18:02:04 -0700 Subject: [PATCH 359/566] [SPARK-23660] Fix exception in yarn cluster mode when application ended fast ## What changes were proposed in this pull request? 
Yarn throws the following exception in cluster mode when the application is really small: ``` 18/03/07 23:34:22 WARN netty.NettyRpcEnv: Ignored failure: java.util.concurrent.RejectedExecutionException: Task java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask7c974942 rejected from java.util.concurrent.ScheduledThreadPoolExecutor1eea9d2d[Terminated, pool size = 0, active threads = 0, queued tasks = 0, completed tasks = 0] 18/03/07 23:34:22 ERROR yarn.ApplicationMaster: Uncaught exception: org.apache.spark.SparkException: Exception thrown in awaitResult: at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:205) at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75) at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:92) at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:76) at org.apache.spark.deploy.yarn.YarnAllocator.(YarnAllocator.scala:102) at org.apache.spark.deploy.yarn.YarnRMClient.register(YarnRMClient.scala:77) at org.apache.spark.deploy.yarn.ApplicationMaster.registerAM(ApplicationMaster.scala:450) at org.apache.spark.deploy.yarn.ApplicationMaster.runDriver(ApplicationMaster.scala:493) at org.apache.spark.deploy.yarn.ApplicationMaster.org$apache$spark$deploy$yarn$ApplicationMaster$$runImpl(ApplicationMaster.scala:345) at org.apache.spark.deploy.yarn.ApplicationMaster$$anonfun$run$2.apply$mcV$sp(ApplicationMaster.scala:260) at org.apache.spark.deploy.yarn.ApplicationMaster$$anonfun$run$2.apply(ApplicationMaster.scala:260) at org.apache.spark.deploy.yarn.ApplicationMaster$$anonfun$run$2.apply(ApplicationMaster.scala:260) at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$5.run(ApplicationMaster.scala:810) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1920) at org.apache.spark.deploy.yarn.ApplicationMaster.doAsUser(ApplicationMaster.scala:809) at org.apache.spark.deploy.yarn.ApplicationMaster.run(ApplicationMaster.scala:259) at org.apache.spark.deploy.yarn.ApplicationMaster$.main(ApplicationMaster.scala:834) at org.apache.spark.deploy.yarn.ApplicationMaster.main(ApplicationMaster.scala) Caused by: org.apache.spark.rpc.RpcEnvStoppedException: RpcEnv already stopped. at org.apache.spark.rpc.netty.Dispatcher.postMessage(Dispatcher.scala:158) at org.apache.spark.rpc.netty.Dispatcher.postLocalMessage(Dispatcher.scala:135) at org.apache.spark.rpc.netty.NettyRpcEnv.ask(NettyRpcEnv.scala:229) at org.apache.spark.rpc.netty.NettyRpcEndpointRef.ask(NettyRpcEnv.scala:523) at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:91) ... 17 more 18/03/07 23:34:22 INFO yarn.ApplicationMaster: Final app status: FAILED, exitCode: 13, (reason: Uncaught exception: org.apache.spark.SparkException: Exception thrown in awaitResult: ) ``` Example application: ``` object ExampleApp { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("ExampleApp") val sc = new SparkContext(conf) try { // Do nothing } finally { sc.stop() } } ``` This PR pauses user class thread after `SparkContext` created and keeps it so until application master initialises properly. ## How was this patch tested? Automated: Existing unit tests Manual: Application submitted into small cluster Author: Gabor Somogyi Closes #20807 from gaborgsomogyi/SPARK-23660. 
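The fix boils down to a plain monitor handshake between the user-class thread and the AM thread. The following standalone sketch is a simplified stand-in (hypothetical object and method names, `String` instead of `SparkContext`), not the actual ApplicationMaster code; the real counterparts are `sparkContextInitialized`, `resumeDriver` and `sparkContextPromise` in the diff below.

```scala
import scala.concurrent.Promise

// Simplified sketch of the handshake this patch adds: the user-class thread publishes
// the context and then parks on the promise's monitor; the AM thread resumes it once
// registration is done. Because success() and wait() run under the same monitor, the
// notify() issued by resumeDriver() cannot be lost.
object AmHandshakeSketch {
  private val contextPromise = Promise[String]()  // stands in for Promise[SparkContext]

  // Called from the user-class thread right after the "SparkContext" is created.
  def contextInitialized(ctx: String): Unit = contextPromise.synchronized {
    contextPromise.success(ctx)  // make the context visible to the AM thread
    contextPromise.wait()        // pause the user class until the AM is ready
  }

  // Called from the AM thread -- in the real patch from a finally block in runDriver --
  // so the user class is resumed even if registration fails or times out.
  def resumeDriver(): Unit = contextPromise.synchronized {
    contextPromise.notify()
  }
}
```

The only extra cost is that the user class stays parked for the duration of AM registration, after which execution proceeds exactly as before.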
(cherry picked from commit 5f4deff19511b6870f056eba5489104b9cac05a9) Signed-off-by: Marcelo Vanzin --- .../spark/deploy/yarn/ApplicationMaster.scala | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 2f88feb0f1fdf..6e35d23def6f0 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -418,7 +418,19 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends } private def sparkContextInitialized(sc: SparkContext) = { - sparkContextPromise.success(sc) + sparkContextPromise.synchronized { + // Notify runDriver function that SparkContext is available + sparkContextPromise.success(sc) + // Pause the user class thread in order to make proper initialization in runDriver function. + sparkContextPromise.wait() + } + } + + private def resumeDriver(): Unit = { + // When initialization in runDriver happened the user class thread has to be resumed. + sparkContextPromise.synchronized { + sparkContextPromise.notify() + } } private def registerAM( @@ -497,6 +509,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends // if the user app did not create a SparkContext. throw new IllegalStateException("User did not initialize spark context!") } + resumeDriver() userClassThread.join() } catch { case e: SparkException if e.getCause().isInstanceOf[TimeoutException] => @@ -506,6 +519,8 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends finish(FinalApplicationStatus.FAILED, ApplicationMaster.EXIT_SC_NOT_INITED, "Timed out waiting for SparkContext.") + } finally { + resumeDriver() } } From 2f82c037d90114705c0d0bd0bd7f82215aecfe3b Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 20 Mar 2018 10:07:27 +0800 Subject: [PATCH 360/566] [SPARK-23644][CORE][UI][BACKPORT-2.3] Use absolute path for REST call in SHS ## What changes were proposed in this pull request? SHS is using a relative path for the REST API call to get the list of the application is a relative path call. In case of the SHS being consumed through a proxy, it can be an issue if the path doesn't end with a "/". Therefore, we should use an absolute path for the REST call as it is done for all the other resources. ## How was this patch tested? manual tests Before the change: ![screen shot 2018-03-10 at 4 22 02 pm](https://user-images.githubusercontent.com/8821783/37244190-8ccf9d40-2485-11e8-8fa9-345bc81472fc.png) After the change: ![screen shot 2018-03-10 at 4 36 34 pm 1](https://user-images.githubusercontent.com/8821783/37244201-a1922810-2485-11e8-8856-eeab2bf5e180.png) Author: Marco Gaido Closes #20847 from mgaido91/SPARK-23644_2.3. 
--- .../main/resources/org/apache/spark/ui/static/historypage.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage.js b/core/src/main/resources/org/apache/spark/ui/static/historypage.js index 2cde66b081a1c..16d59bedf1fef 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/historypage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/historypage.js @@ -108,7 +108,7 @@ $(document).ready(function() { requestedIncomplete = getParameterByName("showIncomplete", searchString); requestedIncomplete = (requestedIncomplete == "true" ? true : false); - $.getJSON("api/v1/applications?limit=" + appLimit, function(response,status,jqXHR) { + $.getJSON(uiRoot + "/api/v1/applications?limit=" + appLimit, function(response,status,jqXHR) { var array = []; var hasMultipleAttempts = false; for (i in response) { @@ -146,7 +146,7 @@ $(document).ready(function() { "showCompletedColumns": !requestedIncomplete, } - $.get("static/historypage-template.html", function(template) { + $.get(uiRoot + "/static/historypage-template.html", function(template) { var sibling = historySummary.prev(); historySummary.detach(); var apps = $(Mustache.render($(template).filter("#history-summary-template").html(),data)); From c854b6ca7ba4dc33138c12ba4606ff8fbe82aef2 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 20 Mar 2018 17:53:09 +0900 Subject: [PATCH 361/566] [SPARK-23691][PYTHON][BRANCH-2.3] Use sql_conf util in PySpark tests where possible ## What changes were proposed in this pull request? This PR backports https://github.com/apache/spark/pull/20830 to reduce the diff against master and restore the default value back in PySpark tests. https://github.com/apache/spark/commit/d6632d185e147fcbe6724545488ad80dce20277e added an useful util. This backport extracts and brings this util: ```python contextmanager def sql_conf(self, pairs): ... ``` to allow configuration set/unset within a block: ```python with self.sql_conf({"spark.blah.blah.blah", "blah"}) # test codes ``` This PR proposes to use this util where possible in PySpark tests. Note that there look already few places affecting tests without restoring the original value back in unittest classes. ## How was this patch tested? Likewise, manually tested via: ``` ./run-tests --modules=pyspark-sql --python-executables=python2 ./run-tests --modules=pyspark-sql --python-executables=python3 ``` Author: hyukjinkwon Closes #20863 from HyukjinKwon/backport-20830. --- python/pyspark/sql/tests.py | 143 ++++++++++++++++++------------------ 1 file changed, 72 insertions(+), 71 deletions(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 82c5500fe1220..d806e5d92495c 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -33,6 +33,7 @@ import array import ctypes import py4j +from contextlib import contextmanager try: import xmlrunner @@ -201,6 +202,28 @@ def assertPandasEqual(self, expected, result): "\n\nResult:\n%s\n%s" % (result, result.dtypes)) self.assertTrue(expected.equals(result), msg=msg) + @contextmanager + def sql_conf(self, pairs): + """ + A convenient context manager to test some configuration specific logic. This sets + `value` to the configuration `key` and then restores it back when it exits. + """ + assert isinstance(pairs, dict), "pairs should be a dictionary." 
+ + keys = pairs.keys() + new_values = pairs.values() + old_values = [self.spark.conf.get(key, None) for key in keys] + for key, new_value in zip(keys, new_values): + self.spark.conf.set(key, new_value) + try: + yield + finally: + for key, old_value in zip(keys, old_values): + if old_value is None: + self.spark.conf.unset(key) + else: + self.spark.conf.set(key, old_value) + class DataTypeTests(unittest.TestCase): # regression test for SPARK-6055 @@ -2409,17 +2432,13 @@ def test_join_without_on(self): df1 = self.spark.range(1).toDF("a") df2 = self.spark.range(1).toDF("b") - try: - self.spark.conf.set("spark.sql.crossJoin.enabled", "false") + with self.sql_conf({"spark.sql.crossJoin.enabled": False}): self.assertRaises(AnalysisException, lambda: df1.join(df2, how="inner").collect()) - self.spark.conf.set("spark.sql.crossJoin.enabled", "true") + with self.sql_conf({"spark.sql.crossJoin.enabled": True}): actual = df1.join(df2, how="inner").collect() expected = [Row(a=0, b=0)] self.assertEqual(actual, expected) - finally: - # We should unset this. Otherwise, other tests are affected. - self.spark.conf.unset("spark.sql.crossJoin.enabled") # Regression test for invalid join methods when on is None, Spark-14761 def test_invalid_join_method(self): @@ -2891,21 +2910,18 @@ def test_create_dateframe_from_pandas_with_dst(self): self.assertPandasEqual(pdf, df.toPandas()) orig_env_tz = os.environ.get('TZ', None) - orig_session_tz = self.spark.conf.get('spark.sql.session.timeZone') try: tz = 'America/Los_Angeles' os.environ['TZ'] = tz time.tzset() - self.spark.conf.set('spark.sql.session.timeZone', tz) - - df = self.spark.createDataFrame(pdf) - self.assertPandasEqual(pdf, df.toPandas()) + with self.sql_conf({'spark.sql.session.timeZone': tz}): + df = self.spark.createDataFrame(pdf) + self.assertPandasEqual(pdf, df.toPandas()) finally: del os.environ['TZ'] if orig_env_tz is not None: os.environ['TZ'] = orig_env_tz time.tzset() - self.spark.conf.set('spark.sql.session.timeZone', orig_session_tz) class HiveSparkSubmitTests(SparkSubmitTests): @@ -3472,12 +3488,11 @@ def test_null_conversion(self): self.assertTrue(all([c == 1 for c in null_counts])) def _toPandas_arrow_toggle(self, df): - self.spark.conf.set("spark.sql.execution.arrow.enabled", "false") - try: + with self.sql_conf({"spark.sql.execution.arrow.enabled": False}): pdf = df.toPandas() - finally: - self.spark.conf.set("spark.sql.execution.arrow.enabled", "true") + pdf_arrow = df.toPandas() + return pdf, pdf_arrow def test_toPandas_arrow_toggle(self): @@ -3489,16 +3504,17 @@ def test_toPandas_arrow_toggle(self): def test_toPandas_respect_session_timezone(self): df = self.spark.createDataFrame(self.data, schema=self.schema) - orig_tz = self.spark.conf.get("spark.sql.session.timeZone") - try: - timezone = "America/New_York" - self.spark.conf.set("spark.sql.session.timeZone", timezone) - self.spark.conf.set("spark.sql.execution.pandas.respectSessionTimeZone", "false") - try: - pdf_la, pdf_arrow_la = self._toPandas_arrow_toggle(df) - self.assertPandasEqual(pdf_arrow_la, pdf_la) - finally: - self.spark.conf.set("spark.sql.execution.pandas.respectSessionTimeZone", "true") + + timezone = "America/New_York" + with self.sql_conf({ + "spark.sql.execution.pandas.respectSessionTimeZone": False, + "spark.sql.session.timeZone": timezone}): + pdf_la, pdf_arrow_la = self._toPandas_arrow_toggle(df) + self.assertPandasEqual(pdf_arrow_la, pdf_la) + + with self.sql_conf({ + "spark.sql.execution.pandas.respectSessionTimeZone": True, + "spark.sql.session.timeZone": 
timezone}): pdf_ny, pdf_arrow_ny = self._toPandas_arrow_toggle(df) self.assertPandasEqual(pdf_arrow_ny, pdf_ny) @@ -3511,8 +3527,6 @@ def test_toPandas_respect_session_timezone(self): pdf_la_corrected[field.name] = _check_series_convert_timestamps_local_tz( pdf_la_corrected[field.name], timezone) self.assertPandasEqual(pdf_ny, pdf_la_corrected) - finally: - self.spark.conf.set("spark.sql.session.timeZone", orig_tz) def test_pandas_round_trip(self): pdf = self.create_pandas_data_frame() @@ -3528,12 +3542,11 @@ def test_filtered_frame(self): self.assertTrue(pdf.empty) def _createDataFrame_toggle(self, pdf, schema=None): - self.spark.conf.set("spark.sql.execution.arrow.enabled", "false") - try: + with self.sql_conf({"spark.sql.execution.arrow.enabled": False}): df_no_arrow = self.spark.createDataFrame(pdf, schema=schema) - finally: - self.spark.conf.set("spark.sql.execution.arrow.enabled", "true") + df_arrow = self.spark.createDataFrame(pdf, schema=schema) + return df_no_arrow, df_arrow def test_createDataFrame_toggle(self): @@ -3544,18 +3557,18 @@ def test_createDataFrame_toggle(self): def test_createDataFrame_respect_session_timezone(self): from datetime import timedelta pdf = self.create_pandas_data_frame() - orig_tz = self.spark.conf.get("spark.sql.session.timeZone") - try: - timezone = "America/New_York" - self.spark.conf.set("spark.sql.session.timeZone", timezone) - self.spark.conf.set("spark.sql.execution.pandas.respectSessionTimeZone", "false") - try: - df_no_arrow_la, df_arrow_la = self._createDataFrame_toggle(pdf, schema=self.schema) - result_la = df_no_arrow_la.collect() - result_arrow_la = df_arrow_la.collect() - self.assertEqual(result_la, result_arrow_la) - finally: - self.spark.conf.set("spark.sql.execution.pandas.respectSessionTimeZone", "true") + timezone = "America/New_York" + with self.sql_conf({ + "spark.sql.execution.pandas.respectSessionTimeZone": False, + "spark.sql.session.timeZone": timezone}): + df_no_arrow_la, df_arrow_la = self._createDataFrame_toggle(pdf, schema=self.schema) + result_la = df_no_arrow_la.collect() + result_arrow_la = df_arrow_la.collect() + self.assertEqual(result_la, result_arrow_la) + + with self.sql_conf({ + "spark.sql.execution.pandas.respectSessionTimeZone": True, + "spark.sql.session.timeZone": timezone}): df_no_arrow_ny, df_arrow_ny = self._createDataFrame_toggle(pdf, schema=self.schema) result_ny = df_no_arrow_ny.collect() result_arrow_ny = df_arrow_ny.collect() @@ -3568,8 +3581,6 @@ def test_createDataFrame_respect_session_timezone(self): for k, v in row.asDict().items()}) for row in result_la] self.assertEqual(result_ny, result_la_corrected) - finally: - self.spark.conf.set("spark.sql.session.timeZone", orig_tz) def test_createDataFrame_with_schema(self): pdf = self.create_pandas_data_frame() @@ -4222,9 +4233,7 @@ def gen_timestamps(id): def test_vectorized_udf_check_config(self): from pyspark.sql.functions import pandas_udf, col import pandas as pd - orig_value = self.spark.conf.get("spark.sql.execution.arrow.maxRecordsPerBatch", None) - self.spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 3) - try: + with self.sql_conf({"spark.sql.execution.arrow.maxRecordsPerBatch": 3}): df = self.spark.range(10, numPartitions=1) @pandas_udf(returnType=LongType()) @@ -4234,11 +4243,6 @@ def check_records_per_batch(x): result = df.select(check_records_per_batch(col("id"))).collect() for (r,) in result: self.assertTrue(r <= 3) - finally: - if orig_value is None: - self.spark.conf.unset("spark.sql.execution.arrow.maxRecordsPerBatch") - 
else: - self.spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", orig_value) def test_vectorized_udf_timestamps_respect_session_timezone(self): from pyspark.sql.functions import pandas_udf, col @@ -4257,30 +4261,27 @@ def test_vectorized_udf_timestamps_respect_session_timezone(self): internal_value = pandas_udf( lambda ts: ts.apply(lambda ts: ts.value if ts is not pd.NaT else None), LongType()) - orig_tz = self.spark.conf.get("spark.sql.session.timeZone") - try: - timezone = "America/New_York" - self.spark.conf.set("spark.sql.session.timeZone", timezone) - self.spark.conf.set("spark.sql.execution.pandas.respectSessionTimeZone", "false") - try: - df_la = df.withColumn("tscopy", f_timestamp_copy(col("timestamp"))) \ - .withColumn("internal_value", internal_value(col("timestamp"))) - result_la = df_la.select(col("idx"), col("internal_value")).collect() - # Correct result_la by adjusting 3 hours difference between Los Angeles and New York - diff = 3 * 60 * 60 * 1000 * 1000 * 1000 - result_la_corrected = \ - df_la.select(col("idx"), col("tscopy"), col("internal_value") + diff).collect() - finally: - self.spark.conf.set("spark.sql.execution.pandas.respectSessionTimeZone", "true") + timezone = "America/New_York" + with self.sql_conf({ + "spark.sql.execution.pandas.respectSessionTimeZone": False, + "spark.sql.session.timeZone": timezone}): + df_la = df.withColumn("tscopy", f_timestamp_copy(col("timestamp"))) \ + .withColumn("internal_value", internal_value(col("timestamp"))) + result_la = df_la.select(col("idx"), col("internal_value")).collect() + # Correct result_la by adjusting 3 hours difference between Los Angeles and New York + diff = 3 * 60 * 60 * 1000 * 1000 * 1000 + result_la_corrected = \ + df_la.select(col("idx"), col("tscopy"), col("internal_value") + diff).collect() + with self.sql_conf({ + "spark.sql.execution.pandas.respectSessionTimeZone": True, + "spark.sql.session.timeZone": timezone}): df_ny = df.withColumn("tscopy", f_timestamp_copy(col("timestamp"))) \ .withColumn("internal_value", internal_value(col("timestamp"))) result_ny = df_ny.select(col("idx"), col("tscopy"), col("internal_value")).collect() self.assertNotEqual(result_ny, result_la) self.assertEqual(result_ny, result_la_corrected) - finally: - self.spark.conf.set("spark.sql.session.timeZone", orig_tz) def test_nondeterministic_vectorized_udf(self): # Test that nondeterministic UDFs are evaluated only once in chained UDF evaluations From 0b880db65b647e549b78721859b1712dff733ec9 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Tue, 20 Mar 2018 10:34:56 -0700 Subject: [PATCH 362/566] [SPARK-23649][SQL] Skipping chars disallowed in UTF-8 ## What changes were proposed in this pull request? The mapping of UTF-8 char's first byte to char's size doesn't cover whole range 0-255. It is defined only for 0-253: https://github.com/apache/spark/blob/master/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java#L60-L65 https://github.com/apache/spark/blob/master/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java#L190 If the first byte of a char is 253-255, IndexOutOfBoundsException is thrown. Besides of that values for 244-252 are not correct according to recent unicode standard for UTF-8: http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf As a consequence of the exception above, the length of input string in UTF-8 encoding cannot be calculated if the string contains chars started from 253 code. 
It is visible on user's side as for example crashing of schema inferring of csv file which contains such chars but the file can be read if the schema is specified explicitly or if the mode set to multiline. The proposed changes build correct mapping of first byte of UTF-8 char to its size (now it covers all cases) and skip disallowed chars (counts it as one octet). ## How was this patch tested? Added a test and a file with a char which is disallowed in UTF-8 - 0xFF. Author: Maxim Gekk Closes #20796 from MaxGekk/skip-wrong-utf8-chars. (cherry picked from commit 5e7bc2acef4a1e11d0d8056ef5c12cd5c8f220da) Signed-off-by: Wenchen Fan --- .../apache/spark/unsafe/types/UTF8String.java | 48 +++++++++++++++---- .../spark/unsafe/types/UTF8StringSuite.java | 23 ++++++++- 2 files changed, 62 insertions(+), 9 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index b0d0c44823e68..5d468aed42337 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -57,12 +57,43 @@ public final class UTF8String implements Comparable, Externalizable, public Object getBaseObject() { return base; } public long getBaseOffset() { return offset; } - private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, - 6, 6}; + /** + * A char in UTF-8 encoding can take 1-4 bytes depending on the first byte which + * indicates the size of the char. See Unicode standard in page 126, Table 3-6: + * http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf + * + * Binary Hex Comments + * 0xxxxxxx 0x00..0x7F Only byte of a 1-byte character encoding + * 10xxxxxx 0x80..0xBF Continuation bytes (1-3 continuation bytes) + * 110xxxxx 0xC0..0xDF First byte of a 2-byte character encoding + * 1110xxxx 0xE0..0xEF First byte of a 3-byte character encoding + * 11110xxx 0xF0..0xF7 First byte of a 4-byte character encoding + * + * As a consequence of the well-formedness conditions specified in + * Table 3-7 (page 126), the following byte values are disallowed in UTF-8: + * C0–C1, F5–FF. 
+ */ + private static byte[] bytesOfCodePointInUTF8 = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..0x0F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..0x1F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..0x2F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..0x3F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..0x4F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..0x5F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..0x6F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..0x7F + // Continuation bytes cannot appear as the first byte + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80..0x8F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90..0x9F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0..0xAF + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0..0xBF + 0, 0, // 0xC0..0xC1 - disallowed in UTF-8 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC2..0xCF + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0..0xDF + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0..0xEF + 4, 4, 4, 4, 4, // 0xF0..0xF4 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0xF5..0xFF - disallowed in UTF-8 + }; private static final boolean IS_LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN; @@ -187,8 +218,9 @@ public void writeTo(OutputStream out) throws IOException { * @param b The first byte of a code point */ private static int numBytesForFirstByte(final byte b) { - final int offset = (b & 0xFF) - 192; - return (offset >= 0) ? bytesOfCodePointInUTF8[offset] : 1; + final int offset = b & 0xFF; + byte numBytes = bytesOfCodePointInUTF8[offset]; + return (numBytes == 0) ? 1: numBytes; // Skip the first byte disallowed in UTF-8 } /** diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index 9b303fa5bc6c5..7c34d419574ef 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -58,8 +58,12 @@ private static void checkBasic(String str, int len) { @Test public void basicTest() { checkBasic("", 0); - checkBasic("hello", 5); + checkBasic("¡", 1); // 2 bytes char + checkBasic("ку", 2); // 2 * 2 bytes chars + checkBasic("hello", 5); // 5 * 1 byte chars checkBasic("大 千 世 界", 7); + checkBasic("︽﹋%", 3); // 3 * 3 bytes chars + checkBasic("\uD83E\uDD19", 1); // 4 bytes char } @Test @@ -791,4 +795,21 @@ public void trimRightWithTrimString() { assertEquals(fromString("头"), fromString("头a???/").trimRight(fromString("数?/*&^%a"))); assertEquals(fromString("头"), fromString("头数b数数 [").trimRight(fromString(" []数b"))); } + + @Test + public void skipWrongFirstByte() { + int[] wrongFirstBytes = { + 0x80, 0x9F, 0xBF, // Skip Continuation bytes + 0xC0, 0xC2, // 0xC0..0xC1 - disallowed in UTF-8 + // 0xF5..0xFF - disallowed in UTF-8 + 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, + 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF + }; + byte[] c = new byte[1]; + + for (int i = 0; i < wrongFirstBytes.length; ++i) { + c[0] = (byte)wrongFirstBytes[i]; + assertEquals(fromBytes(c).numChars(), 1); + } + } } From 1e552b356253bfbb2bdf8156c15630d5ed65dfc6 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Wed, 21 Mar 2018 09:52:28 -0700 Subject: [PATCH 363/566] [SPARK-23264][SQL] Fix scala.MatchError in literals.sql.out ## What changes were proposed in this pull request? 
To fix `scala.MatchError` in `literals.sql.out`, this pr added an entry for `CalendarIntervalType` in `QueryExecution.toHiveStructString`. ## How was this patch tested? Existing tests and added tests in `literals.sql` Author: Takeshi Yamamuro Closes #20872 from maropu/FixIntervalTests. (cherry picked from commit 98d0ea3f6091730285293321a50148f69e94c9cd) Signed-off-by: Wenchen Fan --- .../spark/sql/execution/QueryExecution.scala | 2 ++ .../resources/sql-tests/inputs/literals.sql | 3 +++ .../sql-tests/results/literals.sql.out | 20 ++++++++++++------- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index 7cae24bf5976c..15379a0663f7d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -155,6 +155,7 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { case (null, _) => "null" case (s: String, StringType) => "\"" + s + "\"" case (decimal, DecimalType()) => decimal.toString + case (interval, CalendarIntervalType) => interval.toString case (other, tpe) if primitiveTypes contains tpe => other.toString } @@ -178,6 +179,7 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { DateTimeUtils.getTimeZone(sparkSession.sessionState.conf.sessionLocalTimeZone)) case (bin: Array[Byte], BinaryType) => new String(bin, StandardCharsets.UTF_8) case (decimal: java.math.BigDecimal, DecimalType()) => formatDecimal(decimal) + case (interval, CalendarIntervalType) => interval.toString case (other, tpe) if primitiveTypes.contains(tpe) => other.toString } } diff --git a/sql/core/src/test/resources/sql-tests/inputs/literals.sql b/sql/core/src/test/resources/sql-tests/inputs/literals.sql index 37b4b7606d12b..a743cf1ec2cde 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/literals.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/literals.sql @@ -105,3 +105,6 @@ select X'XuZ'; -- Hive literal_double test. 
SELECT 3.14, -3.14, 3.14e8, 3.14e-8, -3.14e8, -3.14e-8, 3.14e+8, 3.14E8, 3.14E-8; + +-- map + interval test +select map(1, interval 1 day, 2, interval 3 week); diff --git a/sql/core/src/test/resources/sql-tests/results/literals.sql.out b/sql/core/src/test/resources/sql-tests/results/literals.sql.out index 95d4413148f64..b8c91dc8b59a4 100644 --- a/sql/core/src/test/resources/sql-tests/results/literals.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/literals.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 43 +-- Number of queries: 44 -- !query 0 @@ -323,19 +323,17 @@ select timestamp '2016-33-11 20:54:00.000' -- !query 34 select interval 13.123456789 seconds, interval -13.123456789 second -- !query 34 schema -struct<> +struct -- !query 34 output -scala.MatchError -(interval 13 seconds 123 milliseconds 456 microseconds,CalendarIntervalType) (of class scala.Tuple2) +interval 13 seconds 123 milliseconds 456 microseconds interval -12 seconds -876 milliseconds -544 microseconds -- !query 35 select interval 1 year 2 month 3 week 4 day 5 hour 6 minute 7 seconds 8 millisecond, 9 microsecond -- !query 35 schema -struct<> +struct -- !query 35 output -scala.MatchError -(interval 1 years 2 months 3 weeks 4 days 5 hours 6 minutes 7 seconds 8 milliseconds,CalendarIntervalType) (of class scala.Tuple2) +interval 1 years 2 months 3 weeks 4 days 5 hours 6 minutes 7 seconds 8 milliseconds 9 -- !query 36 @@ -416,3 +414,11 @@ SELECT 3.14, -3.14, 3.14e8, 3.14e-8, -3.14e8, -3.14e-8, 3.14e+8, 3.14E8, 3.14E-8 struct<3.14:decimal(3,2),-3.14:decimal(3,2),3.14E+8:decimal(3,-6),3.14E-8:decimal(10,10),-3.14E+8:decimal(3,-6),-3.14E-8:decimal(10,10),3.14E+8:decimal(3,-6),3.14E+8:decimal(3,-6),3.14E-8:decimal(10,10)> -- !query 42 output 3.14 -3.14 314000000 0.0000000314 -314000000 -0.0000000314 314000000 314000000 0.0000000314 + + +-- !query 43 +select map(1, interval 1 day, 2, interval 3 week) +-- !query 43 schema +struct> +-- !query 43 output +{1:interval 1 days,2:interval 3 weeks} From 4b9f33ff7f5268ffcc11cd5eea5976e9153be9fb Mon Sep 17 00:00:00 2001 From: Gabor Somogyi Date: Wed, 21 Mar 2018 10:06:26 -0700 Subject: [PATCH 364/566] [SPARK-23288][SS] Fix output metrics with parquet sink ## What changes were proposed in this pull request? Output metrics were not filled when parquet sink used. This PR fixes this problem by passing a `BasicWriteJobStatsTracker` in `FileStreamSink`. ## How was this patch tested? Additional unit test added. Author: Gabor Somogyi Closes #20745 from gaborgsomogyi/SPARK-23288. 
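Because the file sink now wires in a `BasicWriteJobStatsTracker`, the executor-side output metrics are populated and can be observed through the listener bus, which is exactly what the new test below asserts. A trimmed-down sketch of that pattern for application code (the session and query in the usage comment are placeholders):

```scala
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}

// Sums per-task output metrics; the same signal the new FileStreamSinkSuite test checks.
class OutputMetricsListener extends SparkListener {
  var recordsWritten = 0L
  var bytesWritten = 0L

  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    val out = taskEnd.taskMetrics.outputMetrics
    recordsWritten += out.recordsWritten
    bytesWritten += out.bytesWritten
  }
}

// Usage sketch, assuming an active SparkSession `spark` and a streaming DataFrame `df`:
//   val listener = new OutputMetricsListener
//   spark.sparkContext.addSparkListener(listener)
//   val query = df.writeStream.format("parquet")
//     .option("checkpointLocation", "/tmp/ckpt").start("/tmp/out")
//   query.processAllAvailable()
//   println(s"records=${listener.recordsWritten} bytes=${listener.bytesWritten}")
```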
(cherry picked from commit 918c7e99afdcea05c36626e230636c4f8aabf82c) Signed-off-by: Wenchen Fan --- .../command/DataWritingCommand.scala | 11 +--- .../datasources/BasicWriteStatsTracker.scala | 25 +++++++-- .../execution/streaming/FileStreamSink.scala | 10 +++- .../sql/streaming/FileStreamSinkSuite.scala | 52 +++++++++++++++++++ 4 files changed, 82 insertions(+), 16 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala index e56f8105fc9a7..e11dbd201004d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.command import org.apache.hadoop.conf.Configuration -import org.apache.spark.SparkContext import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan} @@ -45,15 +44,7 @@ trait DataWritingCommand extends Command { // Output columns of the analyzed input query plan def outputColumns: Seq[Attribute] - lazy val metrics: Map[String, SQLMetric] = { - val sparkContext = SparkContext.getActive.get - Map( - "numFiles" -> SQLMetrics.createMetric(sparkContext, "number of written files"), - "numOutputBytes" -> SQLMetrics.createMetric(sparkContext, "bytes of written output"), - "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), - "numParts" -> SQLMetrics.createMetric(sparkContext, "number of dynamic part") - ) - } + lazy val metrics: Map[String, SQLMetric] = BasicWriteJobStatsTracker.metrics def basicWriteJobStatsTracker(hadoopConf: Configuration): BasicWriteJobStatsTracker = { val serializableHadoopConf = new SerializableConfiguration(hadoopConf) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala index 9dbbe9946ee99..69c03d862391e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala @@ -153,12 +153,29 @@ class BasicWriteJobStatsTracker( totalNumOutput += summary.numRows } - metrics("numFiles").add(numFiles) - metrics("numOutputBytes").add(totalNumBytes) - metrics("numOutputRows").add(totalNumOutput) - metrics("numParts").add(numPartitions) + metrics(BasicWriteJobStatsTracker.NUM_FILES_KEY).add(numFiles) + metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_BYTES_KEY).add(totalNumBytes) + metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_ROWS_KEY).add(totalNumOutput) + metrics(BasicWriteJobStatsTracker.NUM_PARTS_KEY).add(numPartitions) val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toList) } } + +object BasicWriteJobStatsTracker { + private val NUM_FILES_KEY = "numFiles" + private val NUM_OUTPUT_BYTES_KEY = "numOutputBytes" + private val NUM_OUTPUT_ROWS_KEY = "numOutputRows" + private val NUM_PARTS_KEY = "numParts" + + def metrics: Map[String, SQLMetric] = { + val sparkContext = SparkContext.getActive.get + Map( + NUM_FILES_KEY -> SQLMetrics.createMetric(sparkContext, "number of written files"), + 
NUM_OUTPUT_BYTES_KEY -> SQLMetrics.createMetric(sparkContext, "bytes of written output"), + NUM_OUTPUT_ROWS_KEY -> SQLMetrics.createMetric(sparkContext, "number of output rows"), + NUM_PARTS_KEY -> SQLMetrics.createMetric(sparkContext, "number of dynamic part") + ) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala index 87a17cebdc10c..b3d12f67b5d63 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala @@ -26,7 +26,8 @@ import org.apache.spark.internal.Logging import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.execution.datasources.{FileFormat, FileFormatWriter} +import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, FileFormat, FileFormatWriter} +import org.apache.spark.util.SerializableConfiguration object FileStreamSink extends Logging { // The name of the subdirectory that is used to store metadata about which files are valid. @@ -97,6 +98,11 @@ class FileStreamSink( new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, logPath.toUri.toString) private val hadoopConf = sparkSession.sessionState.newHadoopConf() + private def basicWriteJobStatsTracker: BasicWriteJobStatsTracker = { + val serializableHadoopConf = new SerializableConfiguration(hadoopConf) + new BasicWriteJobStatsTracker(serializableHadoopConf, BasicWriteJobStatsTracker.metrics) + } + override def addBatch(batchId: Long, data: DataFrame): Unit = { if (batchId <= fileLog.getLatest().map(_._1).getOrElse(-1L)) { logInfo(s"Skipping already committed batch $batchId") @@ -131,7 +137,7 @@ class FileStreamSink( hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = None, - statsTrackers = Nil, + statsTrackers = Seq(basicWriteJobStatsTracker), options = options) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala index ba48bc1ce0c4d..50bdbcb4186ad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala @@ -21,6 +21,7 @@ import java.util.Locale import org.apache.hadoop.fs.Path +import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd} import org.apache.spark.sql.{AnalysisException, DataFrame} import org.apache.spark.sql.execution.DataSourceScanExec import org.apache.spark.sql.execution.datasources._ @@ -405,4 +406,55 @@ class FileStreamSinkSuite extends StreamTest { } } } + + test("SPARK-23288 writing and checking output metrics") { + Seq("parquet", "orc", "text", "json").foreach { format => + val inputData = MemoryStream[String] + val df = inputData.toDF() + + withTempDir { outputDir => + withTempDir { checkpointDir => + + var query: StreamingQuery = null + + var numTasks = 0 + var recordsWritten: Long = 0L + var bytesWritten: Long = 0L + try { + spark.sparkContext.addSparkListener(new SparkListener() { + override def onTaskEnd(taskEnd: SparkListenerTaskEnd) { + val outputMetrics = taskEnd.taskMetrics.outputMetrics + recordsWritten += outputMetrics.recordsWritten + bytesWritten += outputMetrics.bytesWritten + 
numTasks += 1 + } + }) + + query = + df.writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .format(format) + .start(outputDir.getCanonicalPath) + + inputData.addData("1", "2", "3") + inputData.addData("4", "5") + + failAfter(streamingTimeout) { + query.processAllAvailable() + } + spark.sparkContext.listenerBus.waitUntilEmpty(streamingTimeout.toMillis) + + assert(numTasks > 0) + assert(recordsWritten === 5) + // This is heavily file type/version specific but should be filled + assert(bytesWritten > 0) + } finally { + if (query != null) { + query.stop() + } + } + } + } + } + } } From c9acd46bed8fa3e410e8a44aafe3237e59deaa73 Mon Sep 17 00:00:00 2001 From: Mihaly Toth Date: Wed, 21 Mar 2018 17:05:39 -0700 Subject: [PATCH 365/566] [SPARK-23729][CORE] Respect URI fragment when resolving globs Firstly, glob resolution will not result in swallowing the remote name part (that is preceded by the `#` sign) in case of `--files` or `--archives` options Moreover in the special case of multiple resolutions when the remote naming does not make sense and error is returned. Enhanced current test and wrote additional test for the error case Author: Mihaly Toth Closes #20853 from misutoth/glob-with-remote-name. (cherry picked from commit 0604beaff2baa2d0fed86c0c87fd2a16a1838b5f) Signed-off-by: Marcelo Vanzin --- .../apache/spark/deploy/DependencyUtils.scala | 34 +++++++++++---- .../org/apache/spark/deploy/SparkSubmit.scala | 13 ++++++ .../spark/deploy/SparkSubmitSuite.scala | 41 +++++++++++++++---- 3 files changed, 72 insertions(+), 16 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/DependencyUtils.scala b/core/src/main/scala/org/apache/spark/deploy/DependencyUtils.scala index ecc82d7ac8001..ab319c860ee69 100644 --- a/core/src/main/scala/org/apache/spark/deploy/DependencyUtils.scala +++ b/core/src/main/scala/org/apache/spark/deploy/DependencyUtils.scala @@ -18,12 +18,13 @@ package org.apache.spark.deploy import java.io.File +import java.net.URI import org.apache.commons.lang3.StringUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.spark.{SecurityManager, SparkConf} +import org.apache.spark.{SecurityManager, SparkConf, SparkException} import org.apache.spark.util.{MutableURLClassLoader, Utils} private[deploy] object DependencyUtils { @@ -137,16 +138,31 @@ private[deploy] object DependencyUtils { def resolveGlobPaths(paths: String, hadoopConf: Configuration): String = { require(paths != null, "paths cannot be null.") Utils.stringToSeq(paths).flatMap { path => - val uri = Utils.resolveURI(path) - uri.getScheme match { - case "local" | "http" | "https" | "ftp" => Array(path) - case _ => - val fs = FileSystem.get(uri, hadoopConf) - Option(fs.globStatus(new Path(uri))).map { status => - status.filter(_.isFile).map(_.getPath.toUri.toString) - }.getOrElse(Array(path)) + val (base, fragment) = splitOnFragment(path) + (resolveGlobPath(base, hadoopConf), fragment) match { + case (resolved, Some(_)) if resolved.length > 1 => throw new SparkException( + s"${base.toString} resolves ambiguously to multiple files: ${resolved.mkString(",")}") + case (resolved, Some(namedAs)) => resolved.map(_ + "#" + namedAs) + case (resolved, _) => resolved } }.mkString(",") } + private def splitOnFragment(path: String): (URI, Option[String]) = { + val uri = Utils.resolveURI(path) + val withoutFragment = new URI(uri.getScheme, uri.getSchemeSpecificPart, null) + (withoutFragment, Option(uri.getFragment)) + } + + private def 
resolveGlobPath(uri: URI, hadoopConf: Configuration): Array[String] = { + uri.getScheme match { + case "local" | "http" | "https" | "ftp" => Array(uri.toString) + case _ => + val fs = FileSystem.get(uri, hadoopConf) + Option(fs.globStatus(new Path(uri))).map { status => + status.filter(_.isFile).map(_.getPath.toUri.toString) + }.getOrElse(Array(uri.toString)) + } + } + } diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 1e381965c52ba..b44c8802ea6d8 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -245,6 +245,19 @@ object SparkSubmit extends CommandLineUtils with Logging { args: SparkSubmitArguments, conf: Option[HadoopConfiguration] = None) : (Seq[String], Seq[String], SparkConf, String) = { + try { + doPrepareSubmitEnvironment(args, conf) + } catch { + case e: SparkException => + printErrorAndExit(e.getMessage) + throw e + } + } + + private def doPrepareSubmitEnvironment( + args: SparkSubmitArguments, + conf: Option[HadoopConfiguration] = None) + : (Seq[String], Seq[String], SparkConf, String) = { // Return values val childArgs = new ArrayBuffer[String]() val childClasspath = new ArrayBuffer[String]() diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 27dd435332348..feab888f89aca 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.deploy import java.io._ import java.net.URI import java.nio.charset.StandardCharsets -import java.nio.file.Files +import java.nio.file.{Files, Paths} import scala.collection.mutable import scala.collection.mutable.ArrayBuffer @@ -606,10 +606,13 @@ class SparkSubmitSuite } test("resolves command line argument paths correctly") { - val jars = "/jar1,/jar2" // --jars - val files = "local:/file1,file2" // --files - val archives = "file:/archive1,archive2" // --archives - val pyFiles = "py-file1,py-file2" // --py-files + val dir = Utils.createTempDir() + val archive = Paths.get(dir.toPath.toString, "single.zip") + Files.createFile(archive) + val jars = "/jar1,/jar2" + val files = "local:/file1,file2" + val archives = s"file:/archive1,${dir.toPath.toAbsolutePath.toString}/*.zip#archive3" + val pyFiles = "py-file1,py-file2" // Test jars and files val clArgs = Seq( @@ -636,9 +639,10 @@ class SparkSubmitSuite val appArgs2 = new SparkSubmitArguments(clArgs2) val (_, _, conf2, _) = SparkSubmit.prepareSubmitEnvironment(appArgs2) appArgs2.files should be (Utils.resolveURIs(files)) - appArgs2.archives should be (Utils.resolveURIs(archives)) + appArgs2.archives should fullyMatch regex ("file:/archive1,file:.*#archive3") conf2.get("spark.yarn.dist.files") should be (Utils.resolveURIs(files)) - conf2.get("spark.yarn.dist.archives") should be (Utils.resolveURIs(archives)) + conf2.get("spark.yarn.dist.archives") should fullyMatch regex + ("file:/archive1,file:.*#archive3") // Test python files val clArgs3 = Seq( @@ -657,6 +661,29 @@ class SparkSubmitSuite conf3.get(PYSPARK_PYTHON.key) should be ("python3.5") } + test("ambiguous archive mapping results in error message") { + val dir = Utils.createTempDir() + val archive1 = Paths.get(dir.toPath.toString, "first.zip") + val archive2 = Paths.get(dir.toPath.toString, "second.zip") + 
Files.createFile(archive1) + Files.createFile(archive2) + val jars = "/jar1,/jar2" + val files = "local:/file1,file2" + val archives = s"file:/archive1,${dir.toPath.toAbsolutePath.toString}/*.zip#archive3" + val pyFiles = "py-file1,py-file2" + + // Test files and archives (Yarn) + val clArgs2 = Seq( + "--master", "yarn", + "--class", "org.SomeClass", + "--files", files, + "--archives", archives, + "thejar.jar" + ) + + testPrematureExit(clArgs2.toArray, "resolves ambiguously to multiple files") + } + test("resolves config paths correctly") { val jars = "/jar1,/jar2" // spark.jars val files = "local:/file1,file2" // spark.files / spark.yarn.dist.files From 4da8c22f77475d1b328375e97e2825e1dea78fdd Mon Sep 17 00:00:00 2001 From: Kris Mok Date: Wed, 21 Mar 2018 21:21:36 -0700 Subject: [PATCH 366/566] [SPARK-23760][SQL] CodegenContext.withSubExprEliminationExprs should save/restore CSE state correctly ## What changes were proposed in this pull request? Fixed `CodegenContext.withSubExprEliminationExprs()` so that it saves/restores CSE state correctly. ## How was this patch tested? Added new unit test to verify that the old CSE state is indeed saved and restored around the `withSubExprEliminationExprs()` call. Manually verified that this test fails without this patch. Author: Kris Mok Closes #20870 from rednaxelafx/codegen-subexpr-fix. (cherry picked from commit 95e51ff849a4c46cae463636b1ee393042469e7b) Signed-off-by: Wenchen Fan --- .../expressions/codegen/CodeGenerator.scala | 16 +++---- .../expressions/CodeGenerationSuite.scala | 44 +++++++++++++++++++ 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index a54af0309f667..2631e7ee33951 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -389,7 +389,7 @@ class CodegenContext { val equivalentExpressions: EquivalentExpressions = new EquivalentExpressions // Foreach expression that is participating in subexpression elimination, the state to use. - val subExprEliminationExprs = mutable.HashMap.empty[Expression, SubExprEliminationState] + var subExprEliminationExprs = Map.empty[Expression, SubExprEliminationState] // The collection of sub-expression result resetting methods that need to be called on each row. 
val subexprFunctions = mutable.ArrayBuffer.empty[String] @@ -1118,14 +1118,12 @@ class CodegenContext { newSubExprEliminationExprs: Map[Expression, SubExprEliminationState])( f: => Seq[ExprCode]): Seq[ExprCode] = { val oldsubExprEliminationExprs = subExprEliminationExprs - subExprEliminationExprs.clear - newSubExprEliminationExprs.foreach(subExprEliminationExprs += _) + subExprEliminationExprs = newSubExprEliminationExprs val genCodes = f // Restore previous subExprEliminationExprs - subExprEliminationExprs.clear - oldsubExprEliminationExprs.foreach(subExprEliminationExprs += _) + subExprEliminationExprs = oldsubExprEliminationExprs genCodes } @@ -1139,7 +1137,7 @@ class CodegenContext { def subexpressionEliminationForWholeStageCodegen(expressions: Seq[Expression]): SubExprCodes = { // Create a clear EquivalentExpressions and SubExprEliminationState mapping val equivalentExpressions: EquivalentExpressions = new EquivalentExpressions - val subExprEliminationExprs = mutable.HashMap.empty[Expression, SubExprEliminationState] + val localSubExprEliminationExprs = mutable.HashMap.empty[Expression, SubExprEliminationState] // Add each expression tree and compute the common subexpressions. expressions.foreach(equivalentExpressions.addExprTree) @@ -1152,10 +1150,10 @@ class CodegenContext { // Generate the code for this expression tree. val eval = expr.genCode(this) val state = SubExprEliminationState(eval.isNull, eval.value) - e.foreach(subExprEliminationExprs.put(_, state)) + e.foreach(localSubExprEliminationExprs.put(_, state)) eval.code.trim } - SubExprCodes(codes, subExprEliminationExprs.toMap) + SubExprCodes(codes, localSubExprEliminationExprs.toMap) } /** @@ -1203,7 +1201,7 @@ class CodegenContext { subexprFunctions += s"${addNewFunction(fnName, fn)}($INPUT_ROW);" val state = SubExprEliminationState(isNull, value) - e.foreach(subExprEliminationExprs.put(_, state)) + subExprEliminationExprs ++= e.map(_ -> state).toMap } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index 3958bff53aa9c..e3ce5af5936a0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -442,4 +442,48 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { assert(ctx.calculateParamLength(Seq.range(0, 100).map(Literal(_))) == 101) assert(ctx.calculateParamLength(Seq.range(0, 100).map(x => Literal(x.toLong))) == 201) } + + test("SPARK-23760: CodegenContext.withSubExprEliminationExprs should save/restore correctly") { + + val ref = BoundReference(0, IntegerType, true) + val add1 = Add(ref, ref) + val add2 = Add(add1, add1) + + // raw testing of basic functionality + { + val ctx = new CodegenContext + val e = ref.genCode(ctx) + // before + ctx.subExprEliminationExprs += ref -> SubExprEliminationState(e.isNull, e.value) + assert(ctx.subExprEliminationExprs.contains(ref)) + // call withSubExprEliminationExprs + ctx.withSubExprEliminationExprs(Map(add1 -> SubExprEliminationState("dummy", "dummy"))) { + assert(ctx.subExprEliminationExprs.contains(add1)) + assert(!ctx.subExprEliminationExprs.contains(ref)) + Seq.empty + } + // after + assert(ctx.subExprEliminationExprs.nonEmpty) + assert(ctx.subExprEliminationExprs.contains(ref)) + assert(!ctx.subExprEliminationExprs.contains(add1)) + } + + // 
emulate an actual codegen workload + { + val ctx = new CodegenContext + // before + ctx.generateExpressions(Seq(add2, add1), doSubexpressionElimination = true) // trigger CSE + assert(ctx.subExprEliminationExprs.contains(add1)) + // call withSubExprEliminationExprs + ctx.withSubExprEliminationExprs(Map(ref -> SubExprEliminationState("dummy", "dummy"))) { + assert(ctx.subExprEliminationExprs.contains(ref)) + assert(!ctx.subExprEliminationExprs.contains(add1)) + Seq.empty + } + // after + assert(ctx.subExprEliminationExprs.nonEmpty) + assert(ctx.subExprEliminationExprs.contains(add1)) + assert(!ctx.subExprEliminationExprs.contains(ref)) + } + } } From 1d0d0a5fc7ee009443797feb48823eb215d1940a Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 22 Mar 2018 21:23:25 -0700 Subject: [PATCH 367/566] [SPARK-23614][SQL] Fix incorrect reuse exchange when caching is used ## What changes were proposed in this pull request? We should provide customized canonicalize plan for `InMemoryRelation` and `InMemoryTableScanExec`. Otherwise, we can wrongly treat two different cached plans as same result. It causes wrongly reused exchange then. For a test query like this: ```scala val cached = spark.createDataset(Seq(TestDataUnion(1, 2, 3), TestDataUnion(4, 5, 6))).cache() val group1 = cached.groupBy("x").agg(min(col("y")) as "value") val group2 = cached.groupBy("x").agg(min(col("z")) as "value") group1.union(group2) ``` Canonicalized plans before: First exchange: ``` Exchange hashpartitioning(none#0, 5) +- *(1) HashAggregate(keys=[none#0], functions=[partial_min(none#1)], output=[none#0, none#4]) +- *(1) InMemoryTableScan [none#0, none#1] +- InMemoryRelation [x#4253, y#4254, z#4255], true, 10000, StorageLevel(disk, memory, deserialized, 1 replicas) +- LocalTableScan [x#4253, y#4254, z#4255] ``` Second exchange: ``` Exchange hashpartitioning(none#0, 5) +- *(3) HashAggregate(keys=[none#0], functions=[partial_min(none#1)], output=[none#0, none#4]) +- *(3) InMemoryTableScan [none#0, none#1] +- InMemoryRelation [x#4253, y#4254, z#4255], true, 10000, StorageLevel(disk, memory, deserialized, 1 replicas) +- LocalTableScan [x#4253, y#4254, z#4255] ``` You can find that they have the canonicalized plans are the same, although we use different columns in two `InMemoryTableScan`s. Canonicalized plan after: First exchange: ``` Exchange hashpartitioning(none#0, 5) +- *(1) HashAggregate(keys=[none#0], functions=[partial_min(none#1)], output=[none#0, none#4]) +- *(1) InMemoryTableScan [none#0, none#1] +- InMemoryRelation [none#0, none#1, none#2], true, 10000, StorageLevel(memory, 1 replicas) +- LocalTableScan [none#0, none#1, none#2] ``` Second exchange: ``` Exchange hashpartitioning(none#0, 5) +- *(3) HashAggregate(keys=[none#0], functions=[partial_min(none#1)], output=[none#0, none#4]) +- *(3) InMemoryTableScan [none#0, none#2] +- InMemoryRelation [none#0, none#1, none#2], true, 10000, StorageLevel(memory, 1 replicas) +- LocalTableScan [none#0, none#1, none#2] ``` ## How was this patch tested? Added unit test. Author: Liang-Chi Hsieh Closes #20831 from viirya/SPARK-23614. 
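Exchange reuse is driven by plan canonicalization: two exchanges collapse into one when their canonicalized plans compare equal via `sameResult`, which is why the over-eager canonicalization described above made the two aggregates share a single exchange. The regression check added to `ExchangeSuite` below can also be reproduced interactively; a spark-shell style sketch assuming an active `spark` session:

```scala
import spark.implicits._

// Two scans of the same cached Dataset that project different columns.
val cached = spark.createDataset(Seq((1, 2, 3), (4, 5, 6))).cache()
val plan1  = cached.select("_1", "_2").queryExecution.executedPlan
val plan2  = cached.select("_1", "_3").queryExecution.executedPlan

// With the fix the two plans no longer canonicalize to the same thing,
// so their exchanges are not (incorrectly) reused.
println(plan1.sameResult(plan2))   // expected: false
```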
(cherry picked from commit b2edc30db1dcc6102687d20c158a2700965fdf51) Signed-off-by: Wenchen Fan --- .../execution/columnar/InMemoryRelation.scala | 10 ++++++++++ .../columnar/InMemoryTableScanExec.scala | 19 +++++++++++++------ .../org/apache/spark/sql/DatasetSuite.scala | 9 +++++++++ .../spark/sql/execution/ExchangeSuite.scala | 7 +++++++ 4 files changed, 39 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala index 22e16913d4da9..2579046e30708 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala @@ -24,6 +24,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.{HintInfo, Statistics} import org.apache.spark.sql.execution.SparkPlan @@ -68,6 +69,15 @@ case class InMemoryRelation( override protected def innerChildren: Seq[SparkPlan] = Seq(child) + override def doCanonicalize(): logical.LogicalPlan = + copy(output = output.map(QueryPlan.normalizeExprId(_, child.output)), + storageLevel = StorageLevel.NONE, + child = child.canonicalized, + tableName = None)( + _cachedColumnBuffers, + sizeInBytesStats, + statsOfPlanToCache) + override def producedAttributes: AttributeSet = outputSet @transient val partitionStatistics = new PartitionStatistics(output) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index a93e8a1ad954d..e73e1378d52e3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning} -import org.apache.spark.sql.execution.{ColumnarBatchScan, LeafExecNode, WholeStageCodegenExec} +import org.apache.spark.sql.execution.{ColumnarBatchScan, LeafExecNode, SparkPlan, WholeStageCodegenExec} import org.apache.spark.sql.execution.vectorized._ import org.apache.spark.sql.types._ import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} @@ -38,6 +38,11 @@ case class InMemoryTableScanExec( override protected def innerChildren: Seq[QueryPlan[_]] = Seq(relation) ++ super.innerChildren + override def doCanonicalize(): SparkPlan = + copy(attributes = attributes.map(QueryPlan.normalizeExprId(_, relation.output)), + predicates = predicates.map(QueryPlan.normalizeExprId(_, relation.output)), + relation = relation.canonicalized.asInstanceOf[InMemoryRelation]) + override def vectorTypes: Option[Seq[String]] = Option(Seq.fill(attributes.length)( if (!conf.offHeapColumnVectorEnabled) { @@ -169,11 +174,13 @@ case class InMemoryTableScanExec( override def outputOrdering: Seq[SortOrder] = 
relation.child.outputOrdering.map(updateAttribute(_).asInstanceOf[SortOrder]) - private def statsFor(a: Attribute) = relation.partitionStatistics.forAttribute(a) + // Keeps relation's partition statistics because we don't serialize relation. + private val stats = relation.partitionStatistics + private def statsFor(a: Attribute) = stats.forAttribute(a) // Returned filter predicate should return false iff it is impossible for the input expression // to evaluate to `true' based on statistics collected about this partition batch. - @transient val buildFilter: PartialFunction[Expression, Expression] = { + @transient lazy val buildFilter: PartialFunction[Expression, Expression] = { case And(lhs: Expression, rhs: Expression) if buildFilter.isDefinedAt(lhs) || buildFilter.isDefinedAt(rhs) => (buildFilter.lift(lhs) ++ buildFilter.lift(rhs)).reduce(_ && _) @@ -213,14 +220,14 @@ case class InMemoryTableScanExec( l.asInstanceOf[Literal] <= statsFor(a).upperBound).reduce(_ || _) } - val partitionFilters: Seq[Expression] = { + lazy val partitionFilters: Seq[Expression] = { predicates.flatMap { p => val filter = buildFilter.lift(p) val boundFilter = filter.map( BindReferences.bindReference( _, - relation.partitionStatistics.schema, + stats.schema, allowFailures = true)) boundFilter.foreach(_ => @@ -243,7 +250,7 @@ case class InMemoryTableScanExec( private def filteredCachedBatches(): RDD[CachedBatch] = { // Using these variables here to avoid serialization of entire objects (if referenced directly) // within the map Partitions closure. - val schema = relation.partitionStatistics.schema + val schema = stats.schema val schemaIndex = schema.zipWithIndex val buffers = relation.cachedColumnBuffers diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 49c59cf695dc1..9b745befcb611 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -1446,8 +1446,17 @@ class DatasetSuite extends QueryTest with SharedSQLContext { val data = Seq(("a", null)) checkDataset(data.toDS(), data: _*) } + + test("SPARK-23614: Union produces incorrect results when caching is used") { + val cached = spark.createDataset(Seq(TestDataUnion(1, 2, 3), TestDataUnion(4, 5, 6))).cache() + val group1 = cached.groupBy("x").agg(min(col("y")) as "value") + val group2 = cached.groupBy("x").agg(min(col("z")) as "value") + checkAnswer(group1.union(group2), Row(4, 5) :: Row(1, 2) :: Row(4, 6) :: Row(1, 3) :: Nil) + } } +case class TestDataUnion(x: Int, y: Int, z: Int) + case class SingleData(id: Int) case class DoubleData(id: Int, val1: String) case class TripleData(id: Int, val1: String, val2: Long) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala index 697d7e6520713..bde2de5b39fd7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala @@ -125,4 +125,11 @@ class ExchangeSuite extends SparkPlanTest with SharedSQLContext { assertConsistency(spark.range(10000).map(i => Random.nextInt(1000).toLong)) } } + + test("SPARK-23614: Fix incorrect reuse exchange when caching is used") { + val cached = spark.createDataset(Seq((1, 2, 3), (4, 5, 6))).cache() + val projection1 = cached.select("_1", "_2").queryExecution.executedPlan + val projection2 = 
cached.select("_1", "_3").queryExecution.executedPlan + assert(!projection1.sameResult(projection2)) + } } From 45761ceb241cdf48b858ff28034a049a0ac62ea3 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Fri, 23 Mar 2018 21:01:07 +0900 Subject: [PATCH 368/566] [MINOR][R] Fix R lint failure ## What changes were proposed in this pull request? The lint failure bugged me: ```R R/SQLContext.R:715:97: style: Trailing whitespace is superfluous. #' file-based streaming data source. \code{timeZone} to indicate a timezone to be used to ^ tests/fulltests/test_streaming.R:239:45: style: Commas should always have a space after. expect_equal(times[order(times$eventTime),][1, 2], 2) ^ lintr checks failed. ``` and I actually saw https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.6-ubuntu-test/500/console too. If I understood correctly, there is a try about moving to Unbuntu one. ## How was this patch tested? Manually tested by `./dev/lint-r`: ``` ... lintr checks passed. ``` Author: hyukjinkwon Closes #20879 from HyukjinKwon/minor-r-lint. (cherry picked from commit 92e952557dbd8a170d66d615e25c6c6a8399dd43) Signed-off-by: hyukjinkwon --- R/pkg/R/SQLContext.R | 2 +- R/pkg/tests/fulltests/test_streaming.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 9d0a2d5e074e4..b8e6bb302b856 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -727,7 +727,7 @@ read.jdbc <- function(url, tableName, #' @param schema The data schema defined in structType or a DDL-formatted string, this is #' required for file-based streaming data source #' @param ... additional external data source specific named options, for instance \code{path} for -#' file-based streaming data source. \code{timeZone} to indicate a timezone to be used to +#' file-based streaming data source. \code{timeZone} to indicate a timezone to be used to #' parse timestamps in the JSON/CSV data sources or partition values; If it isn't set, it #' uses the default value, session local timezone. #' @return SparkDataFrame diff --git a/R/pkg/tests/fulltests/test_streaming.R b/R/pkg/tests/fulltests/test_streaming.R index a354d50c6b54e..bfb1a046490ec 100644 --- a/R/pkg/tests/fulltests/test_streaming.R +++ b/R/pkg/tests/fulltests/test_streaming.R @@ -236,7 +236,7 @@ test_that("Watermark", { times <- collect(sql("SELECT * FROM times")) # looks like write timing can affect the first bucket; but it should be t - expect_equal(times[order(times$eventTime),][1, 2], 2) + expect_equal(times[order(times$eventTime), ][1, 2], 2) stopQuery(q) unlink(parquetPath) From ce0fbec6857a09b095c492bb93d9ebaaa1e5c074 Mon Sep 17 00:00:00 2001 From: arucard21 Date: Fri, 23 Mar 2018 21:02:34 +0900 Subject: [PATCH 369/566] [SPARK-23769][CORE] Remove comments that unnecessarily disable Scalastyle check ## What changes were proposed in this pull request? We re-enabled the Scalastyle checker on a line of code. It was previously disabled, but it does not violate any of the rules. So there's no reason to disable the Scalastyle checker here. ## How was this patch tested? We tested this by running `build/mvn scalastyle:check` after removing the comments that disable the checker. 
This check passed with no errors or warnings for Spark Core ``` [INFO] [INFO] ------------------------------------------------------------------------ [INFO] Building Spark Project Core 2.4.0-SNAPSHOT [INFO] ------------------------------------------------------------------------ [INFO] [INFO] --- scalastyle-maven-plugin:1.0.0:check (default-cli) spark-core_2.11 --- Saving to outputFile=/spark/core/target/scalastyle-output.xml Processed 485 file(s) Found 0 errors Found 0 warnings Found 0 infos ``` We did not run all tests (with `dev/run-tests`) since this Scalastyle check seemed sufficient. ## Co-contributors: chialun-yeh Hrayo712 vpourquie Author: arucard21 Closes #20880 from arucard21/scalastyle_util. (cherry picked from commit 6ac4fba69290e1c7de2c0a5863f224981dedb919) Signed-off-by: hyukjinkwon --- .../org/apache/spark/storage/BlockReplicationPolicy.scala | 4 +--- .../main/scala/org/apache/spark/util/CompletionIterator.scala | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala b/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala index 353eac60df171..0bacc34cdfd90 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala @@ -54,10 +54,9 @@ trait BlockReplicationPolicy { } object BlockReplicationUtils { - // scalastyle:off line.size.limit /** * Uses sampling algorithm by Robert Floyd. Finds a random sample in O(n) while - * minimizing space usage. Please see + * minimizing space usage. Please see * here. * * @param n total number of indices @@ -65,7 +64,6 @@ object BlockReplicationUtils { * @param r random number generator * @return list of m random unique indices */ - // scalastyle:on line.size.limit private def getSampleIds(n: Int, m: Int, r: Random): List[Int] = { val indices = (n - m + 1 to n).foldLeft(mutable.LinkedHashSet.empty[Int]) {case (set, i) => val t = r.nextInt(i) + 1 diff --git a/core/src/main/scala/org/apache/spark/util/CompletionIterator.scala b/core/src/main/scala/org/apache/spark/util/CompletionIterator.scala index 31d230d0fec8e..21acaa95c5645 100644 --- a/core/src/main/scala/org/apache/spark/util/CompletionIterator.scala +++ b/core/src/main/scala/org/apache/spark/util/CompletionIterator.scala @@ -22,9 +22,7 @@ package org.apache.spark.util * through all the elements. */ private[spark] -// scalastyle:off abstract class CompletionIterator[ +A, +I <: Iterator[A]](sub: I) extends Iterator[A] { -// scalastyle:on private[this] var completed = false def next(): A = sub.next() From ea44783ad479ea7c66abc2c280f2a3abf2a4d3af Mon Sep 17 00:00:00 2001 From: bag_of_tricks Date: Fri, 23 Mar 2018 10:36:23 -0700 Subject: [PATCH 370/566] [SPARK-23759][UI] Unable to bind Spark UI to specific host name / IP ## What changes were proposed in this pull request? Fixes SPARK-23759 by moving connector.start() after connector.setHost() Problem was created due connector.setHost(hostName) call was after connector.start() ## How was this patch tested? Patch was tested after build and deployment. This patch requires SPARK_LOCAL_IP environment variable to be set on spark-env.sh Author: bag_of_tricks Closes #20883 from felixalbani/SPARK-23759. 
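
As a quick illustration of the ordering constraint this fix restores, here is a minimal sketch using plain embedded Jetty; it is not Spark's `JettyUtils` code, and the host value and object names are illustrative only (a stand-in for whatever `SPARK_LOCAL_IP` resolves to):

```scala
import org.eclipse.jetty.server.{Server, ServerConnector}

object BindHostSketch {
  def main(args: Array[String]): Unit = {
    val server = new Server()
    val connector = new ServerConnector(server)
    // Host and port must be set while the connector is still stopped; once the
    // connector has been started it has already opened its socket, so a later
    // setHost() call is not picked up and the server keeps listening on all interfaces.
    connector.setHost("127.0.0.1") // stand-in for SPARK_LOCAL_IP
    connector.setPort(4040)
    server.addConnector(connector)
    server.start()                 // binds the connector to 127.0.0.1:4040
    println(server.getURI)
    server.stop()
  }
}
```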
(cherry picked from commit 8b56f16640fc4156aa7bd529c54469d27635b951) Signed-off-by: Marcelo Vanzin --- core/src/main/scala/org/apache/spark/ui/JettyUtils.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala index 0adeb4058b6e4..0e8a6307de6a8 100644 --- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala @@ -343,12 +343,13 @@ private[spark] object JettyUtils extends Logging { -1, connectionFactories: _*) connector.setPort(port) - connector.start() + connector.setHost(hostName) // Currently we only use "SelectChannelConnector" // Limit the max acceptor number to 8 so that we don't waste a lot of threads connector.setAcceptQueueSize(math.min(connector.getAcceptors, 8)) - connector.setHost(hostName) + + connector.start() // The number of selectors always equals to the number of acceptors minThreads += connector.getAcceptors * 2 From 523fcafc5c4a79cf3455f3ceab6d886679399495 Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Sat, 24 Mar 2018 18:21:01 -0700 Subject: [PATCH 371/566] [SPARK-23788][SS] Fix race in StreamingQuerySuite ## What changes were proposed in this pull request? The serializability test uses the same MemoryStream instance for 3 different queries. If any of those queries ask it to commit before the others have run, the rest will see empty dataframes. This can fail the test if q3 is affected. We should use one instance per query instead. ## How was this patch tested? Existing unit test. If I move q2.processAllAvailable() before starting q3, the test always fails without the fix. Author: Jose Torres Closes #20896 from jose-torres/fixrace. (cherry picked from commit 816a5496ba4caac438f70400f72bb10bfcc02418) Signed-off-by: Shixiong Zhu --- .../spark/sql/streaming/StreamingQuerySuite.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index 76201c63a2701..2b0ab33f3e96c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -532,22 +532,22 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging wi .start() } - val input = MemoryStream[Int] - val q1 = startQuery(input.toDS, "stream_serializable_test_1") - val q2 = startQuery(input.toDS.map { i => + val input = MemoryStream[Int] :: MemoryStream[Int] :: MemoryStream[Int] :: Nil + val q1 = startQuery(input(0).toDS, "stream_serializable_test_1") + val q2 = startQuery(input(1).toDS.map { i => // Emulate that `StreamingQuery` get captured with normal usage unintentionally. // It should not fail the query. q1 i }, "stream_serializable_test_2") - val q3 = startQuery(input.toDS.map { i => + val q3 = startQuery(input(2).toDS.map { i => // Emulate that `StreamingQuery` is used in executors. We should fail the query with a clear // error message. 
q1.explain() i }, "stream_serializable_test_3") try { - input.addData(1) + input.foreach(_.addData(1)) // q2 should not fail since it doesn't use `q1` in the closure q2.processAllAvailable() From 57026a1851aca9fe028cb39e8059f0bf133f3e0c Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 19 Mar 2018 09:41:43 +0100 Subject: [PATCH 372/566] [SPARK-23599][SQL] Add a UUID generator from Pseudo-Random Numbers ## What changes were proposed in this pull request? This patch adds a UUID generator from Pseudo-Random Numbers. We can use it later to have deterministic `UUID()` expression. ## How was this patch tested? Added unit tests. Author: Liang-Chi Hsieh Closes #20817 from viirya/SPARK-23599. (cherry picked from commit 4de638c1976dea74761bbe5c30da808178ee885d) Signed-off-by: Herman van Hovell --- .../catalyst/util/RandomUUIDGenerator.scala | 43 ++++++++++++++ .../util/RandomUUIDGeneratorSuite.scala | 57 +++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RandomUUIDGenerator.scala create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RandomUUIDGeneratorSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RandomUUIDGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RandomUUIDGenerator.scala new file mode 100644 index 0000000000000..4fe07a071c1ca --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RandomUUIDGenerator.scala @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.util + +import java.util.UUID + +import org.apache.commons.math3.random.MersenneTwister + +import org.apache.spark.unsafe.types.UTF8String + +/** + * This class is used to generate a UUID from Pseudo-Random Numbers. + * + * For the algorithm, see RFC 4122: A Universally Unique IDentifier (UUID) URN Namespace, + * section 4.4 "Algorithms for Creating a UUID from Truly Random or Pseudo-Random Numbers". 
+ */ +case class RandomUUIDGenerator(randomSeed: Long) { + private val random = new MersenneTwister(randomSeed) + + def getNextUUID(): UUID = { + val mostSigBits = (random.nextLong() & 0xFFFFFFFFFFFF0FFFL) | 0x0000000000004000L + val leastSigBits = (random.nextLong() | 0x8000000000000000L) & 0xBFFFFFFFFFFFFFFFL + + new UUID(mostSigBits, leastSigBits) + } + + def getNextUUIDUTF8String(): UTF8String = UTF8String.fromString(getNextUUID().toString()) +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RandomUUIDGeneratorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RandomUUIDGeneratorSuite.scala new file mode 100644 index 0000000000000..b75739e5a3a65 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RandomUUIDGeneratorSuite.scala @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.util + +import scala.util.Random + +import org.apache.spark.SparkFunSuite + +class RandomUUIDGeneratorSuite extends SparkFunSuite { + test("RandomUUIDGenerator should generate version 4, variant 2 UUIDs") { + val generator = RandomUUIDGenerator(new Random().nextLong()) + for (_ <- 0 to 100) { + val uuid = generator.getNextUUID() + assert(uuid.version() == 4) + assert(uuid.variant() == 2) + } + } + + test("UUID from RandomUUIDGenerator should be deterministic") { + val r1 = new Random(100) + val generator1 = RandomUUIDGenerator(r1.nextLong()) + val r2 = new Random(100) + val generator2 = RandomUUIDGenerator(r2.nextLong()) + val r3 = new Random(101) + val generator3 = RandomUUIDGenerator(r3.nextLong()) + + for (_ <- 0 to 100) { + val uuid1 = generator1.getNextUUID() + val uuid2 = generator2.getNextUUID() + val uuid3 = generator3.getNextUUID() + assert(uuid1 == uuid2) + assert(uuid1 != uuid3) + } + } + + test("Get UTF8String UUID") { + val generator = RandomUUIDGenerator(new Random().nextLong()) + val utf8StringUUID = generator.getNextUUIDUTF8String() + val uuid = java.util.UUID.fromString(utf8StringUUID.toString) + assert(uuid.version() == 4 && uuid.variant() == 2 && utf8StringUUID.toString == uuid.toString) + } +} From 2fd7acabf8d55789662d52d94bd30f84b05a577a Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Thu, 15 Mar 2018 19:54:58 +0100 Subject: [PATCH 373/566] [HOT-FIX] Fix SparkOutOfMemoryError: Unable to acquire 262144 bytes of memory, got 224631 ## What changes were proposed in this pull request? 
https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88263/testReport https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88260/testReport https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88257/testReport https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/88224/testReport These tests all failed: ``` org.apache.spark.memory.SparkOutOfMemoryError: Unable to acquire 262144 bytes of memory, got 224631 at org.apache.spark.memory.MemoryConsumer.throwOom(MemoryConsumer.java:157) at org.apache.spark.memory.MemoryConsumer.allocateArray(MemoryConsumer.java:98) at org.apache.spark.unsafe.map.BytesToBytesMap.allocate(BytesToBytesMap.java:787) at org.apache.spark.unsafe.map.BytesToBytesMap.(BytesToBytesMap.java:204) at org.apache.spark.unsafe.map.BytesToBytesMap.(BytesToBytesMap.java:219) ... ``` This PR ignore this test. ## How was this patch tested? N/A Author: Yuming Wang Closes #20835 from wangyum/SPARK-23598. (cherry picked from commit 15c3c983008557165cc91713ddaf2dbd6d5a506c) Signed-off-by: Herman van Hovell --- .../org/apache/spark/sql/execution/WholeStageCodegenSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index 4b40e4ef7571c..9180a22c260f1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -310,7 +310,7 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext { } } - test("SPARK-23598: Codegen working for lots of aggregation operations without runtime errors") { + ignore("SPARK-23598: Codegen working for lots of aggregation operations without runtime errors") { withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { var df = Seq((8, "bat"), (15, "mouse"), (5, "horse")).toDF("age", "name") for (i <- 0 until 70) { From 328dea6f8ffcd515face7d64c29f7af71abd88a2 Mon Sep 17 00:00:00 2001 From: "Michael (Stu) Stewart" Date: Mon, 26 Mar 2018 12:45:45 +0900 Subject: [PATCH 374/566] [SPARK-23645][MINOR][DOCS][PYTHON] Add docs RE `pandas_udf` with keyword args ## What changes were proposed in this pull request? Add documentation about the limitations of `pandas_udf` with keyword arguments and related concepts, like `functools.partial` fn objects. NOTE: intermediate commits on this PR show some of the steps that can be taken to fix some (but not all) of these pain points. ### Survey of problems we face today: (Initialize) Note: python 3.6 and spark 2.4snapshot. 
``` from pyspark.sql import SparkSession import inspect, functools from pyspark.sql.functions import pandas_udf, PandasUDFType, col, lit, udf spark = SparkSession.builder.getOrCreate() print(spark.version) df = spark.range(1,6).withColumn('b', col('id') * 2) def ok(a,b): return a+b ``` Using a keyword argument at the call site `b=...` (and yes, *full* stack trace below, haha): ``` ---> 14 df.withColumn('ok', pandas_udf(f=ok, returnType='bigint')('id', b='id')).show() # no kwargs TypeError: wrapper() got an unexpected keyword argument 'b' ``` Using partial with a keyword argument where the kw-arg is the first argument of the fn: *(Aside: kind of interesting that lines 15,16 work great and then 17 explodes)* ``` --------------------------------------------------------------------------- ValueError Traceback (most recent call last) in () 15 df.withColumn('ok', pandas_udf(f=functools.partial(ok, 7), returnType='bigint')('id')).show() 16 df.withColumn('ok', pandas_udf(f=functools.partial(ok, b=7), returnType='bigint')('id')).show() ---> 17 df.withColumn('ok', pandas_udf(f=functools.partial(ok, a=7), returnType='bigint')('id')).show() /Users/stu/ZZ/spark/python/pyspark/sql/functions.py in pandas_udf(f, returnType, functionType) 2378 return functools.partial(_create_udf, returnType=return_type, evalType=eval_type) 2379 else: -> 2380 return _create_udf(f=f, returnType=return_type, evalType=eval_type) 2381 2382 /Users/stu/ZZ/spark/python/pyspark/sql/udf.py in _create_udf(f, returnType, evalType) 54 argspec.varargs is None: 55 raise ValueError( ---> 56 "Invalid function: 0-arg pandas_udfs are not supported. " 57 "Instead, create a 1-arg pandas_udf and ignore the arg in your function." 58 ) ValueError: Invalid function: 0-arg pandas_udfs are not supported. Instead, create a 1-arg pandas_udf and ignore the arg in your function. ``` Author: Michael (Stu) Stewart Closes #20900 from mstewart141/udfkw2. (cherry picked from commit 087fb3142028d679524e22596b0ad4f74ff47e8d) Signed-off-by: hyukjinkwon Signed-off-by: hyukjinkwon --- python/pyspark/sql/functions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 316aa85550ad0..365be7be3aaa1 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2123,6 +2123,8 @@ def udf(f=None, returnType=StringType()): in boolean expressions and it ends up with being executed all internally. If the functions can fail on special rows, the workaround is to incorporate the condition into the functions. + .. note:: The user-defined functions do not take keyword arguments on the calling side. + :param f: python function if used as a standalone function :param returnType: the return type of the user-defined function. The value can be either a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. @@ -2252,6 +2254,8 @@ def pandas_udf(f=None, returnType=None, functionType=None): .. note:: The user-defined functions do not support conditional expressions or short circuiting in boolean expressions and it ends up with being executed all internally. If the functions can fail on special rows, the workaround is to incorporate the condition into the functions. + + .. note:: The user-defined functions do not take keyword arguments on the calling side. 
""" # decorator @pandas_udf(returnType, functionType) is_decorator = f is None or isinstance(f, (str, DataType)) From 1c39dfaef09538ad63e4d5d6d9a343c9bfe9f8d3 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 26 Mar 2018 20:45:20 +0200 Subject: [PATCH 375/566] [SPARK-23599][SQL][BACKPORT-2.3] Use RandomUUIDGenerator in Uuid expression ## What changes were proposed in this pull request? As stated in Jira, there are problems with current `Uuid` expression which uses `java.util.UUID.randomUUID` for UUID generation. This patch uses the newly added `RandomUUIDGenerator` for UUID generation. So we can make `Uuid` deterministic between retries. This backports SPARK-23599 to Spark 2.3. ## How was this patch tested? Added tests. Author: Liang-Chi Hsieh Closes #20903 from viirya/SPARK-23599-2.3. --- .../sql/catalyst/analysis/Analyzer.scala | 16 ++++ .../spark/sql/catalyst/expressions/misc.scala | 27 +++++-- .../ResolvedUuidExpressionsSuite.scala | 73 +++++++++++++++++++ .../expressions/ExpressionEvalHelper.scala | 1 + .../expressions/MiscExpressionsSuite.scala | 23 +++++- .../org/apache/spark/sql/DataFrameSuite.scala | 6 ++ 6 files changed, 139 insertions(+), 7 deletions(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolvedUuidExpressionsSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 251099f750cf6..9cc928c8ac108 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.analysis import scala.collection.mutable.ArrayBuffer +import scala.util.Random import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst._ @@ -177,6 +178,7 @@ class Analyzer( TimeWindowing :: ResolveInlineTables(conf) :: ResolveTimeZone(conf) :: + ResolvedUuidExpressions :: TypeCoercion.typeCoercionRules(conf) ++ extendedResolutionRules : _*), Batch("Post-Hoc Resolution", Once, postHocResolutionRules: _*), @@ -1994,6 +1996,20 @@ class Analyzer( } } + /** + * Set the seed for random number generation in Uuid expressions. + */ + object ResolvedUuidExpressions extends Rule[LogicalPlan] { + private lazy val random = new Random() + + override def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { + case p if p.resolved => p + case p => p transformExpressionsUp { + case Uuid(None) => Uuid(Some(random.nextLong())) + } + } + } + /** * Correctly handle null primitive inputs for UDF by adding extra [[If]] expression to do the * null check. 
When user defines a UDF with primitive parameters, there is no way to tell if the diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala index 4b9006ab5b423..cdbe611fdd064 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala @@ -21,6 +21,7 @@ import java.util.UUID import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen._ +import org.apache.spark.sql.catalyst.util.RandomUUIDGenerator import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -117,18 +118,34 @@ case class CurrentDatabase() extends LeafExpression with Unevaluable { 46707d92-02f4-4817-8116-a4c3b23e6266 """) // scalastyle:on line.size.limit -case class Uuid() extends LeafExpression { +case class Uuid(randomSeed: Option[Long] = None) extends LeafExpression with Nondeterministic { - override lazy val deterministic: Boolean = false + def this() = this(None) + + override lazy val resolved: Boolean = randomSeed.isDefined override def nullable: Boolean = false override def dataType: DataType = StringType - override def eval(input: InternalRow): Any = UTF8String.fromString(UUID.randomUUID().toString) + @transient private[this] var randomGenerator: RandomUUIDGenerator = _ + + + override protected def initializeInternal(partitionIndex: Int): Unit = + randomGenerator = RandomUUIDGenerator(randomSeed.get + partitionIndex) + + override protected def evalInternal(input: InternalRow): Any = + randomGenerator.getNextUUIDUTF8String() override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - ev.copy(code = s"final UTF8String ${ev.value} = " + - s"UTF8String.fromString(java.util.UUID.randomUUID().toString());", isNull = "false") + val randomGen = ctx.freshName("randomGen") + ctx.addMutableState("org.apache.spark.sql.catalyst.util.RandomUUIDGenerator", randomGen, + forceInline = true, + useFreshName = false) + ctx.addPartitionInitializationStatement(s"$randomGen = " + + "new org.apache.spark.sql.catalyst.util.RandomUUIDGenerator(" + + s"${randomSeed.get}L + partitionIndex);") + ev.copy(code = s"final UTF8String ${ev.value} = $randomGen.getNextUUIDUTF8String();", + isNull = "false") } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolvedUuidExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolvedUuidExpressionsSuite.scala new file mode 100644 index 0000000000000..fe57c199b8744 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolvedUuidExpressionsSuite.scala @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} + +/** + * Test suite for resolving Uuid expressions. + */ +class ResolvedUuidExpressionsSuite extends AnalysisTest { + + private lazy val a = 'a.int + private lazy val r = LocalRelation(a) + private lazy val uuid1 = Uuid().as('_uuid1) + private lazy val uuid2 = Uuid().as('_uuid2) + private lazy val uuid3 = Uuid().as('_uuid3) + private lazy val uuid1Ref = uuid1.toAttribute + + private val analyzer = getAnalyzer(caseSensitive = true) + + private def getUuidExpressions(plan: LogicalPlan): Seq[Uuid] = { + plan.flatMap { + case p => + p.expressions.flatMap(_.collect { + case u: Uuid => u + }) + } + } + + test("analyzed plan sets random seed for Uuid expression") { + val plan = r.select(a, uuid1) + val resolvedPlan = analyzer.executeAndCheck(plan) + getUuidExpressions(resolvedPlan).foreach { u => + assert(u.resolved) + assert(u.randomSeed.isDefined) + } + } + + test("Uuid expressions should have different random seeds") { + val plan = r.select(a, uuid1).groupBy(uuid1Ref)(uuid2, uuid3) + val resolvedPlan = analyzer.executeAndCheck(plan) + assert(getUuidExpressions(resolvedPlan).map(_.randomSeed.get).distinct.length == 3) + } + + test("Different analyzed plans should have different random seeds in Uuids") { + val plan = r.select(a, uuid1).groupBy(uuid1Ref)(uuid2, uuid3) + val resolvedPlan1 = analyzer.executeAndCheck(plan) + val resolvedPlan2 = analyzer.executeAndCheck(plan) + val uuids1 = getUuidExpressions(resolvedPlan1) + val uuids2 = getUuidExpressions(resolvedPlan2) + assert(uuids1.distinct.length == 3) + assert(uuids2.distinct.length == 3) + assert(uuids1.intersect(uuids2).length == 0) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala index b4c8eab19c5cc..fad26f0055d6c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala @@ -162,6 +162,7 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks { Alias(expression, s"Optimized($expression)1")() :: Alias(expression, s"Optimized($expression)2")() :: Nil), expression) + plan.initialize(0) val unsafeRow = plan(inputRow) val input = if (inputRow == EmptyRow) "" else s", input: $inputRow" diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscExpressionsSuite.scala index facc863081303..593bcd02078ce 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscExpressionsSuite.scala @@ -17,8 +17,11 @@ package org.apache.spark.sql.catalyst.expressions +import scala.util.Random + import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String class MiscExpressionsSuite extends SparkFunSuite with 
ExpressionEvalHelper { @@ -40,7 +43,23 @@ class MiscExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("uuid") { - checkEvaluation(Length(Uuid()), 36) - assert(evaluate(Uuid()) !== evaluate(Uuid())) + def assertIncorrectEval(f: () => Unit): Unit = { + intercept[Exception] { + f() + }.getMessage().contains("Incorrect evaluation") + } + + checkEvaluation(Length(Uuid(Some(0))), 36) + val r = new Random() + val seed1 = Some(r.nextLong()) + val uuid1 = evaluate(Uuid(seed1)).asInstanceOf[UTF8String] + checkEvaluation(Uuid(seed1), uuid1.toString) + + val seed2 = Some(r.nextLong()) + val uuid2 = evaluate(Uuid(seed2)).asInstanceOf[UTF8String] + assertIncorrectEval(() => checkEvaluationWithoutCodegen(Uuid(seed1), uuid2)) + assertIncorrectEval(() => checkEvaluationWithGeneratedMutableProjection(Uuid(seed1), uuid2)) + assertIncorrectEval(() => checkEvalutionWithUnsafeProjection(Uuid(seed1), uuid2)) + assertIncorrectEval(() => checkEvaluationWithOptimization(Uuid(seed1), uuid2)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 8b66f77b2f923..f7b3393f65cb1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -28,6 +28,7 @@ import org.scalatest.Matchers._ import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.expressions.Uuid import org.apache.spark.sql.catalyst.plans.logical.{Filter, OneRowRelation, Union} import org.apache.spark.sql.execution.{FilterExec, QueryExecution, WholeStageCodegenExec} import org.apache.spark.sql.execution.aggregate.HashAggregateExec @@ -2264,4 +2265,9 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { checkAnswer(df, Row(0, 10) :: Nil) assert(df.queryExecution.executedPlan.isInstanceOf[WholeStageCodegenExec]) } + + test("Uuid expressions should produce same results at retries in the same DataFrame") { + val df = spark.range(1).select($"id", new Column(Uuid())) + checkAnswer(df, df.collect()) + } } From 38c0bd7db86ca2b7e167b89338028863bcc26906 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Thu, 29 Mar 2018 16:37:46 +0800 Subject: [PATCH 376/566] =?UTF-8?q?[SPARK-23806]=20Broadcast.unpersist=20c?= =?UTF-8?q?an=20cause=20fatal=20exception=20when=20used=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … with dynamic allocation ## What changes were proposed in this pull request? ignore errors when you are waiting for a broadcast.unpersist. This is handling it the same way as doing rdd.unpersist in https://issues.apache.org/jira/browse/SPARK-22618 ## How was this patch tested? Patch was tested manually against a couple jobs that exhibit this behavior, with the change the application no longer dies due to this and just prints the warning. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Thomas Graves Closes #20924 from tgravescs/SPARK-23806. 
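
As an aside, the fix in the diff below relies on the standard `Future.recover` pattern: turn an expected, benign failure into a normal result so that `Future.sequence` does not fail the whole cleanup. A minimal self-contained sketch of that pattern follows; the helper, executor names and failure mode are made up for illustration and are not Spark APIs:

```scala
import java.io.IOException

import scala.concurrent.{Await, Future}
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._

object RecoverOnCleanupSketch {
  // Hypothetical remote cleanup call that may fail, e.g. because the executor
  // holding the blocks was already released by dynamic allocation.
  def removeBroadcastOn(executor: String): Future[Int] =
    if (executor == "lost-executor") Future.failed(new IOException("connection reset"))
    else Future.successful(1)

  def main(args: Array[String]): Unit = {
    val executors = Seq("exec-1", "lost-executor", "exec-2")
    // recover maps the expected IOException to "0 blocks removed", so the
    // combined future succeeds even when one endpoint is already gone.
    val futures = executors.map { e =>
      removeBroadcastOn(e).recover { case _: IOException => 0 }
    }
    val removed = Await.result(Future.sequence(futures), 10.seconds).sum
    println(s"removed $removed blocks")
  }
}
```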
(cherry picked from commit 641aec68e8167546dbb922874c086c9b90198f08) Signed-off-by: Wenchen Fan --- .../spark/storage/BlockManagerMasterEndpoint.scala | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala index 89a6a71a589a1..56b95c31eb4c3 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala @@ -192,11 +192,15 @@ class BlockManagerMasterEndpoint( val requiredBlockManagers = blockManagerInfo.values.filter { info => removeFromDriver || !info.blockManagerId.isDriver } - Future.sequence( - requiredBlockManagers.map { bm => - bm.slaveEndpoint.ask[Int](removeMsg) - }.toSeq - ) + val futures = requiredBlockManagers.map { bm => + bm.slaveEndpoint.ask[Int](removeMsg).recover { + case e: IOException => + logWarning(s"Error trying to remove broadcast $broadcastId", e) + 0 // zero blocks were removed + } + }.toSeq + + Future.sequence(futures) } private def removeBlockManager(blockManagerId: BlockManagerId) { From 0bfbcaf6696570b74923047266b00ba4dc2ba97c Mon Sep 17 00:00:00 2001 From: Sahil Takiar Date: Thu, 29 Mar 2018 10:23:23 -0700 Subject: [PATCH 377/566] [SPARK-23785][LAUNCHER] LauncherBackend doesn't check state of connection before setting state ## What changes were proposed in this pull request? Changed `LauncherBackend` `set` method so that it checks if the connection is open or not before writing to it (uses `isConnected`). ## How was this patch tested? None Author: Sahil Takiar Closes #20893 from sahilTakiar/master. (cherry picked from commit 491ec114fd3886ebd9fa29a482e3d112fb5a088c) Signed-off-by: Marcelo Vanzin --- .../spark/launcher/LauncherBackend.scala | 6 +++--- .../spark/launcher/LauncherServerSuite.java | 20 +++++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/launcher/LauncherBackend.scala b/core/src/main/scala/org/apache/spark/launcher/LauncherBackend.scala index aaae33ca4e6f3..1b049b786023a 100644 --- a/core/src/main/scala/org/apache/spark/launcher/LauncherBackend.scala +++ b/core/src/main/scala/org/apache/spark/launcher/LauncherBackend.scala @@ -67,13 +67,13 @@ private[spark] abstract class LauncherBackend { } def setAppId(appId: String): Unit = { - if (connection != null) { + if (connection != null && isConnected) { connection.send(new SetAppId(appId)) } } def setState(state: SparkAppHandle.State): Unit = { - if (connection != null && lastState != state) { + if (connection != null && isConnected && lastState != state) { connection.send(new SetState(state)) lastState = state } @@ -114,10 +114,10 @@ private[spark] abstract class LauncherBackend { override def close(): Unit = { try { + _isConnected = false super.close() } finally { onDisconnected() - _isConnected = false } } diff --git a/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java b/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java index d16337a319be3..5413d3a416545 100644 --- a/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java +++ b/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java @@ -185,6 +185,26 @@ public void testStreamFiltering() throws Exception { } } + @Test + public void testAppHandleDisconnect() throws Exception { + LauncherServer server = 
LauncherServer.getOrCreateServer(); + ChildProcAppHandle handle = new ChildProcAppHandle(server); + String secret = server.registerHandle(handle); + + TestClient client = null; + try { + Socket s = new Socket(InetAddress.getLoopbackAddress(), server.getPort()); + client = new TestClient(s); + client.send(new Hello(secret, "1.4.0")); + handle.disconnect(); + waitForError(client, secret); + } finally { + handle.kill(); + close(client); + client.clientThread.join(); + } + } + private void close(Closeable c) { if (c != null) { try { From 516304521a9bd2632bd1a42c9161b1216a8ead92 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Thu, 29 Mar 2018 10:46:28 -0700 Subject: [PATCH 378/566] [SPARK-23639][SQL] Obtain token before init metastore client in SparkSQL CLI ## What changes were proposed in this pull request? In SparkSQLCLI, SessionState generates before SparkContext instantiating. When we use --proxy-user to impersonate, it's unable to initializing a metastore client to talk to the secured metastore for no kerberos ticket. This PR use real user ugi to obtain token for owner before talking to kerberized metastore. ## How was this patch tested? Manually verified with kerberized hive metasotre / hdfs. Author: Kent Yao Closes #20784 from yaooqinn/SPARK-23639. (cherry picked from commit a7755fd8ce2f022118b9827aaac7d5d59f0f297a) Signed-off-by: Marcelo Vanzin --- .../deploy/security/HiveDelegationTokenProvider.scala | 8 ++++---- .../spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala | 9 +++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/security/HiveDelegationTokenProvider.scala b/core/src/main/scala/org/apache/spark/deploy/security/HiveDelegationTokenProvider.scala index ece5ce79c650d..7249eb85ac7c7 100644 --- a/core/src/main/scala/org/apache/spark/deploy/security/HiveDelegationTokenProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/security/HiveDelegationTokenProvider.scala @@ -36,7 +36,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.internal.config.KEYTAB import org.apache.spark.util.Utils -private[security] class HiveDelegationTokenProvider +private[spark] class HiveDelegationTokenProvider extends HadoopDelegationTokenProvider with Logging { override def serviceName: String = "hive" @@ -124,9 +124,9 @@ private[security] class HiveDelegationTokenProvider val currentUser = UserGroupInformation.getCurrentUser() val realUser = Option(currentUser.getRealUser()).getOrElse(currentUser) - // For some reason the Scala-generated anonymous class ends up causing an - // UndeclaredThrowableException, even if you annotate the method with @throws. - try { + // For some reason the Scala-generated anonymous class ends up causing an + // UndeclaredThrowableException, even if you annotate the method with @throws. 
+ try { realUser.doAs(new PrivilegedExceptionAction[T]() { override def run(): T = fn }) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala index 832a15d09599f..084f8200102ba 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala @@ -34,11 +34,13 @@ import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.exec.Utilities import org.apache.hadoop.hive.ql.processors._ import org.apache.hadoop.hive.ql.session.SessionState +import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.log4j.{Level, Logger} import org.apache.thrift.transport.TSocket import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.deploy.security.HiveDelegationTokenProvider import org.apache.spark.internal.Logging import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.hive.HiveUtils @@ -121,6 +123,13 @@ private[hive] object SparkSQLCLIDriver extends Logging { } } + val tokenProvider = new HiveDelegationTokenProvider() + if (tokenProvider.delegationTokensRequired(sparkConf, hadoopConf)) { + val credentials = new Credentials() + tokenProvider.obtainDelegationTokens(hadoopConf, sparkConf, credentials) + UserGroupInformation.getCurrentUser.addCredentials(credentials) + } + SessionState.start(sessionState) // Clean up after we exit From 1365d739d9d4d9cdc81b8b0c6b80ddad15227cba Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Thu, 29 Mar 2018 21:36:56 -0700 Subject: [PATCH 379/566] [SPARK-23808][SQL] Set default Spark session in test-only spark sessions. ## What changes were proposed in this pull request? Set default Spark session in the TestSparkSession and TestHiveSparkSession constructors. ## How was this patch tested? new unit tests Author: Jose Torres Closes #20926 from jose-torres/test3. 
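
For context, the behavior the test-only sessions adopt here mirrors what `SparkSession.builder().getOrCreate()` already does for applications: the created session is registered as the default, so code that holds no direct reference to it can still look it up. A small sketch of that contract, with illustrative names only:

```scala
import org.apache.spark.sql.SparkSession

object DefaultSessionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("default-session-sketch")
      .getOrCreate()
    // getOrCreate() registers the session as the default, so library or test
    // helper code with no session reference can still find it.
    assert(SparkSession.getDefaultSession.contains(spark))
    spark.stop()
  }
}
```

Registering the test session in its constructor keeps test code on the same lookup path as production code.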
(cherry picked from commit b348901192b231153b58fe5720253168c87963d4) Signed-off-by: gatorsmile --- .../spark/sql/test/TestSQLContext.scala | 2 ++ .../sql/test/TestSparkSessionSuite.scala | 29 +++++++++++++++++++ .../apache/spark/sql/hive/test/TestHive.scala | 4 +++ 3 files changed, 35 insertions(+) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/test/TestSparkSessionSuite.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala index 4286e8a6ca2c8..3038b822beb4a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala @@ -34,6 +34,8 @@ private[spark] class TestSparkSession(sc: SparkContext) extends SparkSession(sc) this(new SparkConf) } + SparkSession.setDefaultSession(this) + @transient override lazy val sessionState: SessionState = { new TestSQLSessionStateBuilder(this, None).build() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/TestSparkSessionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/TestSparkSessionSuite.scala new file mode 100644 index 0000000000000..4019c6888da98 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/TestSparkSessionSuite.scala @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.test + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.SparkSession + +class TestSparkSessionSuite extends SparkFunSuite { + test("default session is set in constructor") { + val session = new TestSparkSession() + assert(SparkSession.getDefaultSession.contains(session)) + session.stop() + } +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index 19028939f3673..8b0fe56b24fcd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -159,6 +159,10 @@ private[hive] class TestHiveSparkSession( private val loadTestTables: Boolean) extends SparkSession(sc) with Logging { self => + // TODO(SPARK-23826): TestHiveSparkSession should set default session the same way as + // TestSparkSession, but doing this the same way breaks many tests in the package. We need + // to investigate and find a different strategy. 
+ def this(sc: SparkContext, loadTestTables: Boolean) { this( sc, From 3f5955aa0a16707a31fa9089d4babd64b5eed6ff Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 30 Mar 2018 10:25:17 -0700 Subject: [PATCH 380/566] Revert "[SPARK-23785][LAUNCHER] LauncherBackend doesn't check state of connection before setting state" This reverts commit 0bfbcaf6696570b74923047266b00ba4dc2ba97c. --- .../spark/launcher/LauncherBackend.scala | 6 +++--- .../spark/launcher/LauncherServerSuite.java | 20 ------------------- 2 files changed, 3 insertions(+), 23 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/launcher/LauncherBackend.scala b/core/src/main/scala/org/apache/spark/launcher/LauncherBackend.scala index 1b049b786023a..aaae33ca4e6f3 100644 --- a/core/src/main/scala/org/apache/spark/launcher/LauncherBackend.scala +++ b/core/src/main/scala/org/apache/spark/launcher/LauncherBackend.scala @@ -67,13 +67,13 @@ private[spark] abstract class LauncherBackend { } def setAppId(appId: String): Unit = { - if (connection != null && isConnected) { + if (connection != null) { connection.send(new SetAppId(appId)) } } def setState(state: SparkAppHandle.State): Unit = { - if (connection != null && isConnected && lastState != state) { + if (connection != null && lastState != state) { connection.send(new SetState(state)) lastState = state } @@ -114,10 +114,10 @@ private[spark] abstract class LauncherBackend { override def close(): Unit = { try { - _isConnected = false super.close() } finally { onDisconnected() + _isConnected = false } } diff --git a/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java b/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java index 5413d3a416545..d16337a319be3 100644 --- a/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java +++ b/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java @@ -185,26 +185,6 @@ public void testStreamFiltering() throws Exception { } } - @Test - public void testAppHandleDisconnect() throws Exception { - LauncherServer server = LauncherServer.getOrCreateServer(); - ChildProcAppHandle handle = new ChildProcAppHandle(server); - String secret = server.registerHandle(handle); - - TestClient client = null; - try { - Socket s = new Socket(InetAddress.getLoopbackAddress(), server.getPort()); - client = new TestClient(s); - client.send(new Hello(secret, "1.4.0")); - handle.disconnect(); - waitForError(client, secret); - } finally { - handle.kill(); - close(client); - client.clientThread.join(); - } - } - private void close(Closeable c) { if (c != null) { try { From 507cff246cd9e15a418d67b66bf762be4ae71c67 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Fri, 30 Mar 2018 16:48:26 -0700 Subject: [PATCH 381/566] [SPARK-23827][SS] StreamingJoinExec should ensure that input data is partitioned into specific number of partitions ## What changes were proposed in this pull request? Currently, the requiredChildDistribution does not specify the partitions. This can cause the weird corner cases where the child's distribution is `SinglePartition` which satisfies the required distribution of `ClusterDistribution(no-num-partition-requirement)`, thus eliminating the shuffle needed to repartition input data into the required number of partitions (i.e. same as state stores). That can lead to "file not found" errors on the state store delta files as the micro-batch-with-no-shuffle will not run certain tasks and therefore not generate the expected state store delta files. 
This PR adds the required constraint on the number of partitions. ## How was this patch tested? Modified test harness to always check that ANY stateful operator should have a constraint on the number of partitions. As part of that, the existing opt-in checks on child output partitioning were removed, as they are redundant. Author: Tathagata Das Closes #20941 from tdas/SPARK-23827. (cherry picked from commit 15298b99ac8944e781328423289586176cf824d7) Signed-off-by: Tathagata Das --- .../streaming/IncrementalExecution.scala | 2 +- .../StreamingSymmetricHashJoinExec.scala | 3 +- .../sql/streaming/DeduplicateSuite.scala | 8 +-- .../FlatMapGroupsWithStateSuite.scala | 5 +- .../sql/streaming/StatefulOperatorTest.scala | 49 ------------------- .../spark/sql/streaming/StreamTest.scala | 19 +++++++ .../streaming/StreamingAggregationSuite.scala | 4 +- 7 files changed, 25 insertions(+), 65 deletions(-) delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/streaming/StatefulOperatorTest.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala index a10ed5f2df1b5..1a83c884d55bd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala @@ -62,7 +62,7 @@ class IncrementalExecution( StreamingDeduplicationStrategy :: Nil } - private val numStateStores = offsetSeqMetadata.conf.get(SQLConf.SHUFFLE_PARTITIONS.key) + private[sql] val numStateStores = offsetSeqMetadata.conf.get(SQLConf.SHUFFLE_PARTITIONS.key) .map(SQLConf.SHUFFLE_PARTITIONS.valueConverter) .getOrElse(sparkSession.sessionState.conf.numShufflePartitions) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala index c351f658cb955..fa7c8ee906ecd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala @@ -167,7 +167,8 @@ case class StreamingSymmetricHashJoinExec( val nullRight = new GenericInternalRow(right.output.map(_.withNullability(true)).length) override def requiredChildDistribution: Seq[Distribution] = - ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil + ClusteredDistribution(leftKeys, stateInfo.map(_.numPartitions)) :: + ClusteredDistribution(rightKeys, stateInfo.map(_.numPartitions)) :: Nil override def output: Seq[Attribute] = joinType match { case _: InnerLike => left.output ++ right.output diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeduplicateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeduplicateSuite.scala index caf2bab8a5859..0088b64d6195e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeduplicateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeduplicateSuite.scala @@ -25,9 +25,7 @@ import org.apache.spark.sql.execution.streaming.{MemoryStream, StreamingDeduplic import org.apache.spark.sql.execution.streaming.state.StateStore import org.apache.spark.sql.functions._ -class DeduplicateSuite extends StateStoreMetricsTest - with BeforeAndAfterAll - with StatefulOperatorTest { +class 
DeduplicateSuite extends StateStoreMetricsTest with BeforeAndAfterAll { import testImplicits._ @@ -44,8 +42,6 @@ class DeduplicateSuite extends StateStoreMetricsTest AddData(inputData, "a"), CheckLastBatch("a"), assertNumStateRows(total = 1, updated = 1), - AssertOnQuery(sq => - checkChildOutputHashPartitioning[StreamingDeduplicateExec](sq, Seq("value"))), AddData(inputData, "a"), CheckLastBatch(), assertNumStateRows(total = 1, updated = 0), @@ -63,8 +59,6 @@ class DeduplicateSuite extends StateStoreMetricsTest AddData(inputData, "a" -> 1), CheckLastBatch("a" -> 1), assertNumStateRows(total = 1, updated = 1), - AssertOnQuery(sq => - checkChildOutputHashPartitioning[StreamingDeduplicateExec](sq, Seq("_1"))), AddData(inputData, "a" -> 2), // Dropped CheckLastBatch(), assertNumStateRows(total = 1, updated = 0), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala index de2b51678cea6..b1416bff87ee7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala @@ -42,8 +42,7 @@ case class RunningCount(count: Long) case class Result(key: Long, count: Int) class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest - with BeforeAndAfterAll - with StatefulOperatorTest { + with BeforeAndAfterAll { import testImplicits._ import GroupStateImpl._ @@ -618,8 +617,6 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest AddData(inputData, "a"), CheckLastBatch(("a", "1")), assertNumStateRows(total = 1, updated = 1), - AssertOnQuery(sq => checkChildOutputHashPartitioning[FlatMapGroupsWithStateExec]( - sq, Seq("value"))), AddData(inputData, "a", "b"), CheckLastBatch(("a", "2"), ("b", "1")), assertNumStateRows(total = 2, updated = 2), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StatefulOperatorTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StatefulOperatorTest.scala deleted file mode 100644 index 45142278993bb..0000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StatefulOperatorTest.scala +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.streaming - -import org.apache.spark.sql.catalyst.plans.physical._ -import org.apache.spark.sql.execution.streaming._ - -trait StatefulOperatorTest { - /** - * Check that the output partitioning of a child operator of a Stateful operator satisfies the - * distribution that we expect for our Stateful operator. 
- */ - protected def checkChildOutputHashPartitioning[T <: StatefulOperator]( - sq: StreamingQuery, - colNames: Seq[String]): Boolean = { - val attr = sq.asInstanceOf[StreamExecution].lastExecution.analyzed.output - val partitions = sq.sparkSession.sessionState.conf.numShufflePartitions - val groupingAttr = attr.filter(a => colNames.contains(a.name)) - checkChildOutputPartitioning(sq, HashPartitioning(groupingAttr, partitions)) - } - - /** - * Check that the output partitioning of a child operator of a Stateful operator satisfies the - * distribution that we expect for our Stateful operator. - */ - protected def checkChildOutputPartitioning[T <: StatefulOperator]( - sq: StreamingQuery, - expectedPartitioning: Partitioning): Boolean = { - val operator = sq.asInstanceOf[StreamExecution].lastExecution - .executedPlan.collect { case p: T => p } - operator.head.children.forall( - _.outputPartitioning.numPartitions == expectedPartitioning.numPartitions) - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index d6433562fb29b..c2620d197b832 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -37,6 +37,7 @@ import org.apache.spark.SparkEnv import org.apache.spark.sql.{Dataset, Encoder, QueryTest, Row} import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.physical.AllTuples import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.execution.streaming._ @@ -415,6 +416,24 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be } } + val lastExecution = currentStream.lastExecution + if (currentStream.isInstanceOf[MicroBatchExecution] && lastExecution != null) { + // Verify if stateful operators have correct metadata and distribution + // This can often catch hard to debug errors when developing stateful operators + lastExecution.executedPlan.collect { case s: StatefulOperator => s }.foreach { s => + assert(s.stateInfo.map(_.numPartitions).contains(lastExecution.numStateStores)) + s.requiredChildDistribution.foreach { d => + withClue(s"$s specifies incorrect # partitions in requiredChildDistribution $d") { + assert(d.requiredNumPartitions.isDefined) + assert(d.requiredNumPartitions.get >= 1) + if (d != AllTuples) { + assert(d.requiredNumPartitions.get == s.stateInfo.get.numPartitions) + } + } + } + } + } + val (latestBatchData, allData) = sink match { case s: MemorySink => (s.latestBatchData, s.allData) case s: MemorySinkV2 => (s.latestBatchData, s.allData) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala index 97e065193fd05..1cae8cb8d47f1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala @@ -44,7 +44,7 @@ object FailureSingleton { } class StreamingAggregationSuite extends StateStoreMetricsTest - with BeforeAndAfterAll with Assertions with StatefulOperatorTest { + with BeforeAndAfterAll with Assertions { override def afterAll(): Unit = { 
super.afterAll() @@ -281,8 +281,6 @@ class StreamingAggregationSuite extends StateStoreMetricsTest AddData(inputData, 0L, 5L, 5L, 10L), AdvanceManualClock(10 * 1000), CheckLastBatch((0L, 1), (5L, 2), (10L, 1)), - AssertOnQuery(sq => - checkChildOutputHashPartitioning[StateStoreRestoreExec](sq, Seq("value"))), // advance clock to 20 seconds, should retain keys >= 10 AddData(inputData, 15L, 15L, 20L), From f1f10da2b4a0913dd230b4a4de95bf05a4672dce Mon Sep 17 00:00:00 2001 From: Xianjin YE Date: Sun, 1 Apr 2018 22:58:52 +0800 Subject: [PATCH 382/566] [SPARK-23040][BACKPORT][CORE] Returns interruptible iterator for shuffle reader Backport https://github.com/apache/spark/pull/20449 and https://github.com/apache/spark/pull/20920 to branch-2.3 --- ## What changes were proposed in this pull request? Before this commit, a non-interruptible iterator is returned if aggregator or ordering is specified. This commit also ensures that sorter is closed even when task is cancelled(killed) in the middle of sorting. ## How was this patch tested? Add a unit test in JobCancellationSuite Author: Xianjin YE Author: Xingbo Jiang Closes #20954 from jiangxb1987/SPARK-23040-2.3. --- .../shuffle/BlockStoreShuffleReader.scala | 14 +++- .../apache/spark/JobCancellationSuite.scala | 65 ++++++++++++++++++- 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala b/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala index 0562d45ff57c5..2a77a1cab976a 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala @@ -95,7 +95,7 @@ private[spark] class BlockStoreShuffleReader[K, C]( } // Sort the output if there is a sort ordering defined. - dep.keyOrdering match { + val resultIter = dep.keyOrdering match { case Some(keyOrd: Ordering[K]) => // Create an ExternalSorter to sort the data. val sorter = @@ -104,9 +104,21 @@ private[spark] class BlockStoreShuffleReader[K, C]( context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) context.taskMetrics().incPeakExecutionMemory(sorter.peakMemoryUsedBytes) + // Use completion callback to stop sorter if task was finished/cancelled. + context.addTaskCompletionListener(_ => { + sorter.stop() + }) CompletionIterator[Product2[K, C], Iterator[Product2[K, C]]](sorter.iterator, sorter.stop()) case None => aggregatedIter } + + resultIter match { + case _: InterruptibleIterator[Product2[K, C]] => resultIter + case _ => + // Use another interruptible iterator here to support task cancellation as aggregator + // or(and) sorter may have consumed previous interruptible iterator. 
+ new InterruptibleIterator[Product2[K, C]](context, resultIter) + } } } diff --git a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala index 8a77aea75a992..3b793bb231cf3 100644 --- a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala +++ b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark import java.util.concurrent.Semaphore +import java.util.concurrent.atomic.AtomicInteger import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent.Future @@ -26,7 +27,7 @@ import scala.concurrent.duration._ import org.scalatest.BeforeAndAfter import org.scalatest.Matchers -import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart} +import org.apache.spark.scheduler.{SparkListener, SparkListenerStageCompleted, SparkListenerTaskEnd, SparkListenerTaskStart} import org.apache.spark.util.ThreadUtils /** @@ -40,6 +41,10 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft override def afterEach() { try { resetSparkContext() + JobCancellationSuite.taskStartedSemaphore.drainPermits() + JobCancellationSuite.taskCancelledSemaphore.drainPermits() + JobCancellationSuite.twoJobsSharingStageSemaphore.drainPermits() + JobCancellationSuite.executionOfInterruptibleCounter.set(0) } finally { super.afterEach() } @@ -320,6 +325,62 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft f2.get() } + test("interruptible iterator of shuffle reader") { + // In this test case, we create a Spark job of two stages. The second stage is cancelled during + // execution and a counter is used to make sure that the corresponding tasks are indeed + // cancelled. + import JobCancellationSuite._ + sc = new SparkContext("local[2]", "test interruptible iterator") + + val taskCompletedSem = new Semaphore(0) + + sc.addSparkListener(new SparkListener { + override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { + // release taskCancelledSemaphore when cancelTasks event has been posted + if (stageCompleted.stageInfo.stageId == 1) { + taskCancelledSemaphore.release(1000) + } + } + + override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { + if (taskEnd.stageId == 1) { // make sure tasks are completed + taskCompletedSem.release() + } + } + }) + + val f = sc.parallelize(1 to 1000).map { i => (i, i) } + .repartitionAndSortWithinPartitions(new HashPartitioner(1)) + .mapPartitions { iter => + taskStartedSemaphore.release() + iter + }.foreachAsync { x => + if (x._1 >= 10) { + // This block of code is partially executed. It will be blocked when x._1 >= 10 and the + // next iteration will be cancelled if the source iterator is interruptible. Then in this + // case, the maximum num of increment would be 10(|1...10|) + taskCancelledSemaphore.acquire() + } + executionOfInterruptibleCounter.getAndIncrement() + } + + taskStartedSemaphore.acquire() + // Job is cancelled when: + // 1. task in reduce stage has been started, guaranteed by previous line. + // 2. task in reduce stage is blocked after processing at most 10 records as + // taskCancelledSemaphore is not released until cancelTasks event is posted + // After job being cancelled, task in reduce stage will be cancelled and no more iteration are + // executed. 
+ f.cancel() + + val e = intercept[SparkException](f.get()).getCause + assert(e.getMessage.contains("cancelled") || e.getMessage.contains("killed")) + + // Make sure tasks are indeed completed. + taskCompletedSem.acquire() + assert(executionOfInterruptibleCounter.get() <= 10) + } + def testCount() { // Cancel before launching any tasks { @@ -381,7 +442,9 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft object JobCancellationSuite { + // To avoid any headaches, reset these global variables in the companion class's afterEach block val taskStartedSemaphore = new Semaphore(0) val taskCancelledSemaphore = new Semaphore(0) val twoJobsSharingStageSemaphore = new Semaphore(0) + val executionOfInterruptibleCounter = new AtomicInteger(0) } From 6ca6483c122baa40d69c1781bb34a3cd9e1361c0 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 3 Apr 2018 09:31:47 +0800 Subject: [PATCH 383/566] [SPARK-19964][CORE] Avoid reading from remote repos in SparkSubmitSuite. These tests can fail with a timeout if the remote repos are not responding, or slow. The tests don't need anything from those repos, so use an empty ivy config file to avoid setting up the defaults. The tests are passing reliably for me locally now, and failing more often than not today without this change since http://dl.bintray.com/spark-packages/maven doesn't seem to be loading from my machine. Author: Marcelo Vanzin Closes #20916 from vanzin/SPARK-19964. (cherry picked from commit 441d0d0766e9a6ac4c6ff79680394999ff7191fd) Signed-off-by: hyukjinkwon --- .../org/apache/spark/deploy/DependencyUtils.scala | 13 ++++++++----- .../scala/org/apache/spark/deploy/SparkSubmit.scala | 3 ++- .../apache/spark/deploy/SparkSubmitArguments.scala | 2 ++ .../apache/spark/deploy/worker/DriverWrapper.scala | 13 +++++++++---- .../org/apache/spark/deploy/SparkSubmitSuite.scala | 9 ++++++--- 5 files changed, 27 insertions(+), 13 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/DependencyUtils.scala b/core/src/main/scala/org/apache/spark/deploy/DependencyUtils.scala index ab319c860ee69..fac834a70b893 100644 --- a/core/src/main/scala/org/apache/spark/deploy/DependencyUtils.scala +++ b/core/src/main/scala/org/apache/spark/deploy/DependencyUtils.scala @@ -33,7 +33,8 @@ private[deploy] object DependencyUtils { packagesExclusions: String, packages: String, repositories: String, - ivyRepoPath: String): String = { + ivyRepoPath: String, + ivySettingsPath: Option[String]): String = { val exclusions: Seq[String] = if (!StringUtils.isBlank(packagesExclusions)) { packagesExclusions.split(",") @@ -41,10 +42,12 @@ private[deploy] object DependencyUtils { Nil } // Create the IvySettings, either load from file or build defaults - val ivySettings = sys.props.get("spark.jars.ivySettings").map { ivySettingsFile => - SparkSubmitUtils.loadIvySettings(ivySettingsFile, Option(repositories), Option(ivyRepoPath)) - }.getOrElse { - SparkSubmitUtils.buildIvySettings(Option(repositories), Option(ivyRepoPath)) + val ivySettings = ivySettingsPath match { + case Some(path) => + SparkSubmitUtils.loadIvySettings(path, Option(repositories), Option(ivyRepoPath)) + + case None => + SparkSubmitUtils.buildIvySettings(Option(repositories), Option(ivyRepoPath)) } SparkSubmitUtils.resolveMavenCoordinates(packages, ivySettings, exclusions = exclusions) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index b44c8802ea6d8..deb52a4ff20e9 100644 --- 
a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -361,7 +361,8 @@ object SparkSubmit extends CommandLineUtils with Logging { // Resolve maven dependencies if there are any and add classpath to jars. Add them to py-files // too for packages that include Python code val resolvedMavenCoordinates = DependencyUtils.resolveMavenDependencies( - args.packagesExclusions, args.packages, args.repositories, args.ivyRepoPath) + args.packagesExclusions, args.packages, args.repositories, args.ivyRepoPath, + args.ivySettingsPath) if (!StringUtils.isBlank(resolvedMavenCoordinates)) { args.jars = mergeFileLists(args.jars, resolvedMavenCoordinates) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index 9db7a1fe3106d..7e19417538656 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -63,6 +63,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S var packages: String = null var repositories: String = null var ivyRepoPath: String = null + var ivySettingsPath: Option[String] = None var packagesExclusions: String = null var verbose: Boolean = false var isPython: Boolean = false @@ -184,6 +185,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S jars = Option(jars).orElse(sparkProperties.get("spark.jars")).orNull files = Option(files).orElse(sparkProperties.get("spark.files")).orNull ivyRepoPath = sparkProperties.get("spark.jars.ivy").orNull + ivySettingsPath = sparkProperties.get("spark.jars.ivySettings") packages = Option(packages).orElse(sparkProperties.get("spark.jars.packages")).orNull packagesExclusions = Option(packagesExclusions) .orElse(sparkProperties.get("spark.jars.excludes")).orNull diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala index b19c9904d5982..3f71237164a15 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala @@ -79,12 +79,17 @@ object DriverWrapper extends Logging { val secMgr = new SecurityManager(sparkConf) val hadoopConf = SparkHadoopUtil.newConfiguration(sparkConf) - val Seq(packagesExclusions, packages, repositories, ivyRepoPath) = - Seq("spark.jars.excludes", "spark.jars.packages", "spark.jars.repositories", "spark.jars.ivy") - .map(sys.props.get(_).orNull) + val Seq(packagesExclusions, packages, repositories, ivyRepoPath, ivySettingsPath) = + Seq( + "spark.jars.excludes", + "spark.jars.packages", + "spark.jars.repositories", + "spark.jars.ivy", + "spark.jars.ivySettings" + ).map(sys.props.get(_).orNull) val resolvedMavenCoordinates = DependencyUtils.resolveMavenDependencies(packagesExclusions, - packages, repositories, ivyRepoPath) + packages, repositories, ivyRepoPath, Option(ivySettingsPath)) val jars = { val jarsProp = sys.props.get("spark.jars").orNull if (!StringUtils.isBlank(resolvedMavenCoordinates)) { diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index feab888f89aca..e5268ca31373e 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ 
b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -105,6 +105,9 @@ class SparkSubmitSuite // Necessary to make ScalaTest 3.x interrupt a thread on the JVM like ScalaTest 2.2.x implicit val defaultSignaler: Signaler = ThreadSignaler + private val emptyIvySettings = File.createTempFile("ivy", ".xml") + FileUtils.write(emptyIvySettings, "", StandardCharsets.UTF_8) + override def beforeEach() { super.beforeEach() System.setProperty("spark.testing", "true") @@ -520,6 +523,7 @@ class SparkSubmitSuite "--repositories", repo, "--conf", "spark.ui.enabled=false", "--conf", "spark.master.rest.enabled=false", + "--conf", s"spark.jars.ivySettings=${emptyIvySettings.getAbsolutePath()}", unusedJar.toString, "my.great.lib.MyLib", "my.great.dep.MyLib") runSparkSubmit(args) @@ -530,7 +534,6 @@ class SparkSubmitSuite val unusedJar = TestUtils.createJarWithClasses(Seq.empty) val main = MavenCoordinate("my.great.lib", "mylib", "0.1") val dep = MavenCoordinate("my.great.dep", "mylib", "0.1") - // Test using "spark.jars.packages" and "spark.jars.repositories" configurations. IvyTestUtils.withRepository(main, Some(dep.toString), None) { repo => val args = Seq( "--class", JarCreationTest.getClass.getName.stripSuffix("$"), @@ -540,6 +543,7 @@ class SparkSubmitSuite "--conf", s"spark.jars.repositories=$repo", "--conf", "spark.ui.enabled=false", "--conf", "spark.master.rest.enabled=false", + "--conf", s"spark.jars.ivySettings=${emptyIvySettings.getAbsolutePath()}", unusedJar.toString, "my.great.lib.MyLib", "my.great.dep.MyLib") runSparkSubmit(args) @@ -550,7 +554,6 @@ class SparkSubmitSuite // See https://gist.github.com/shivaram/3a2fecce60768a603dac for a error log ignore("correctly builds R packages included in a jar with --packages") { assume(RUtils.isRInstalled, "R isn't installed on this machine.") - // Check if the SparkR package is installed assume(RUtils.isSparkRInstalled, "SparkR is not installed in this build.") val main = MavenCoordinate("my.great.lib", "mylib", "0.1") val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) @@ -563,6 +566,7 @@ class SparkSubmitSuite "--master", "local-cluster[2,1,1024]", "--packages", main.toString, "--repositories", repo, + "--conf", s"spark.jars.ivySettings=${emptyIvySettings.getAbsolutePath()}", "--verbose", "--conf", "spark.ui.enabled=false", rScriptDir) @@ -573,7 +577,6 @@ class SparkSubmitSuite test("include an external JAR in SparkR") { assume(RUtils.isRInstalled, "R isn't installed on this machine.") val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) - // Check if the SparkR package is installed assume(RUtils.isSparkRInstalled, "SparkR is not installed in this build.") val rScriptDir = Seq(sparkHome, "R", "pkg", "tests", "fulltests", "jarTest.R").mkString(File.separator) From ce1565115481343af9043ecc4080d6d97eee698c Mon Sep 17 00:00:00 2001 From: lemonjing <932191671@qq.com> Date: Tue, 3 Apr 2018 09:36:44 +0800 Subject: [PATCH 384/566] [MINOR][DOC] Fix a few markdown typos ## What changes were proposed in this pull request? Easy fix in the markdown. ## How was this patch tested? jekyII build test manually. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: lemonjing <932191671@qq.com> Closes #20897 from Lemonjing/master. 
(cherry picked from commit 8020f66fc47140a1b5f843fb18c34ec80541d5ca) Signed-off-by: hyukjinkwon --- docs/ml-guide.md | 2 +- docs/mllib-feature-extraction.md | 4 ++-- docs/mllib-pmml-model-export.md | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/ml-guide.md b/docs/ml-guide.md index 702bcf748fc74..aea07be34cb86 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -111,7 +111,7 @@ and the migration guide below will explain all changes between releases. * The class and trait hierarchy for logistic regression model summaries was changed to be cleaner and better accommodate the addition of the multi-class summary. This is a breaking change for user code that casts a `LogisticRegressionTrainingSummary` to a -` BinaryLogisticRegressionTrainingSummary`. Users should instead use the `model.binarySummary` +`BinaryLogisticRegressionTrainingSummary`. Users should instead use the `model.binarySummary` method. See [SPARK-17139](https://issues.apache.org/jira/browse/SPARK-17139) for more detail (_note_ this is an `Experimental` API). This _does not_ affect the Python `summary` method, which will still work correctly for both multinomial and binary cases. diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md index 75aea70601875..8b89296b14cdd 100644 --- a/docs/mllib-feature-extraction.md +++ b/docs/mllib-feature-extraction.md @@ -278,8 +278,8 @@ for details on the API. multiplication. In other words, it scales each column of the dataset by a scalar multiplier. This represents the [Hadamard product](https://en.wikipedia.org/wiki/Hadamard_product_%28matrices%29) between the input vector, `v` and transforming vector, `scalingVec`, to yield a result vector. -Qu8T948*1# -Denoting the `scalingVec` as "`w`," this transformation may be written as: + +Denoting the `scalingVec` as "`w`", this transformation may be written as: `\[ \begin{pmatrix} v_1 \\ diff --git a/docs/mllib-pmml-model-export.md b/docs/mllib-pmml-model-export.md index d3530908706d0..f567565437927 100644 --- a/docs/mllib-pmml-model-export.md +++ b/docs/mllib-pmml-model-export.md @@ -7,7 +7,7 @@ displayTitle: PMML model export - RDD-based API * Table of contents {:toc} -## `spark.mllib` supported models +## spark.mllib supported models `spark.mllib` supports model export to Predictive Model Markup Language ([PMML](http://en.wikipedia.org/wiki/Predictive_Model_Markup_Language)). @@ -15,7 +15,7 @@ The table below outlines the `spark.mllib` models that can be exported to PMML a - + From f36bdb40116b1600e1f788e460c31c20ca939353 Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Tue, 3 Apr 2018 21:26:49 +0800 Subject: [PATCH 385/566] [MINOR][CORE] Show block manager id when remove RDD/Broadcast fails. ## What changes were proposed in this pull request? Address https://github.com/apache/spark/pull/20924#discussion_r177987175, show block manager id when remove RDD/Broadcast fails. ## How was this patch tested? N/A Author: Xingbo Jiang Closes #20960 from jiangxb1987/bmid. 
(cherry picked from commit 7cf9fab33457ccc9b2d548f15dd5700d5e8d08ef) Signed-off-by: hyukjinkwon --- .../apache/spark/storage/BlockManagerMasterEndpoint.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala index 56b95c31eb4c3..8e8f7d197c9ef 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala @@ -164,7 +164,8 @@ class BlockManagerMasterEndpoint( val futures = blockManagerInfo.values.map { bm => bm.slaveEndpoint.ask[Int](removeMsg).recover { case e: IOException => - logWarning(s"Error trying to remove RDD $rddId", e) + logWarning(s"Error trying to remove RDD $rddId from block manager ${bm.blockManagerId}", + e) 0 // zero blocks were removed } }.toSeq @@ -195,7 +196,8 @@ class BlockManagerMasterEndpoint( val futures = requiredBlockManagers.map { bm => bm.slaveEndpoint.ask[Int](removeMsg).recover { case e: IOException => - logWarning(s"Error trying to remove broadcast $broadcastId", e) + logWarning(s"Error trying to remove broadcast $broadcastId from block manager " + + s"${bm.blockManagerId}", e) 0 // zero blocks were removed } }.toSeq From 28c9adbd6537c1545cc2b448c8908444ca858c44 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Tue, 3 Apr 2018 17:25:54 -0700 Subject: [PATCH 386/566] [SPARK-23802][SQL] PropagateEmptyRelation can leave query plan in unresolved state ## What changes were proposed in this pull request? Add cast to nulls introduced by PropagateEmptyRelation so in cases they're part of coalesce they will not break its type checking rules ## How was this patch tested? Added unit test Author: Robert Kruszewski Closes #20914 from robert3005/rk/propagate-empty-fix. (cherry picked from commit 5cfd5fabcdbd77a806b98a6dd59b02772d2f6dee) Signed-off-by: gatorsmile --- .../optimizer/PropagateEmptyRelation.scala | 8 ++++-- .../PropagateEmptyRelationSuite.scala | 26 ++++++++++++++----- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala index a6e5aa6daca65..c3fdb924243df 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala @@ -17,10 +17,12 @@ package org.apache.spark.sql.catalyst.optimizer +import org.apache.spark.sql.catalyst.analysis.CastSupport import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ +import org.apache.spark.sql.internal.SQLConf /** * Collapse plans consisting empty local relations generated by [[PruneFilters]]. @@ -32,7 +34,7 @@ import org.apache.spark.sql.catalyst.rules._ * - Aggregate with all empty children and at least one grouping expression. * - Generate(Explode) with all empty children. Others like Hive UDTF may return results. 
*/ -object PropagateEmptyRelation extends Rule[LogicalPlan] with PredicateHelper { +object PropagateEmptyRelation extends Rule[LogicalPlan] with PredicateHelper with CastSupport { private def isEmptyLocalRelation(plan: LogicalPlan): Boolean = plan match { case p: LocalRelation => p.data.isEmpty case _ => false @@ -43,7 +45,9 @@ object PropagateEmptyRelation extends Rule[LogicalPlan] with PredicateHelper { // Construct a project list from plan's output, while the value is always NULL. private def nullValueProjectList(plan: LogicalPlan): Seq[NamedExpression] = - plan.output.map{ a => Alias(Literal(null), a.name)(a.exprId) } + plan.output.map{ a => Alias(cast(Literal(null), a.dataType), a.name)(a.exprId) } + + override def conf: SQLConf = SQLConf.get def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case p: Union if p.children.forall(isEmptyLocalRelation) => diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelationSuite.scala index 3964508e3a55e..f1ce7543ffdc1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelationSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Project} import org.apache.spark.sql.catalyst.rules.RuleExecutor -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{IntegerType, StructType} class PropagateEmptyRelationSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { @@ -37,7 +37,8 @@ class PropagateEmptyRelationSuite extends PlanTest { ReplaceIntersectWithSemiJoin, PushDownPredicate, PruneFilters, - PropagateEmptyRelation) :: Nil + PropagateEmptyRelation, + CollapseProject) :: Nil } object OptimizeWithoutPropagateEmptyRelation extends RuleExecutor[LogicalPlan] { @@ -48,7 +49,8 @@ class PropagateEmptyRelationSuite extends PlanTest { ReplaceExceptWithAntiJoin, ReplaceIntersectWithSemiJoin, PushDownPredicate, - PruneFilters) :: Nil + PruneFilters, + CollapseProject) :: Nil } val testRelation1 = LocalRelation.fromExternalRows(Seq('a.int), data = Seq(Row(1))) @@ -79,9 +81,11 @@ class PropagateEmptyRelationSuite extends PlanTest { (true, false, Inner, Some(LocalRelation('a.int, 'b.int))), (true, false, Cross, Some(LocalRelation('a.int, 'b.int))), - (true, false, LeftOuter, Some(Project(Seq('a, Literal(null).as('b)), testRelation1).analyze)), + (true, false, LeftOuter, + Some(Project(Seq('a, Literal(null).cast(IntegerType).as('b)), testRelation1).analyze)), (true, false, RightOuter, Some(LocalRelation('a.int, 'b.int))), - (true, false, FullOuter, Some(Project(Seq('a, Literal(null).as('b)), testRelation1).analyze)), + (true, false, FullOuter, + Some(Project(Seq('a, Literal(null).cast(IntegerType).as('b)), testRelation1).analyze)), (true, false, LeftAnti, Some(testRelation1)), (true, false, LeftSemi, Some(LocalRelation('a.int))), @@ -89,8 +93,9 @@ class PropagateEmptyRelationSuite extends PlanTest { (false, true, Cross, Some(LocalRelation('a.int, 'b.int))), (false, true, LeftOuter, Some(LocalRelation('a.int, 'b.int))), (false, true, RightOuter, - Some(Project(Seq(Literal(null).as('a), 'b), testRelation2).analyze)), - (false, true, FullOuter, 
Some(Project(Seq(Literal(null).as('a), 'b), testRelation2).analyze)), + Some(Project(Seq(Literal(null).cast(IntegerType).as('a), 'b), testRelation2).analyze)), + (false, true, FullOuter, + Some(Project(Seq(Literal(null).cast(IntegerType).as('a), 'b), testRelation2).analyze)), (false, true, LeftAnti, Some(LocalRelation('a.int))), (false, true, LeftSemi, Some(LocalRelation('a.int))), @@ -209,4 +214,11 @@ class PropagateEmptyRelationSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + + test("propagate empty relation keeps the plan resolved") { + val query = testRelation1.join( + LocalRelation('a.int, 'b.int), UsingJoin(FullOuter, "a" :: Nil), None) + val optimized = Optimize.execute(query.analyze) + assert(optimized.resolved) + } } From a81e20314ddd1c5078ae8b25b00e1efc3cb9652d Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Wed, 4 Apr 2018 15:43:58 -0700 Subject: [PATCH 387/566] [SPARK-23838][WEBUI] Running SQL query is displayed as "completed" in SQL tab ## What changes were proposed in this pull request? A running SQL query would appear as completed in the Spark UI: ![image1](https://user-images.githubusercontent.com/1097932/38170733-3d7cb00c-35bf-11e8-994c-43f2d4fa285d.png) We can see the query in "Completed queries", while in in the job page we see it's still running Job 132. ![image2](https://user-images.githubusercontent.com/1097932/38170735-48f2c714-35bf-11e8-8a41-6fae23543c46.png) After some time in the query still appears in "Completed queries" (while it's still running), but the "Duration" gets increased. ![image3](https://user-images.githubusercontent.com/1097932/38170737-50f87ea4-35bf-11e8-8b60-000f6f918964.png) To reproduce, we can run a query with multiple jobs. E.g. Run TPCDS q6. The reason is that updates from executions are written into kvstore periodically, and the job start event may be missed. ## How was this patch tested? Manually run the job again and check the SQL Tab. The fix is pretty simple. Author: Gengliang Wang Closes #20955 from gengliangwang/jobCompleted. 
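The fix above hinges on one detail of the listener: live execution data is flushed to the kvstore only every `liveUpdatePeriodNs`, so a job-start event that lands between flushes stays invisible to the SQL tab until the next periodic write, and the diff that follows forces a write when a job starts. As a standalone illustration of that throttle-plus-force pattern (the class and names below are made up for this note and are not Spark's actual internals), a minimal sketch:

```scala
class ThrottledWriter[T](periodNs: Long)(persist: T => Unit) {
  private var lastWriteNs: Option[Long] = None

  def update(value: T, force: Boolean = false): Unit = {
    val now = System.nanoTime()
    // Write if forced, or if the throttle window since the last write has elapsed.
    val due = lastWriteNs.forall(last => now - last > periodNs)
    if (force || due) {
      persist(value)
      lastWriteNs = Some(now)
    }
  }
}

object ThrottledWriterDemo extends App {
  val writer =
    new ThrottledWriter[String](periodNs = 10L * 1000 * 1000 * 1000)(v => println(s"persisted: $v"))
  writer.update("execution submitted")          // nothing written yet, so this persists
  writer.update("job 0 started")                // inside the 10s window: silently dropped
  writer.update("job 0 started", force = true)  // forced: persisted even inside the window
}
```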
(cherry picked from commit d8379e5bc3629f4e8233ad42831bdaf68c24cfeb) Signed-off-by: Marcelo Vanzin --- .../apache/spark/sql/execution/ui/AllExecutionsPage.scala | 3 ++- .../spark/sql/execution/ui/SQLAppStatusListener.scala | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala index e751ce39cd5d7..582528777f90e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala @@ -39,7 +39,8 @@ private[ui] class AllExecutionsPage(parent: SQLTab) extends WebUIPage("") with L val failed = new mutable.ArrayBuffer[SQLExecutionUIData]() sqlStore.executionsList().foreach { e => - val isRunning = e.jobs.exists { case (_, status) => status == JobExecutionStatus.RUNNING } + val isRunning = e.completionTime.isEmpty || + e.jobs.exists { case (_, status) => status == JobExecutionStatus.RUNNING } val isFailed = e.jobs.exists { case (_, status) => status == JobExecutionStatus.FAILED } if (isRunning) { running += e diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index 71e9f93c4566e..2b6bb48467eb3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -88,7 +88,7 @@ class SQLAppStatusListener( exec.jobs = exec.jobs + (jobId -> JobExecutionStatus.RUNNING) exec.stages ++= event.stageIds.toSet - update(exec) + update(exec, force = true) } override def onStageSubmitted(event: SparkListenerStageSubmitted): Unit = { @@ -308,11 +308,13 @@ class SQLAppStatusListener( }) } - private def update(exec: LiveExecutionData): Unit = { + private def update(exec: LiveExecutionData, force: Boolean = false): Unit = { val now = System.nanoTime() if (exec.endEvents >= exec.jobs.size + 1) { exec.write(kvstore, now) liveExecutions.remove(exec.executionId) + } else if (force) { + exec.write(kvstore, now) } else if (liveUpdatePeriodNs >= 0) { if (now - exec.lastWriteTime > liveUpdatePeriodNs) { exec.write(kvstore, now) From 0b7b8cceda4ce7791d78259451b8c51b49fb2786 Mon Sep 17 00:00:00 2001 From: jinxing Date: Wed, 4 Apr 2018 15:51:27 -0700 Subject: [PATCH 388/566] [SPARK-23637][YARN] Yarn might allocate more resource if a same executor is killed multiple times. ## What changes were proposed in this pull request? `YarnAllocator` uses `numExecutorsRunning` to track the number of running executor. `numExecutorsRunning` is used to check if there're executors missing and need to allocate more. In current code, `numExecutorsRunning` can be negative when driver asks to kill a same idle executor multiple times. ## How was this patch tested? UT added Author: jinxing Closes #20781 from jinxing64/SPARK-23637. 
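The root cause described above is that a plain counter is not idempotent: every kill request for the same idle executor decrements `numExecutorsRunning`, so the computed number of missing executors overshoots and YARN allocates more containers than needed. The sketch below is a standalone illustration with made-up executor ids, not code from the patch; it contrasts the counter with the id-keyed set the patch switches to:

```scala
import java.util.Collections
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.atomic.AtomicInteger

object KillTrackingDemo extends App {
  // Counter-based tracking: killing the same executor three times under-counts
  // the executors that are actually still running.
  val numExecutorsRunning = new AtomicInteger(2)
  Seq("exec-1", "exec-1", "exec-1").foreach(_ => numExecutorsRunning.decrementAndGet())
  println(numExecutorsRunning.get())  // -1, so target - running over-requests containers

  // Set-based tracking (the approach the patch moves to): removing an id that is
  // already gone is a no-op, so repeated kills cannot drive the count below reality.
  val runningExecutors =
    Collections.newSetFromMap(new ConcurrentHashMap[String, java.lang.Boolean]())
  runningExecutors.add("exec-1")
  runningExecutors.add("exec-2")
  Seq("exec-1", "exec-1", "exec-1").foreach(id => runningExecutors.remove(id))
  println(runningExecutors.size())    // 1, which still matches the one live executor
}
```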
(cherry picked from commit d3bd0435ee4ff3d414f32cce3f58b6b9f67e68bc) Signed-off-by: Marcelo Vanzin --- .../spark/deploy/yarn/YarnAllocator.scala | 36 +++++++------- .../deploy/yarn/YarnAllocatorSuite.scala | 48 ++++++++++++++++++- 2 files changed, 66 insertions(+), 18 deletions(-) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 506adb363aa90..b2d960bb468ec 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -81,7 +81,8 @@ private[yarn] class YarnAllocator( private val releasedContainers = Collections.newSetFromMap[ContainerId]( new ConcurrentHashMap[ContainerId, java.lang.Boolean]) - private val numExecutorsRunning = new AtomicInteger(0) + private val runningExecutors = Collections.newSetFromMap[String]( + new ConcurrentHashMap[String, java.lang.Boolean]()) private val numExecutorsStarting = new AtomicInteger(0) @@ -166,7 +167,7 @@ private[yarn] class YarnAllocator( clock = newClock } - def getNumExecutorsRunning: Int = numExecutorsRunning.get() + def getNumExecutorsRunning: Int = runningExecutors.size() def getNumExecutorsFailed: Int = synchronized { val endTime = clock.getTimeMillis() @@ -242,12 +243,11 @@ private[yarn] class YarnAllocator( * Request that the ResourceManager release the container running the specified executor. */ def killExecutor(executorId: String): Unit = synchronized { - if (executorIdToContainer.contains(executorId)) { - val container = executorIdToContainer.get(executorId).get - internalReleaseContainer(container) - numExecutorsRunning.decrementAndGet() - } else { - logWarning(s"Attempted to kill unknown executor $executorId!") + executorIdToContainer.get(executorId) match { + case Some(container) if !releasedContainers.contains(container.getId) => + internalReleaseContainer(container) + runningExecutors.remove(executorId) + case _ => logWarning(s"Attempted to kill unknown executor $executorId!") } } @@ -274,7 +274,7 @@ private[yarn] class YarnAllocator( "Launching executor count: %d. Cluster resources: %s.") .format( allocatedContainers.size, - numExecutorsRunning.get, + runningExecutors.size, numExecutorsStarting.get, allocateResponse.getAvailableResources)) @@ -286,7 +286,7 @@ private[yarn] class YarnAllocator( logDebug("Completed %d containers".format(completedContainers.size)) processCompletedContainers(completedContainers.asScala) logDebug("Finished processing %d completed containers. Current running executor count: %d." 
- .format(completedContainers.size, numExecutorsRunning.get)) + .format(completedContainers.size, runningExecutors.size)) } } @@ -300,9 +300,9 @@ private[yarn] class YarnAllocator( val pendingAllocate = getPendingAllocate val numPendingAllocate = pendingAllocate.size val missing = targetNumExecutors - numPendingAllocate - - numExecutorsStarting.get - numExecutorsRunning.get + numExecutorsStarting.get - runningExecutors.size logDebug(s"Updating resource requests, target: $targetNumExecutors, " + - s"pending: $numPendingAllocate, running: ${numExecutorsRunning.get}, " + + s"pending: $numPendingAllocate, running: ${runningExecutors.size}, " + s"executorsStarting: ${numExecutorsStarting.get}") if (missing > 0) { @@ -502,7 +502,7 @@ private[yarn] class YarnAllocator( s"for executor with ID $executorId") def updateInternalState(): Unit = synchronized { - numExecutorsRunning.incrementAndGet() + runningExecutors.add(executorId) numExecutorsStarting.decrementAndGet() executorIdToContainer(executorId) = container containerIdToExecutorId(container.getId) = executorId @@ -513,7 +513,7 @@ private[yarn] class YarnAllocator( allocatedContainerToHostMap.put(containerId, executorHostname) } - if (numExecutorsRunning.get < targetNumExecutors) { + if (runningExecutors.size() < targetNumExecutors) { numExecutorsStarting.incrementAndGet() if (launchContainers) { launcherPool.execute(new Runnable { @@ -554,7 +554,7 @@ private[yarn] class YarnAllocator( } else { logInfo(("Skip launching executorRunnable as running executors count: %d " + "reached target executors count: %d.").format( - numExecutorsRunning.get, targetNumExecutors)) + runningExecutors.size, targetNumExecutors)) } } } @@ -569,7 +569,11 @@ private[yarn] class YarnAllocator( val exitReason = if (!alreadyReleased) { // Decrement the number of executors running. The next iteration of // the ApplicationMaster's reporting thread will take care of allocating. 
- numExecutorsRunning.decrementAndGet() + containerIdToExecutorId.get(containerId) match { + case Some(executorId) => runningExecutors.remove(executorId) + case None => logWarning(s"Cannot find executorId for container: ${containerId.toString}") + } + logInfo("Completed container %s%s (state: %s, exit status: %s)".format( containerId, onHostStr, diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala index cb1e3c5268510..525abb6f2b350 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala @@ -251,11 +251,55 @@ class YarnAllocatorSuite extends SparkFunSuite with Matchers with BeforeAndAfter ContainerStatus.newInstance(c.getId(), ContainerState.COMPLETE, "Finished", 0) } handler.updateResourceRequests() - handler.processCompletedContainers(statuses.toSeq) + handler.processCompletedContainers(statuses) handler.getNumExecutorsRunning should be (0) handler.getPendingAllocate.size should be (1) } + test("kill same executor multiple times") { + val handler = createAllocator(2) + handler.updateResourceRequests() + handler.getNumExecutorsRunning should be (0) + handler.getPendingAllocate.size should be (2) + + val container1 = createContainer("host1") + val container2 = createContainer("host2") + handler.handleAllocatedContainers(Array(container1, container2)) + handler.getNumExecutorsRunning should be (2) + handler.getPendingAllocate.size should be (0) + + val executorToKill = handler.executorIdToContainer.keys.head + handler.killExecutor(executorToKill) + handler.getNumExecutorsRunning should be (1) + handler.killExecutor(executorToKill) + handler.killExecutor(executorToKill) + handler.killExecutor(executorToKill) + handler.getNumExecutorsRunning should be (1) + handler.requestTotalExecutorsWithPreferredLocalities(2, 0, Map.empty, Set.empty) + handler.updateResourceRequests() + handler.getPendingAllocate.size should be (1) + } + + test("process same completed container multiple times") { + val handler = createAllocator(2) + handler.updateResourceRequests() + handler.getNumExecutorsRunning should be (0) + handler.getPendingAllocate.size should be (2) + + val container1 = createContainer("host1") + val container2 = createContainer("host2") + handler.handleAllocatedContainers(Array(container1, container2)) + handler.getNumExecutorsRunning should be (2) + handler.getPendingAllocate.size should be (0) + + val statuses = Seq(container1, container1, container2).map { c => + ContainerStatus.newInstance(c.getId(), ContainerState.COMPLETE, "Finished", 0) + } + handler.processCompletedContainers(statuses) + handler.getNumExecutorsRunning should be (0) + + } + test("lost executor removed from backend") { val handler = createAllocator(4) handler.updateResourceRequests() @@ -272,7 +316,7 @@ class YarnAllocatorSuite extends SparkFunSuite with Matchers with BeforeAndAfter ContainerStatus.newInstance(c.getId(), ContainerState.COMPLETE, "Failed", -1) } handler.updateResourceRequests() - handler.processCompletedContainers(statuses.toSeq) + handler.processCompletedContainers(statuses) handler.updateResourceRequests() handler.getNumExecutorsRunning should be (0) handler.getPendingAllocate.size should be (2) From f93667f84b33537a69ee20a7d4dee664828803f6 Mon Sep 17 00:00:00 2001 From: JiahuiJiang Date: Thu, 5 Apr 2018 20:06:08 -0700 
Subject: [PATCH 389/566] [SPARK-23823][SQL] Keep origin in transformExpression Fixes https://issues.apache.org/jira/browse/SPARK-23823 Keep origin for all the methods using transformExpression ## What changes were proposed in this pull request? Keep origin in transformExpression ## How was this patch tested? Manually tested that this fixes https://issues.apache.org/jira/browse/SPARK-23823 and columns have correct origins after Analyzer.analyze Author: JiahuiJiang Author: Jiahui Jiang Closes #20961 from JiahuiJiang/jj/keep-origin. (cherry picked from commit d65e531b44a388fed25d3cbf28fdce5a2d0598e6) Signed-off-by: gatorsmile --- .../spark/sql/catalyst/plans/QueryPlan.scala | 6 ++- .../sql/catalyst/plans/QueryPlanSuite.scala | 42 +++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/QueryPlanSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala index ddf2cbf2ab911..64cb8c726772f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.plans import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.trees.TreeNode +import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, TreeNode} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DataType, StructType} @@ -103,7 +103,9 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanT var changed = false @inline def transformExpression(e: Expression): Expression = { - val newE = f(e) + val newE = CurrentOrigin.withOrigin(e.origin) { + f(e) + } if (newE.fastEquals(e)) { e } else { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/QueryPlanSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/QueryPlanSuite.scala new file mode 100644 index 0000000000000..27914ef5565c0 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/QueryPlanSuite.scala @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.plans + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.dsl.plans +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, Literal, NamedExpression} +import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin} +import org.apache.spark.sql.types.IntegerType + +class QueryPlanSuite extends SparkFunSuite { + + test("origin remains the same after mapExpressions (SPARK-23823)") { + CurrentOrigin.setPosition(0, 0) + val column = AttributeReference("column", IntegerType)(NamedExpression.newExprId) + val query = plans.DslLogicalPlan(plans.table("table")).select(column) + CurrentOrigin.reset() + + val mappedQuery = query mapExpressions { + case _: Expression => Literal(1) + } + + val mappedOrigin = mappedQuery.expressions.apply(0).origin + assert(mappedOrigin == Origin.apply(Some(0), Some(0))) + } + +} From ccc4a20453bbbaf1f3e5e46fb7c0277f1e6c65b9 Mon Sep 17 00:00:00 2001 From: Yuchen Huo Date: Fri, 6 Apr 2018 08:35:20 -0700 Subject: [PATCH 390/566] [SPARK-23822][SQL] Improve error message for Parquet schema mismatches ## What changes were proposed in this pull request? This pull request tries to improve the error message for spark while reading parquet files with different schemas, e.g. One with a STRING column and the other with a INT column. A new ParquetSchemaColumnConvertNotSupportedException is added to replace the old UnsupportedOperationException. The Exception is again wrapped in FileScanRdd.scala to throw a more a general QueryExecutionException with the actual parquet file name which trigger the exception. ## How was this patch tested? Unit tests added to check the new exception and verify the error messages. Also manually tested with two parquet with different schema to check the error message. screen shot 2018-03-30 at 4 03 04 pm Author: Yuchen Huo Closes #20953 from yuchenhuo/SPARK-23822. (cherry picked from commit 94524019315ad463f9bc13c107131091d17c6af9) Signed-off-by: gatorsmile --- ...emaColumnConvertNotSupportedException.java | 62 +++++++++++++++++++ .../parquet/VectorizedColumnReader.java | 38 ++++++++---- .../execution/QueryExecutionException.scala | 3 +- .../execution/datasources/FileScanRDD.scala | 21 ++++++- .../parquet/ParquetSchemaSuite.scala | 55 ++++++++++++++++ 5 files changed, 166 insertions(+), 13 deletions(-) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/execution/datasources/SchemaColumnConvertNotSupportedException.java diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/SchemaColumnConvertNotSupportedException.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/SchemaColumnConvertNotSupportedException.java new file mode 100644 index 0000000000000..82a1169cbe7ae --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/SchemaColumnConvertNotSupportedException.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources; + +import org.apache.spark.annotation.InterfaceStability; + +/** + * Exception thrown when the parquet reader find column type mismatches. + */ +@InterfaceStability.Unstable +public class SchemaColumnConvertNotSupportedException extends RuntimeException { + + /** + * Name of the column which cannot be converted. + */ + private String column; + /** + * Physical column type in the actual parquet file. + */ + private String physicalType; + /** + * Logical column type in the parquet schema the parquet reader use to parse all files. + */ + private String logicalType; + + public String getColumn() { + return column; + } + + public String getPhysicalType() { + return physicalType; + } + + public String getLogicalType() { + return logicalType; + } + + public SchemaColumnConvertNotSupportedException( + String column, + String physicalType, + String logicalType) { + super(); + this.column = column; + this.physicalType = physicalType; + this.logicalType = logicalType; + } +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index 47dd625f4b154..72f1d024b08ce 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution.datasources.parquet; import java.io.IOException; +import java.util.Arrays; import java.util.TimeZone; import org.apache.parquet.bytes.BytesUtils; @@ -31,6 +32,7 @@ import org.apache.parquet.schema.PrimitiveType; import org.apache.spark.sql.catalyst.util.DateTimeUtils; +import org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException; import org.apache.spark.sql.execution.vectorized.WritableColumnVector; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.DecimalType; @@ -231,6 +233,18 @@ private boolean shouldConvertTimestamps() { return convertTz != null && !convertTz.equals(UTC); } + /** + * Helper function to construct exception for parquet schema mismatch. + */ + private SchemaColumnConvertNotSupportedException constructConvertNotSupportedException( + ColumnDescriptor descriptor, + WritableColumnVector column) { + return new SchemaColumnConvertNotSupportedException( + Arrays.toString(descriptor.getPath()), + descriptor.getType().toString(), + column.dataType().toString()); + } + /** * Reads `num` values into column, decoding the values from `dictionaryIds` and `dictionary`. 
*/ @@ -261,7 +275,7 @@ private void decodeDictionaryIds( } } } else { - throw new UnsupportedOperationException("Unimplemented type: " + column.dataType()); + throw constructConvertNotSupportedException(descriptor, column); } break; @@ -282,7 +296,7 @@ private void decodeDictionaryIds( } } } else { - throw new UnsupportedOperationException("Unimplemented type: " + column.dataType()); + throw constructConvertNotSupportedException(descriptor, column); } break; @@ -321,7 +335,7 @@ private void decodeDictionaryIds( } } } else { - throw new UnsupportedOperationException(); + throw constructConvertNotSupportedException(descriptor, column); } break; case BINARY: @@ -360,7 +374,7 @@ private void decodeDictionaryIds( } } } else { - throw new UnsupportedOperationException(); + throw constructConvertNotSupportedException(descriptor, column); } break; @@ -375,7 +389,9 @@ private void decodeDictionaryIds( */ private void readBooleanBatch(int rowId, int num, WritableColumnVector column) { - assert(column.dataType() == DataTypes.BooleanType); + if (column.dataType() != DataTypes.BooleanType) { + throw constructConvertNotSupportedException(descriptor, column); + } defColumn.readBooleans( num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn); } @@ -394,7 +410,7 @@ private void readIntBatch(int rowId, int num, WritableColumnVector column) { defColumn.readShorts( num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn); } else { - throw new UnsupportedOperationException("Unimplemented type: " + column.dataType()); + throw constructConvertNotSupportedException(descriptor, column); } } @@ -414,7 +430,7 @@ private void readLongBatch(int rowId, int num, WritableColumnVector column) { } } } else { - throw new UnsupportedOperationException("Unsupported conversion to: " + column.dataType()); + throw constructConvertNotSupportedException(descriptor, column); } } @@ -425,7 +441,7 @@ private void readFloatBatch(int rowId, int num, WritableColumnVector column) { defColumn.readFloats( num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn); } else { - throw new UnsupportedOperationException("Unsupported conversion to: " + column.dataType()); + throw constructConvertNotSupportedException(descriptor, column); } } @@ -436,7 +452,7 @@ private void readDoubleBatch(int rowId, int num, WritableColumnVector column) { defColumn.readDoubles( num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn); } else { - throw new UnsupportedOperationException("Unimplemented type: " + column.dataType()); + throw constructConvertNotSupportedException(descriptor, column); } } @@ -471,7 +487,7 @@ private void readBinaryBatch(int rowId, int num, WritableColumnVector column) { } } } else { - throw new UnsupportedOperationException("Unimplemented type: " + column.dataType()); + throw constructConvertNotSupportedException(descriptor, column); } } @@ -510,7 +526,7 @@ private void readFixedLenByteArrayBatch( } } } else { - throw new UnsupportedOperationException("Unimplemented type: " + column.dataType()); + throw constructConvertNotSupportedException(descriptor, column); } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecutionException.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecutionException.scala index 16806c620635f..cffd97baea6a2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecutionException.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecutionException.scala @@ -17,4 +17,5 @@ 
package org.apache.spark.sql.execution -class QueryExecutionException(message: String) extends Exception(message) +class QueryExecutionException(message: String, cause: Throwable = null) + extends Exception(message, cause) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala index 835ce98462477..28c36b6020d33 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala @@ -21,11 +21,14 @@ import java.io.{FileNotFoundException, IOException} import scala.collection.mutable +import org.apache.parquet.io.ParquetDecodingException + import org.apache.spark.{Partition => RDDPartition, TaskContext, TaskKilledException} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{InputFileBlockHolder, RDD} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.QueryExecutionException import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.NextIterator @@ -179,7 +182,23 @@ class FileScanRDD( currentIterator = readCurrentFile() } - hasNext + try { + hasNext + } catch { + case e: SchemaColumnConvertNotSupportedException => + val message = "Parquet column cannot be converted in " + + s"file ${currentFile.filePath}. Column: ${e.getColumn}, " + + s"Expected: ${e.getLogicalType}, Found: ${e.getPhysicalType}" + throw new QueryExecutionException(message, e) + case e: ParquetDecodingException => + if (e.getMessage.contains("Can not read value at")) { + val message = "Encounter error while reading parquet files. " + + "One possible cause: Parquet column cannot be converted in the " + + "corresponding files. 
Details: " + throw new QueryExecutionException(message, e) + } + throw e + } } else { currentFile = null InputFileBlockHolder.unset() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala index 2cd2a600f2b97..9d3dfae348beb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala @@ -20,10 +20,13 @@ package org.apache.spark.sql.execution.datasources.parquet import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag +import org.apache.parquet.io.ParquetDecodingException import org.apache.parquet.schema.{MessageType, MessageTypeParser} import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.ScalaReflection +import org.apache.spark.sql.execution.QueryExecutionException +import org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ @@ -382,6 +385,58 @@ class ParquetSchemaSuite extends ParquetSchemaTest { } } + // ======================================= + // Tests for parquet schema mismatch error + // ======================================= + def testSchemaMismatch(path: String, vectorizedReaderEnabled: Boolean): SparkException = { + import testImplicits._ + + var e: SparkException = null + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorizedReaderEnabled.toString) { + // Create two parquet files with different schemas in the same folder + Seq(("bcd", 2)).toDF("a", "b").coalesce(1).write.mode("overwrite").parquet(s"$path/parquet") + Seq((1, "abc")).toDF("a", "b").coalesce(1).write.mode("append").parquet(s"$path/parquet") + + e = intercept[SparkException] { + spark.read.parquet(s"$path/parquet").collect() + } + } + e + } + + test("schema mismatch failure error message for parquet reader") { + withTempPath { dir => + val e = testSchemaMismatch(dir.getCanonicalPath, vectorizedReaderEnabled = false) + val expectedMessage = "Encounter error while reading parquet files. " + + "One possible cause: Parquet column cannot be converted in the corresponding " + + "files. Details:" + assert(e.getCause.isInstanceOf[QueryExecutionException]) + assert(e.getCause.getCause.isInstanceOf[ParquetDecodingException]) + assert(e.getCause.getMessage.startsWith(expectedMessage)) + } + } + + test("schema mismatch failure error message for parquet vectorized reader") { + withTempPath { dir => + val e = testSchemaMismatch(dir.getCanonicalPath, vectorizedReaderEnabled = true) + assert(e.getCause.isInstanceOf[QueryExecutionException]) + assert(e.getCause.getCause.isInstanceOf[SchemaColumnConvertNotSupportedException]) + + // Check if the physical type is reporting correctly + val errMsg = e.getCause.getMessage + assert(errMsg.startsWith("Parquet column cannot be converted in file")) + val file = errMsg.substring("Parquet column cannot be converted in file ".length, + errMsg.indexOf(". 
")) + val col = spark.read.parquet(file).schema.fields.filter(_.name.equals("a")) + assert(col.length == 1) + if (col(0).dataType == StringType) { + assert(errMsg.contains("Column: [a], Expected: IntegerType, Found: BINARY")) + } else { + assert(errMsg.endsWith("Column: [a], Expected: StringType, Found: INT32")) + } + } + } + // ======================================================= // Tests for converting Parquet LIST to Catalyst ArrayType // ======================================================= From 1a537a2ad5647782932337888f5db20a35ec422c Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 7 Apr 2018 21:18:50 -0700 Subject: [PATCH 391/566] [SPARK-23809][SQL][BACKPORT] Active SparkSession should be set by getOrCreate This backports https://github.com/apache/spark/pull/20927 to branch-2.3 ## What changes were proposed in this pull request? Currently, the active spark session is set inconsistently (e.g., in createDataFrame, prior to query execution). Many places in spark also incorrectly query active session when they should be calling activeSession.getOrElse(defaultSession) and so might get None even if a Spark session exists. The semantics here can be cleaned up if we also set the active session when the default session is set. Related: https://github.com/apache/spark/pull/20926/files ## How was this patch tested? Unit test, existing test. Note that if https://github.com/apache/spark/pull/20926 merges first we should also update the tests there. Author: Eric Liang Closes #20971 from ericl/backport-23809. --- .../main/scala/org/apache/spark/sql/SparkSession.scala | 3 ++- .../org/apache/spark/sql/SparkSessionBuilderSuite.scala | 8 ++++++++ .../scala/org/apache/spark/sql/test/TestSQLContext.scala | 1 + .../scala/org/apache/spark/sql/hive/test/TestHive.scala | 3 +++ 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 734573ba31f71..6cd5de887b169 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -951,7 +951,8 @@ object SparkSession { session = new SparkSession(sparkContext, None, None, extensions) options.foreach { case (k, v) => session.initialSessionOptions.put(k, v) } - defaultSession.set(session) + setDefaultSession(session) + setActiveSession(session) // Register a successfully instantiated context to the singleton. 
This should be at the // end of the class definition so that the singleton is updated only if there is no diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala index c0301f2ce2d66..4c560d4b032d0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala @@ -50,6 +50,14 @@ class SparkSessionBuilderSuite extends SparkFunSuite with BeforeAndAfterEach { assert(SparkSession.builder().getOrCreate() == session) } + test("sets default and active session") { + assert(SparkSession.getDefaultSession == None) + assert(SparkSession.getActiveSession == None) + val session = SparkSession.builder().master("local").getOrCreate() + assert(SparkSession.getDefaultSession == Some(session)) + assert(SparkSession.getActiveSession == Some(session)) + } + test("config options are propagated to existing SparkSession") { val session1 = SparkSession.builder().master("local").config("spark-config1", "a").getOrCreate() assert(session1.conf.get("spark-config1") == "a") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala index 3038b822beb4a..17603deacdcdd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala @@ -35,6 +35,7 @@ private[spark] class TestSparkSession(sc: SparkContext) extends SparkSession(sc) } SparkSession.setDefaultSession(this) + SparkSession.setActiveSession(this) @transient override lazy val sessionState: SessionState = { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index 8b0fe56b24fcd..533e7a0e022cb 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -179,6 +179,9 @@ private[hive] class TestHiveSparkSession( loadTestTables) } + SparkSession.setDefaultSession(this) + SparkSession.setActiveSession(this) + { // set the metastore temporary configuration val metastoreTempConf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby = false) ++ Map( ConfVars.METASTORE_INTEGER_JDO_PUSHDOWN.varname -> "true", From bf1dabede512ec89fec507f85bec0a8bae5753f0 Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Mon, 9 Apr 2018 10:19:22 -0700 Subject: [PATCH 392/566] [SPARK-23881][CORE][TEST] Fix flaky test JobCancellationSuite."interruptible iterator of shuffle reader" ## What changes were proposed in this pull request? The test case JobCancellationSuite."interruptible iterator of shuffle reader" has been flaky because `KillTask` event is handled asynchronously, so it can happen that the semaphore is released but the task is still running. Actually we only have to check if the total number of processed elements is less than the input elements number, so we know the task get cancelled. ## How was this patch tested? The new test case still fails without the purposed patch, and succeeded in current master. Author: Xingbo Jiang Closes #20993 from jiangxb1987/JobCancellationSuite. 
(cherry picked from commit d81f29ecafe8fc9816e36087e3b8acdc93d6cc1b) Signed-off-by: gatorsmile --- .../apache/spark/JobCancellationSuite.scala | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala index 3b793bb231cf3..61da4138896cd 100644 --- a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala +++ b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala @@ -332,13 +332,15 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft import JobCancellationSuite._ sc = new SparkContext("local[2]", "test interruptible iterator") + // Increase the number of elements to be proceeded to avoid this test being flaky. + val numElements = 10000 val taskCompletedSem = new Semaphore(0) sc.addSparkListener(new SparkListener { override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { // release taskCancelledSemaphore when cancelTasks event has been posted if (stageCompleted.stageInfo.stageId == 1) { - taskCancelledSemaphore.release(1000) + taskCancelledSemaphore.release(numElements) } } @@ -349,28 +351,31 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft } }) - val f = sc.parallelize(1 to 1000).map { i => (i, i) } + // Explicitly disable interrupt task thread on cancelling tasks, so the task thread can only be + // interrupted by `InterruptibleIterator`. + sc.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false") + + val f = sc.parallelize(1 to numElements).map { i => (i, i) } .repartitionAndSortWithinPartitions(new HashPartitioner(1)) .mapPartitions { iter => taskStartedSemaphore.release() iter }.foreachAsync { x => - if (x._1 >= 10) { - // This block of code is partially executed. It will be blocked when x._1 >= 10 and the - // next iteration will be cancelled if the source iterator is interruptible. Then in this - // case, the maximum num of increment would be 10(|1...10|) - taskCancelledSemaphore.acquire() - } + // Block this code from being executed, until the job get cancelled. In this case, if the + // source iterator is interruptible, the max number of increment should be under + // `numElements`. + taskCancelledSemaphore.acquire() executionOfInterruptibleCounter.getAndIncrement() } taskStartedSemaphore.acquire() // Job is cancelled when: // 1. task in reduce stage has been started, guaranteed by previous line. - // 2. task in reduce stage is blocked after processing at most 10 records as - // taskCancelledSemaphore is not released until cancelTasks event is posted - // After job being cancelled, task in reduce stage will be cancelled and no more iteration are - // executed. + // 2. task in reduce stage is blocked as taskCancelledSemaphore is not released until + // JobCancelled event is posted. + // After job being cancelled, task in reduce stage will be cancelled asynchronously, thus + // partial of the inputs should not get processed (It's very unlikely that Spark can process + // 10000 elements between JobCancelled is posted and task is really killed). f.cancel() val e = intercept[SparkException](f.get()).getCause @@ -378,7 +383,7 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft // Make sure tasks are indeed completed. 
taskCompletedSem.acquire() - assert(executionOfInterruptibleCounter.get() <= 10) + assert(executionOfInterruptibleCounter.get() < numElements) } def testCount() { From 0f2aabc6bc64d2b5d46e59525111bd95fcd73610 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Mon, 9 Apr 2018 11:31:21 -0700 Subject: [PATCH 393/566] [SPARK-23816][CORE] Killed tasks should ignore FetchFailures. SPARK-19276 ensured that FetchFailures do not get swallowed by other layers of exception handling, but it also meant that a killed task could look like a fetch failure. This is particularly a problem with speculative execution, where we expect to kill tasks as they are reading shuffle data. The fix is to ensure that we always check for killed tasks first. Added a new unit test which fails before the fix, ran it 1k times to check for flakiness. Full suite of tests on jenkins. Author: Imran Rashid Closes #20987 from squito/SPARK-23816. (cherry picked from commit 10f45bb8233e6ac838dd4f053052c8556f5b54bd) Signed-off-by: Marcelo Vanzin --- .../org/apache/spark/executor/Executor.scala | 26 +++--- .../apache/spark/executor/ExecutorSuite.scala | 92 +++++++++++++++---- 2 files changed, 88 insertions(+), 30 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 2c3a8ef74800b..a9c31c741abd3 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -480,6 +480,19 @@ private[spark] class Executor( execBackend.statusUpdate(taskId, TaskState.FINISHED, serializedResult) } catch { + case t: TaskKilledException => + logInfo(s"Executor killed $taskName (TID $taskId), reason: ${t.reason}") + setTaskFinishedAndClearInterruptStatus() + execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(TaskKilled(t.reason))) + + case _: InterruptedException | NonFatal(_) if + task != null && task.reasonIfKilled.isDefined => + val killReason = task.reasonIfKilled.getOrElse("unknown reason") + logInfo(s"Executor interrupted and killed $taskName (TID $taskId), reason: $killReason") + setTaskFinishedAndClearInterruptStatus() + execBackend.statusUpdate( + taskId, TaskState.KILLED, ser.serialize(TaskKilled(killReason))) + case t: Throwable if hasFetchFailure && !Utils.isFatalError(t) => val reason = task.context.fetchFailed.get.toTaskFailedReason if (!t.isInstanceOf[FetchFailedException]) { @@ -494,19 +507,6 @@ private[spark] class Executor( setTaskFinishedAndClearInterruptStatus() execBackend.statusUpdate(taskId, TaskState.FAILED, ser.serialize(reason)) - case t: TaskKilledException => - logInfo(s"Executor killed $taskName (TID $taskId), reason: ${t.reason}") - setTaskFinishedAndClearInterruptStatus() - execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(TaskKilled(t.reason))) - - case _: InterruptedException | NonFatal(_) if - task != null && task.reasonIfKilled.isDefined => - val killReason = task.reasonIfKilled.getOrElse("unknown reason") - logInfo(s"Executor interrupted and killed $taskName (TID $taskId), reason: $killReason") - setTaskFinishedAndClearInterruptStatus() - execBackend.statusUpdate( - taskId, TaskState.KILLED, ser.serialize(TaskKilled(killReason))) - case CausedBy(cDE: CommitDeniedException) => val reason = cDE.toTaskCommitDeniedReason setTaskFinishedAndClearInterruptStatus() diff --git a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala index 
105a178f2d94e..1a7bebe2c53cd 100644 --- a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala +++ b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala @@ -22,6 +22,7 @@ import java.lang.Thread.UncaughtExceptionHandler import java.nio.ByteBuffer import java.util.Properties import java.util.concurrent.{CountDownLatch, TimeUnit} +import java.util.concurrent.atomic.AtomicBoolean import scala.collection.mutable.Map import scala.concurrent.duration._ @@ -139,7 +140,7 @@ class ExecutorSuite extends SparkFunSuite with LocalSparkContext with MockitoSug // the fetch failure. The executor should still tell the driver that the task failed due to a // fetch failure, not a generic exception from user code. val inputRDD = new FetchFailureThrowingRDD(sc) - val secondRDD = new FetchFailureHidingRDD(sc, inputRDD, throwOOM = false) + val secondRDD = new FetchFailureHidingRDD(sc, inputRDD, throwOOM = false, interrupt = false) val taskBinary = sc.broadcast(serializer.serialize((secondRDD, resultFunc)).array()) val serializedTaskMetrics = serializer.serialize(TaskMetrics.registered).array() val task = new ResultTask( @@ -173,17 +174,48 @@ class ExecutorSuite extends SparkFunSuite with LocalSparkContext with MockitoSug } test("SPARK-19276: OOMs correctly handled with a FetchFailure") { + val (failReason, uncaughtExceptionHandler) = testFetchFailureHandling(true) + assert(failReason.isInstanceOf[ExceptionFailure]) + val exceptionCaptor = ArgumentCaptor.forClass(classOf[Throwable]) + verify(uncaughtExceptionHandler).uncaughtException(any(), exceptionCaptor.capture()) + assert(exceptionCaptor.getAllValues.size === 1) + assert(exceptionCaptor.getAllValues().get(0).isInstanceOf[OutOfMemoryError]) + } + + test("SPARK-23816: interrupts are not masked by a FetchFailure") { + // If killing the task causes a fetch failure, we still treat it as a task that was killed, + // as the fetch failure could easily be caused by interrupting the thread. + val (failReason, _) = testFetchFailureHandling(false) + assert(failReason.isInstanceOf[TaskKilled]) + } + + /** + * Helper for testing some cases where a FetchFailure should *not* get sent back, because its + * superceded by another error, either an OOM or intentionally killing a task. + * @param oom if true, throw an OOM after the FetchFailure; else, interrupt the task after the + * FetchFailure + */ + private def testFetchFailureHandling( + oom: Boolean): (TaskFailedReason, UncaughtExceptionHandler) = { // when there is a fatal error like an OOM, we don't do normal fetch failure handling, since it // may be a false positive. And we should call the uncaught exception handler. + // SPARK-23816 also handle interrupts the same way, as killing an obsolete speculative task + // does not represent a real fetch failure. val conf = new SparkConf().setMaster("local").setAppName("executor suite test") sc = new SparkContext(conf) val serializer = SparkEnv.get.closureSerializer.newInstance() val resultFunc = (context: TaskContext, itr: Iterator[Int]) => itr.size - // Submit a job where a fetch failure is thrown, but then there is an OOM. We should treat - // the fetch failure as a false positive, and just do normal OOM handling. + // Submit a job where a fetch failure is thrown, but then there is an OOM or interrupt. We + // should treat the fetch failure as a false positive, and do normal OOM or interrupt handling. 
val inputRDD = new FetchFailureThrowingRDD(sc) - val secondRDD = new FetchFailureHidingRDD(sc, inputRDD, throwOOM = true) + if (!oom) { + // we are trying to setup a case where a task is killed after a fetch failure -- this + // is just a helper to coordinate between the task thread and this thread that will + // kill the task + ExecutorSuiteHelper.latches = new ExecutorSuiteHelper() + } + val secondRDD = new FetchFailureHidingRDD(sc, inputRDD, throwOOM = oom, interrupt = !oom) val taskBinary = sc.broadcast(serializer.serialize((secondRDD, resultFunc)).array()) val serializedTaskMetrics = serializer.serialize(TaskMetrics.registered).array() val task = new ResultTask( @@ -200,15 +232,8 @@ class ExecutorSuite extends SparkFunSuite with LocalSparkContext with MockitoSug val serTask = serializer.serialize(task) val taskDescription = createFakeTaskDescription(serTask) - val (failReason, uncaughtExceptionHandler) = - runTaskGetFailReasonAndExceptionHandler(taskDescription) - // make sure the task failure just looks like a OOM, not a fetch failure - assert(failReason.isInstanceOf[ExceptionFailure]) - val exceptionCaptor = ArgumentCaptor.forClass(classOf[Throwable]) - verify(uncaughtExceptionHandler).uncaughtException(any(), exceptionCaptor.capture()) - assert(exceptionCaptor.getAllValues.size === 1) - assert(exceptionCaptor.getAllValues.get(0).isInstanceOf[OutOfMemoryError]) - } + runTaskGetFailReasonAndExceptionHandler(taskDescription, killTask = !oom) + } test("Gracefully handle error in task deserialization") { val conf = new SparkConf @@ -257,22 +282,39 @@ class ExecutorSuite extends SparkFunSuite with LocalSparkContext with MockitoSug } private def runTaskAndGetFailReason(taskDescription: TaskDescription): TaskFailedReason = { - runTaskGetFailReasonAndExceptionHandler(taskDescription)._1 + runTaskGetFailReasonAndExceptionHandler(taskDescription, false)._1 } private def runTaskGetFailReasonAndExceptionHandler( - taskDescription: TaskDescription): (TaskFailedReason, UncaughtExceptionHandler) = { + taskDescription: TaskDescription, + killTask: Boolean): (TaskFailedReason, UncaughtExceptionHandler) = { val mockBackend = mock[ExecutorBackend] val mockUncaughtExceptionHandler = mock[UncaughtExceptionHandler] var executor: Executor = null + val timedOut = new AtomicBoolean(false) try { executor = new Executor("id", "localhost", SparkEnv.get, userClassPath = Nil, isLocal = true, uncaughtExceptionHandler = mockUncaughtExceptionHandler) // the task will be launched in a dedicated worker thread executor.launchTask(mockBackend, taskDescription) + if (killTask) { + val killingThread = new Thread("kill-task") { + override def run(): Unit = { + // wait to kill the task until it has thrown a fetch failure + if (ExecutorSuiteHelper.latches.latch1.await(10, TimeUnit.SECONDS)) { + // now we can kill the task + executor.killAllTasks(true, "Killed task, eg. 
because of speculative execution") + } else { + timedOut.set(true) + } + } + } + killingThread.start() + } eventually(timeout(5.seconds), interval(10.milliseconds)) { assert(executor.numRunningTasks === 0) } + assert(!timedOut.get(), "timed out waiting to be ready to kill tasks") } finally { if (executor != null) { executor.stop() @@ -282,8 +324,9 @@ class ExecutorSuite extends SparkFunSuite with LocalSparkContext with MockitoSug val statusCaptor = ArgumentCaptor.forClass(classOf[ByteBuffer]) orderedMock.verify(mockBackend) .statusUpdate(meq(0L), meq(TaskState.RUNNING), statusCaptor.capture()) + val finalState = if (killTask) TaskState.KILLED else TaskState.FAILED orderedMock.verify(mockBackend) - .statusUpdate(meq(0L), meq(TaskState.FAILED), statusCaptor.capture()) + .statusUpdate(meq(0L), meq(finalState), statusCaptor.capture()) // first statusUpdate for RUNNING has empty data assert(statusCaptor.getAllValues().get(0).remaining() === 0) // second update is more interesting @@ -321,7 +364,8 @@ class SimplePartition extends Partition { class FetchFailureHidingRDD( sc: SparkContext, val input: FetchFailureThrowingRDD, - throwOOM: Boolean) extends RDD[Int](input) { + throwOOM: Boolean, + interrupt: Boolean) extends RDD[Int](input) { override def compute(split: Partition, context: TaskContext): Iterator[Int] = { val inItr = input.compute(split, context) try { @@ -330,6 +374,15 @@ class FetchFailureHidingRDD( case t: Throwable => if (throwOOM) { throw new OutOfMemoryError("OOM while handling another exception") + } else if (interrupt) { + // make sure our test is setup correctly + assert(TaskContext.get().asInstanceOf[TaskContextImpl].fetchFailed.isDefined) + // signal our test is ready for the task to get killed + ExecutorSuiteHelper.latches.latch1.countDown() + // then wait for another thread in the test to kill the task -- this latch + // is never actually decremented, we just wait to get killed. + ExecutorSuiteHelper.latches.latch2.await(10, TimeUnit.SECONDS) + throw new IllegalStateException("timed out waiting to be interrupted") } else { throw new RuntimeException("User Exception that hides the original exception", t) } @@ -352,6 +405,11 @@ private class ExecutorSuiteHelper { @volatile var testFailedReason: TaskFailedReason = _ } +// helper for coordinating killing tasks +private object ExecutorSuiteHelper { + var latches: ExecutorSuiteHelper = null +} + private class NonDeserializableTask extends FakeTask(0, 0) with Externalizable { def writeExternal(out: ObjectOutput): Unit = {} def readExternal(in: ObjectInput): Unit = { From 320269e49d11e0a9de4c8f01210d92227a37fe8d Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Wed, 11 Apr 2018 19:44:01 +0800 Subject: [PATCH 394/566] [MINOR][DOCS] Fix R documentation generation instruction for roxygen2 ## What changes were proposed in this pull request? This PR proposes to fix `roxygen2` to `5.0.1` in `docs/README.md` for SparkR documentation generation. If I use higher version and creates the doc, it shows the diff below. Not a big deal but it bothered me. ```diff diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 855eb5bf77f..159fca61e06 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION -57,6 +57,6 Collate: 'types.R' 'utils.R' 'window.R' -RoxygenNote: 5.0.1 +RoxygenNote: 6.0.1 VignetteBuilder: knitr NeedsCompilation: no ``` ## How was this patch tested? Manually tested. I met this every time I set the new environment for Spark dev but I have kept forgetting to fix it. Author: hyukjinkwon Closes #21020 from HyukjinKwon/minor-r-doc. 
(cherry picked from commit 87611bba222a95158fc5b638a566bdf47346da8e) Signed-off-by: hyukjinkwon Signed-off-by: hyukjinkwon --- docs/README.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/docs/README.md b/docs/README.md index 225bb1b2040de..166a7e572982d 100644 --- a/docs/README.md +++ b/docs/README.md @@ -22,10 +22,13 @@ $ sudo gem install jekyll jekyll-redirect-from pygments.rb $ sudo pip install Pygments # Following is needed only for generating API docs $ sudo pip install sphinx pypandoc mkdocs -$ sudo Rscript -e 'install.packages(c("knitr", "devtools", "roxygen2", "testthat", "rmarkdown"), repos="http://cran.stat.ucla.edu/")' +$ sudo Rscript -e 'install.packages(c("knitr", "devtools", "testthat", "rmarkdown"), repos="http://cran.stat.ucla.edu/")' +$ sudo Rscript -e 'devtools::install_version("roxygen2", version = "5.0.1", repos="http://cran.stat.ucla.edu/")' ``` -(Note: If you are on a system with both Ruby 1.9 and Ruby 2.0 you may need to replace gem with gem2.0) +Note: If you are on a system with both Ruby 1.9 and Ruby 2.0 you may need to replace gem with gem2.0. + +Note: Other versions of roxygen2 might work in SparkR documentation generation but `RoxygenNote` field in `$SPARK_HOME/R/pkg/DESCRIPTION` is 5.0.1, which is updated if the version is mismatched. ## Generating the Documentation HTML @@ -62,12 +65,12 @@ $ PRODUCTION=1 jekyll build ## API Docs (Scaladoc, Javadoc, Sphinx, roxygen2, MkDocs) -You can build just the Spark scaladoc and javadoc by running `build/sbt unidoc` from the `SPARK_HOME` directory. +You can build just the Spark scaladoc and javadoc by running `build/sbt unidoc` from the `$SPARK_HOME` directory. Similarly, you can build just the PySpark docs by running `make html` from the -`SPARK_HOME/python/docs` directory. Documentation is only generated for classes that are listed as -public in `__init__.py`. The SparkR docs can be built by running `SPARK_HOME/R/create-docs.sh`, and -the SQL docs can be built by running `SPARK_HOME/sql/create-docs.sh` +`$SPARK_HOME/python/docs` directory. Documentation is only generated for classes that are listed as +public in `__init__.py`. The SparkR docs can be built by running `$SPARK_HOME/R/create-docs.sh`, and +the SQL docs can be built by running `$SPARK_HOME/sql/create-docs.sh` after [building Spark](https://github.com/apache/spark#building-spark) first. When you run `jekyll build` in the `docs` directory, it will also copy over the scaladoc and javadoc for the various From acfc156df551632007a47b7ec7a7c901a713082d Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Wed, 11 Apr 2018 11:41:50 -0700 Subject: [PATCH 395/566] [SPARK-22883][ML] ML test for StructuredStreaming: spark.ml.feature, I-M This backports https://github.com/apache/spark/pull/20964 to branch-2.3. ## What changes were proposed in this pull request? Adds structured streaming tests using testTransformer for these suites: * IDF * Imputer * Interaction * MaxAbsScaler * MinHashLSH * MinMaxScaler * NGram ## How was this patch tested? It is a bunch of tests! Author: Joseph K. Bradley Author: Joseph K. Bradley Closes #21042 from jkbradley/SPARK-22883-part2-2.3backport. 
--- .../apache/spark/ml/feature/IDFSuite.scala | 14 +++--- .../spark/ml/feature/ImputerSuite.scala | 31 ++++++++++--- .../spark/ml/feature/InteractionSuite.scala | 46 ++++++++++--------- .../spark/ml/feature/MaxAbsScalerSuite.scala | 14 +++--- .../spark/ml/feature/MinHashLSHSuite.scala | 25 ++++++++-- .../spark/ml/feature/MinMaxScalerSuite.scala | 14 +++--- .../apache/spark/ml/feature/NGramSuite.scala | 2 +- 7 files changed, 89 insertions(+), 57 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/IDFSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/IDFSuite.scala index 005edf73d29be..cdd62be43b54c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/IDFSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/IDFSuite.scala @@ -17,17 +17,15 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel} import org.apache.spark.mllib.linalg.VectorImplicits._ -import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row -class IDFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class IDFSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -57,7 +55,7 @@ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRead Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(numOfFeatures, Array(1), Array(1.0)) ) - val numOfData = data.size + val numOfData = data.length val idf = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((numOfData + 1.0) / (x + 1.0)) }) @@ -72,7 +70,7 @@ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRead MLTestingUtils.checkCopyAndUids(idfEst, idfModel) - idfModel.transform(df).select("idfValue", "expected").collect().foreach { + testTransformer[(Vector, Vector)](df, idfModel, "idfValue", "expected") { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } @@ -85,7 +83,7 @@ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRead Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(numOfFeatures, Array(1), Array(1.0)) ) - val numOfData = data.size + val numOfData = data.length val idf = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0 }) @@ -99,7 +97,7 @@ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRead .setMinDocFreq(1) .fit(df) - idfModel.transform(df).select("idfValue", "expected").collect().foreach { + testTransformer[(Vector, Vector)](df, idfModel, "idfValue", "expected") { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index c08b35b419266..75f63a623e6d8 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -16,13 +16,12 @@ */ package org.apache.spark.ml.feature -import org.apache.spark.{SparkException, 
SparkFunSuite} -import org.apache.spark.ml.util.DefaultReadWriteTest -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.SparkException +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Row} -class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class ImputerSuite extends MLTest with DefaultReadWriteTest { test("Imputer for Double with default missing Value NaN") { val df = spark.createDataFrame( Seq( @@ -76,6 +75,28 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default ImputerSuite.iterateStrategyTest(imputer, df) } + test("Imputer should work with Structured Streaming") { + val localSpark = spark + import localSpark.implicits._ + val df = Seq[(java.lang.Double, Double)]( + (4.0, 4.0), + (10.0, 10.0), + (10.0, 10.0), + (Double.NaN, 8.0), + (null, 8.0) + ).toDF("value", "expected_mean_value") + val imputer = new Imputer() + .setInputCols(Array("value")) + .setOutputCols(Array("out")) + .setStrategy("mean") + val model = imputer.fit(df) + testTransformer[(java.lang.Double, Double)](df, model, "expected_mean_value", "out") { + case Row(exp: java.lang.Double, out: Double) => + assert((exp.isNaN && out.isNaN) || (exp == out), + s"Imputed values differ. Expected: $exp, actual: $out") + } + } + test("Imputer throws exception when surrogate cannot be computed") { val df = spark.createDataFrame( Seq( (0, Double.NaN, 1.0, 1.0), @@ -164,8 +185,6 @@ object ImputerSuite { * @param df DataFrame with columns "id", "value", "expected_mean", "expected_median" */ def iterateStrategyTest(imputer: Imputer, df: DataFrame): Unit = { - val inputCols = imputer.getInputCols - Seq("mean", "median").foreach { strategy => imputer.setStrategy(strategy) val model = imputer.fit(df) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/InteractionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/InteractionSuite.scala index 54f059e5f143e..eea31fc7ae3f2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/InteractionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/InteractionSuite.scala @@ -19,15 +19,15 @@ package org.apache.spark.ml.feature import scala.collection.mutable.ArrayBuilder -import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.SparkException import org.apache.spark.ml.attribute._ import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.DefaultReadWriteTest -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} +import org.apache.spark.sql.Row import org.apache.spark.sql.functions.col -class InteractionSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class InteractionSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -63,9 +63,9 @@ class InteractionSuite extends SparkFunSuite with MLlibTestSparkContext with Def test("numeric interaction") { val data = Seq( - (2, Vectors.dense(3.0, 4.0)), - (1, Vectors.dense(1.0, 5.0)) - ).toDF("a", "b") + (2, Vectors.dense(3.0, 4.0), Vectors.dense(6.0, 8.0)), + (1, Vectors.dense(1.0, 5.0), Vectors.dense(1.0, 5.0)) + ).toDF("a", "b", "expected") val groupAttr = new AttributeGroup( "b", Array[Attribute]( @@ -73,14 +73,15 @@ class InteractionSuite extends SparkFunSuite with MLlibTestSparkContext with Def 
NumericAttribute.defaultAttr.withName("bar"))) val df = data.select( col("a").as("a", NumericAttribute.defaultAttr.toMetadata()), - col("b").as("b", groupAttr.toMetadata())) + col("b").as("b", groupAttr.toMetadata()), + col("expected")) val trans = new Interaction().setInputCols(Array("a", "b")).setOutputCol("features") + testTransformer[(Int, Vector, Vector)](df, trans, "features", "expected") { + case Row(features: Vector, expected: Vector) => + assert(features === expected) + } + val res = trans.transform(df) - val expected = Seq( - (2, Vectors.dense(3.0, 4.0), Vectors.dense(6.0, 8.0)), - (1, Vectors.dense(1.0, 5.0), Vectors.dense(1.0, 5.0)) - ).toDF("a", "b", "features") - assert(res.collect() === expected.collect()) val attrs = AttributeGroup.fromStructField(res.schema("features")) val expectedAttrs = new AttributeGroup( "features", @@ -92,9 +93,9 @@ class InteractionSuite extends SparkFunSuite with MLlibTestSparkContext with Def test("nominal interaction") { val data = Seq( - (2, Vectors.dense(3.0, 4.0)), - (1, Vectors.dense(1.0, 5.0)) - ).toDF("a", "b") + (2, Vectors.dense(3.0, 4.0), Vectors.dense(0, 0, 0, 0, 3, 4)), + (1, Vectors.dense(1.0, 5.0), Vectors.dense(0, 0, 1, 5, 0, 0)) + ).toDF("a", "b", "expected") val groupAttr = new AttributeGroup( "b", Array[Attribute]( @@ -103,14 +104,15 @@ class InteractionSuite extends SparkFunSuite with MLlibTestSparkContext with Def val df = data.select( col("a").as( "a", NominalAttribute.defaultAttr.withValues(Array("up", "down", "left")).toMetadata()), - col("b").as("b", groupAttr.toMetadata())) + col("b").as("b", groupAttr.toMetadata()), + col("expected")) val trans = new Interaction().setInputCols(Array("a", "b")).setOutputCol("features") + testTransformer[(Int, Vector, Vector)](df, trans, "features", "expected") { + case Row(features: Vector, expected: Vector) => + assert(features === expected) + } + val res = trans.transform(df) - val expected = Seq( - (2, Vectors.dense(3.0, 4.0), Vectors.dense(0, 0, 0, 0, 3, 4)), - (1, Vectors.dense(1.0, 5.0), Vectors.dense(0, 0, 1, 5, 0, 0)) - ).toDF("a", "b", "features") - assert(res.collect() === expected.collect()) val attrs = AttributeGroup.fromStructField(res.schema("features")) val expectedAttrs = new AttributeGroup( "features", diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/MaxAbsScalerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MaxAbsScalerSuite.scala index 918da4f9388d4..8dd0f0cb91e37 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/MaxAbsScalerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MaxAbsScalerSuite.scala @@ -14,15 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.sql.Row -class MaxAbsScalerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class MaxAbsScalerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -45,9 +44,10 @@ class MaxAbsScalerSuite extends SparkFunSuite with MLlibTestSparkContext with De .setOutputCol("scaled") val model = scaler.fit(df) - model.transform(df).select("expected", "scaled").collect() - .foreach { case Row(vector1: Vector, vector2: Vector) => - assert(vector1.equals(vector2), s"MaxAbsScaler ut error: $vector2 should be $vector1") + testTransformer[(Vector, Vector)](df, model, "expected", "scaled") { + case Row(expectedVec: Vector, actualVec: Vector) => + assert(expectedVec === actualVec, + s"MaxAbsScaler error: Expected $expectedVec but computed $actualVec") } MLTestingUtils.checkCopyAndUids(scaler, model) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashLSHSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashLSHSuite.scala index 96df68dbdf053..085070a1098d7 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashLSHSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashLSHSuite.scala @@ -17,14 +17,13 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.Dataset +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} +import org.apache.spark.sql.{Dataset, Row} -class MinHashLSHSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + +class MinHashLSHSuite extends MLTest with DefaultReadWriteTest { @transient var dataset: Dataset[_] = _ @@ -167,4 +166,20 @@ class MinHashLSHSuite extends SparkFunSuite with MLlibTestSparkContext with Defa assert(precision == 1.0) assert(recall >= 0.7) } + + test("MinHashLSHModel.transform should work with Structured Streaming") { + val localSpark = spark + import localSpark.implicits._ + + val model = new MinHashLSHModel("mh", randCoefficients = Array((1, 0))) + model.set(model.inputCol, "keys") + testTransformer[Tuple1[Vector]](dataset.toDF(), model, "keys", model.getOutputCol) { + case Row(_: Vector, output: Seq[_]) => + assert(output.length === model.randCoefficients.length) + // no AND-amplification yet: SPARK-18450, so each hash output is of length 1 + output.foreach { + case hashOutput: Vector => assert(hashOutput.size === 1) + } + } + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/MinMaxScalerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MinMaxScalerSuite.scala index 51db74eb739ca..2d965f2ca2c54 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/MinMaxScalerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MinMaxScalerSuite.scala @@ -17,13 +17,11 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} -import 
org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.sql.Row -class MinMaxScalerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class MinMaxScalerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -48,9 +46,9 @@ class MinMaxScalerSuite extends SparkFunSuite with MLlibTestSparkContext with De .setMax(5) val model = scaler.fit(df) - model.transform(df).select("expected", "scaled").collect() - .foreach { case Row(vector1: Vector, vector2: Vector) => - assert(vector1.equals(vector2), "Transformed vector is different with expected.") + testTransformer[(Vector, Vector)](df, model, "expected", "scaled") { + case Row(vector1: Vector, vector2: Vector) => + assert(vector1 === vector2, "Transformed vector is different with expected.") } MLTestingUtils.checkCopyAndUids(scaler, model) @@ -114,7 +112,7 @@ class MinMaxScalerSuite extends SparkFunSuite with MLlibTestSparkContext with De val model = scaler.fit(df) model.transform(df).select("expected", "scaled").collect() .foreach { case Row(vector1: Vector, vector2: Vector) => - assert(vector1.equals(vector2), "Transformed vector is different with expected.") + assert(vector1 === vector2, "Transformed vector is different with expected.") } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala index e5956ee9942aa..201a335e0d7be 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala @@ -84,7 +84,7 @@ class NGramSuite extends MLTest with DefaultReadWriteTest { def testNGram(t: NGram, dataFrame: DataFrame): Unit = { testTransformer[(Seq[String], Seq[String])](dataFrame, t, "nGrams", "wantedNGrams") { - case Row(actualNGrams : Seq[String], wantedNGrams: Seq[String]) => + case Row(actualNGrams : Seq[_], wantedNGrams: Seq[_]) => assert(actualNGrams === wantedNGrams) } } From 03a4dfd6901595fb4622587e3387ec1ac8d77237 Mon Sep 17 00:00:00 2001 From: JBauerKogentix <37910022+JBauerKogentix@users.noreply.github.com> Date: Wed, 11 Apr 2018 15:52:13 -0700 Subject: [PATCH 396/566] typo rawPredicition changed to rawPrediction MultilayerPerceptronClassifier had 4 occurrences ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Please review http://spark.apache.org/contributing.html before opening a pull request. Author: JBauerKogentix <37910022+JBauerKogentix@users.noreply.github.com> Closes #21030 from JBauerKogentix/patch-1. (cherry picked from commit 9d960de0814a1128318676cc2e91f447cdf0137f) Signed-off-by: Joseph K. 
Bradley --- python/pyspark/ml/classification.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 27ad1e80aa0d3..55d603070706b 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -1542,12 +1542,12 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, solver="l-bfgs", initialWeights=None, probabilityCol="probability", - rawPredicitionCol="rawPrediction"): + rawPredictionCol="rawPrediction"): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, \ solver="l-bfgs", initialWeights=None, probabilityCol="probability", \ - rawPredicitionCol="rawPrediction") + rawPredictionCol="rawPrediction") """ super(MultilayerPerceptronClassifier, self).__init__() self._java_obj = self._new_java_obj( @@ -1561,12 +1561,12 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, solver="l-bfgs", initialWeights=None, probabilityCol="probability", - rawPredicitionCol="rawPrediction"): + rawPredictionCol="rawPrediction"): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, \ solver="l-bfgs", initialWeights=None, probabilityCol="probability", \ - rawPredicitionCol="rawPrediction"): + rawPredictionCol="rawPrediction"): Sets params for MultilayerPerceptronClassifier. """ kwargs = self._input_kwargs From 571269519a49e0651575de18de81b788b6548afd Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Thu, 12 Apr 2018 15:58:04 +0800 Subject: [PATCH 397/566] [SPARK-23962][SQL][TEST] Fix race in currentExecutionIds(). SQLMetricsTestUtils.currentExecutionIds() was racing with the listener bus, which lead to some flaky tests. We should wait till the listener bus is empty. I tested by adding some Thread.sleep()s in SQLAppStatusListener, which reproduced the exceptions I saw on Jenkins. With this change, they went away. Author: Imran Rashid Closes #21041 from squito/SPARK-23962. 
(cherry picked from commit 6a2289ecf020a99cd9b3bcea7da5e78fb4e0303a) Signed-off-by: Wenchen Fan --- .../apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala index 122d28798136f..e22bbb642642a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala @@ -34,6 +34,7 @@ trait SQLMetricsTestUtils extends SQLTestUtils { import testImplicits._ protected def currentExecutionIds(): Set[Long] = { + spark.sparkContext.listenerBus.waitUntilEmpty(10000) statusStore.executionsList.map(_.executionId).toSet } From 908c681c6786ef0d772a43508285cb8891fc524a Mon Sep 17 00:00:00 2001 From: Patrick Pisciuneri Date: Fri, 13 Apr 2018 09:45:27 +0800 Subject: [PATCH 398/566] [SPARK-23867][SCHEDULER] use droppedCount in logWarning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? Get the count of dropped events for output in log message. ## How was this patch tested? The fix is pretty trivial, but `./dev/run-tests` were run and were successful. Please review http://spark.apache.org/contributing.html before opening a pull request. vanzin cloud-fan The contribution is my original work and I license the work to the project under the project’s open source license. Author: Patrick Pisciuneri Closes #20977 from phpisciuneri/fix-log-warning. (cherry picked from commit 682002b6da844ed11324ee5ff4d00fc0294c0b31) Signed-off-by: Wenchen Fan --- .../main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala b/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala index 7e14938acd8e0..c1fedd63f6a90 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala @@ -166,7 +166,7 @@ private class AsyncEventQueue(val name: String, conf: SparkConf, metrics: LiveLi val prevLastReportTimestamp = lastReportTimestamp lastReportTimestamp = System.currentTimeMillis() val previous = new java.util.Date(prevLastReportTimestamp) - logWarning(s"Dropped $droppedEvents events from $name since $previous.") + logWarning(s"Dropped $droppedCount events from $name since $previous.") } } } From 2995b79d6a78bf632aa4c1c99bebfc213fb31c54 Mon Sep 17 00:00:00 2001 From: jerryshao Date: Thu, 12 Apr 2018 20:00:25 -0700 Subject: [PATCH 399/566] [SPARK-23748][SS] Fix SS continuous process doesn't support SubqueryAlias issue ## What changes were proposed in this pull request? Current SS continuous doesn't support processing on temp table or `df.as("xxx")`, SS will throw an exception as LogicalPlan not supported, details described in [here](https://issues.apache.org/jira/browse/SPARK-23748). So here propose to add this support. ## How was this patch tested? new UT. Author: jerryshao Closes #21017 from jerryshao/SPARK-23748. 
(cherry picked from commit 14291b061b9b40eadbf4ed442f9a5021b8e09597) Signed-off-by: Tathagata Das --- .../UnsupportedOperationChecker.scala | 2 +- .../continuous/ContinuousSuite.scala | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala index b55043c270644..ff9d6d7a7dded 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala @@ -345,7 +345,7 @@ object UnsupportedOperationChecker { plan.foreachUp { implicit subPlan => subPlan match { case (_: Project | _: Filter | _: MapElements | _: MapPartitions | - _: DeserializeToObject | _: SerializeFromObject) => + _: DeserializeToObject | _: SerializeFromObject | _: SubqueryAlias) => case node if node.nodeName == "StreamingRelationV2" => case node => throwError(s"Continuous processing does not support ${node.nodeName} operations.") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala index 4b4ed82dc6520..95406b3a8b468 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala @@ -174,6 +174,25 @@ class ContinuousSuite extends ContinuousSuiteBase { "Continuous processing does not support current time operations.")) } + test("subquery alias") { + val df = spark.readStream + .format("rate") + .option("numPartitions", "5") + .option("rowsPerSecond", "5") + .load() + .createOrReplaceTempView("rate") + val test = spark.sql("select value from rate where value > 5") + + testStream(test, useV2Sink = true)( + StartStream(longContinuousTrigger), + AwaitEpoch(0), + Execute(waitForRateSourceTriggers(_, 2)), + IncrementEpoch(), + Execute(waitForRateSourceTriggers(_, 4)), + IncrementEpoch(), + CheckAnswerRowsContains(scala.Range(6, 20).map(Row(_)))) + } + test("repeatedly restart") { val df = spark.readStream .format("rate") From dfdf1bb9be19bd31e398f97310391b391fabfcfd Mon Sep 17 00:00:00 2001 From: Fangshi Li Date: Fri, 13 Apr 2018 13:46:34 +0800 Subject: [PATCH 400/566] [SPARK-23815][CORE] Spark writer dynamic partition overwrite mode may fail to write output on multi level partition ## What changes were proposed in this pull request? Spark introduced new writer mode to overwrite only related partitions in SPARK-20236. While we are using this feature in our production cluster, we found a bug when writing multi-level partitions on HDFS. A simple test case to reproduce this issue: val df = Seq(("1","2","3")).toDF("col1", "col2","col3") df.write.partitionBy("col1","col2").mode("overwrite").save("/my/hdfs/location") If HDFS location "/my/hdfs/location" does not exist, there will be no output. This seems to be caused by the job commit change in SPARK-20236 in HadoopMapReduceCommitProtocol. In the commit job process, the output has been written into staging dir /my/hdfs/location/.spark-staging.xxx/col1=1/col2=2, and then the code calls fs.rename to rename /my/hdfs/location/.spark-staging.xxx/col1=1/col2=2 to /my/hdfs/location/col1=1/col2=2. 
However, in our case the operation will fail on HDFS because /my/hdfs/location/col1=1 does not exists. HDFS rename can not create directory for more than one level. This does not happen in the new unit test added with SPARK-20236 which uses local file system. We are proposing a fix. When cleaning current partition dir /my/hdfs/location/col1=1/col2=2 before the rename op, if the delete op fails (because /my/hdfs/location/col1=1/col2=2 may not exist), we call mkdirs op to create the parent dir /my/hdfs/location/col1=1 (if the parent dir does not exist) so the following rename op can succeed. Reference: in official HDFS document(https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/filesystem/filesystem.html), the rename command has precondition "dest must be root, or have a parent that exists" ## How was this patch tested? We have tested this patch on our production cluster and it fixed the problem Author: Fangshi Li Closes #20931 from fangshil/master. (cherry picked from commit 4b07036799b01894826b47c73142fe282c607a57) Signed-off-by: Wenchen Fan --- .../internal/io/HadoopMapReduceCommitProtocol.scala | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 6d20ef1f98a3c..3e60c50ada59b 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -186,7 +186,17 @@ class HadoopMapReduceCommitProtocol( logDebug(s"Clean up default partition directories for overwriting: $partitionPaths") for (part <- partitionPaths) { val finalPartPath = new Path(path, part) - fs.delete(finalPartPath, true) + if (!fs.delete(finalPartPath, true) && !fs.exists(finalPartPath.getParent)) { + // According to the official hadoop FileSystem API spec, delete op should assume + // the destination is no longer present regardless of return value, thus we do not + // need to double check if finalPartPath exists before rename. + // Also in our case, based on the spec, delete returns false only when finalPartPath + // does not exist. When this happens, we need to take action if parent of finalPartPath + // also does not exist(e.g. the scenario described on SPARK-23815), because + // FileSystem API spec on rename op says the rename dest(finalPartPath) must have + // a parent that exists, otherwise we may get unexpected result on the rename. + fs.mkdirs(finalPartPath.getParent) + } fs.rename(new Path(stagingDir, part), finalPartPath) } } From d4f204c5321cdc3955a48e9717ba06aaebbc2ab4 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 14 Apr 2018 21:44:06 +0800 Subject: [PATCH 401/566] [SPARK-23942][PYTHON][SQL][BRANCH-2.3] Makes collect in PySpark as action for a query executor listener ## What changes were proposed in this pull request? This PR proposes to add `collect` to a query executor as an action. Seems `collect` / `collect` with Arrow are not recognised via `QueryExecutionListener` as an action. 
For example, if we have a custom listener as below: ```scala package org.apache.spark.sql import org.apache.spark.internal.Logging import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.util.QueryExecutionListener class TestQueryExecutionListener extends QueryExecutionListener with Logging { override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = { logError("Look at me! I'm 'onSuccess'") } override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { } } ``` and set `spark.sql.queryExecutionListeners` to `org.apache.spark.sql.TestQueryExecutionListener` Other operations in PySpark or Scala side seems fine: ```python >>> sql("SELECT * FROM range(1)").show() ``` ``` 18/04/09 17:02:04 ERROR TestQueryExecutionListener: Look at me! I'm 'onSuccess' +---+ | id| +---+ | 0| +---+ ``` ```scala scala> sql("SELECT * FROM range(1)").collect() ``` ``` 18/04/09 16:58:41 ERROR TestQueryExecutionListener: Look at me! I'm 'onSuccess' res1: Array[org.apache.spark.sql.Row] = Array([0]) ``` but .. **Before** ```python >>> sql("SELECT * FROM range(1)").collect() ``` ``` [Row(id=0)] ``` ```python >>> spark.conf.set("spark.sql.execution.arrow.enabled", "true") >>> sql("SELECT * FROM range(1)").toPandas() ``` ``` id 0 0 ``` **After** ```python >>> sql("SELECT * FROM range(1)").collect() ``` ``` 18/04/09 16:57:58 ERROR TestQueryExecutionListener: Look at me! I'm 'onSuccess' [Row(id=0)] ``` ```python >>> spark.conf.set("spark.sql.execution.arrow.enabled", "true") >>> sql("SELECT * FROM range(1)").toPandas() ``` ``` 18/04/09 17:53:26 ERROR TestQueryExecutionListener: Look at me! I'm 'onSuccess' id 0 0 ``` ## How was this patch tested? I have manually tested as described above and unit test was added. Author: hyukjinkwon Closes #21060 from HyukjinKwon/PR_TOOL_PICK_PR_21007_BRANCH-2.3. --- python/pyspark/sql/tests.py | 99 ++++++++++++++++--- .../scala/org/apache/spark/sql/Dataset.scala | 20 ++-- .../sql/TestQueryExecutionListener.scala | 44 +++++++++ 3 files changed, 140 insertions(+), 23 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/TestQueryExecutionListener.scala diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index d806e5d92495c..daa97e1627eec 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -185,22 +185,12 @@ def __init__(self, key, value): self.value = value -class ReusedSQLTestCase(ReusedPySparkTestCase): - @classmethod - def setUpClass(cls): - ReusedPySparkTestCase.setUpClass() - cls.spark = SparkSession(cls.sc) - - @classmethod - def tearDownClass(cls): - ReusedPySparkTestCase.tearDownClass() - cls.spark.stop() - - def assertPandasEqual(self, expected, result): - msg = ("DataFrames are not equal: " + - "\n\nExpected:\n%s\n%s" % (expected, expected.dtypes) + - "\n\nResult:\n%s\n%s" % (result, result.dtypes)) - self.assertTrue(expected.equals(result), msg=msg) +class SQLTestUtils(object): + """ + This util assumes the instance of this to have 'spark' attribute, having a spark session. + It is usually used with 'ReusedSQLTestCase' class but can be used if you feel sure the + the implementation of this class has 'spark' attribute. + """ @contextmanager def sql_conf(self, pairs): @@ -209,6 +199,7 @@ def sql_conf(self, pairs): `value` to the configuration `key` and then restores it back when it exits. """ assert isinstance(pairs, dict), "pairs should be a dictionary." 
+ assert hasattr(self, "spark"), "it should have 'spark' attribute, having a spark session." keys = pairs.keys() new_values = pairs.values() @@ -225,6 +216,24 @@ def sql_conf(self, pairs): self.spark.conf.set(key, old_value) +class ReusedSQLTestCase(ReusedPySparkTestCase, SQLTestUtils): + @classmethod + def setUpClass(cls): + ReusedPySparkTestCase.setUpClass() + cls.spark = SparkSession(cls.sc) + + @classmethod + def tearDownClass(cls): + ReusedPySparkTestCase.tearDownClass() + cls.spark.stop() + + def assertPandasEqual(self, expected, result): + msg = ("DataFrames are not equal: " + + "\n\nExpected:\n%s\n%s" % (expected, expected.dtypes) + + "\n\nResult:\n%s\n%s" % (result, result.dtypes)) + self.assertTrue(expected.equals(result), msg=msg) + + class DataTypeTests(unittest.TestCase): # regression test for SPARK-6055 def test_data_type_eq(self): @@ -2980,6 +2989,64 @@ def test_sparksession_with_stopped_sparkcontext(self): sc.stop() +class QueryExecutionListenerTests(unittest.TestCase, SQLTestUtils): + # These tests are separate because it uses 'spark.sql.queryExecutionListeners' which is + # static and immutable. This can't be set or unset, for example, via `spark.conf`. + + @classmethod + def setUpClass(cls): + import glob + from pyspark.find_spark_home import _find_spark_home + + SPARK_HOME = _find_spark_home() + filename_pattern = ( + "sql/core/target/scala-*/test-classes/org/apache/spark/sql/" + "TestQueryExecutionListener.class") + if not glob.glob(os.path.join(SPARK_HOME, filename_pattern)): + raise unittest.SkipTest( + "'org.apache.spark.sql.TestQueryExecutionListener' is not " + "available. Will skip the related tests.") + + # Note that 'spark.sql.queryExecutionListeners' is a static immutable configuration. + cls.spark = SparkSession.builder \ + .master("local[4]") \ + .appName(cls.__name__) \ + .config( + "spark.sql.queryExecutionListeners", + "org.apache.spark.sql.TestQueryExecutionListener") \ + .getOrCreate() + + @classmethod + def tearDownClass(cls): + cls.spark.stop() + + def tearDown(self): + self.spark._jvm.OnSuccessCall.clear() + + def test_query_execution_listener_on_collect(self): + self.assertFalse( + self.spark._jvm.OnSuccessCall.isCalled(), + "The callback from the query execution listener should not be called before 'collect'") + self.spark.sql("SELECT * FROM range(1)").collect() + self.assertTrue( + self.spark._jvm.OnSuccessCall.isCalled(), + "The callback from the query execution listener should be called after 'collect'") + + @unittest.skipIf( + not _have_pandas or not _have_pyarrow, + _pandas_requirement_message or _pyarrow_requirement_message) + def test_query_execution_listener_on_collect_with_arrow(self): + with self.sql_conf({"spark.sql.execution.arrow.enabled": True}): + self.assertFalse( + self.spark._jvm.OnSuccessCall.isCalled(), + "The callback from the query execution listener should not be " + "called before 'toPandas'") + self.spark.sql("SELECT * FROM range(1)").toPandas() + self.assertTrue( + self.spark._jvm.OnSuccessCall.isCalled(), + "The callback from the query execution listener should be called after 'toPandas'") + + class UDFInitializationTests(unittest.TestCase): def tearDown(self): if SparkSession._instantiatedSession is not None: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 0aee1d7be5788..917168162b236 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -3189,10 
+3189,10 @@ class Dataset[T] private[sql]( private[sql] def collectToPython(): Int = { EvaluatePython.registerPicklers() - withNewExecutionId { + withAction("collectToPython", queryExecution) { plan => val toJava: (Any) => Any = EvaluatePython.toJava(_, schema) - val iter = new SerDeUtil.AutoBatchedPickler( - queryExecution.executedPlan.executeCollect().iterator.map(toJava)) + val iter: Iterator[Array[Byte]] = new SerDeUtil.AutoBatchedPickler( + plan.executeCollect().iterator.map(toJava)) PythonRDD.serveIterator(iter, "serve-DataFrame") } } @@ -3201,8 +3201,9 @@ class Dataset[T] private[sql]( * Collect a Dataset as ArrowPayload byte arrays and serve to PySpark. */ private[sql] def collectAsArrowToPython(): Int = { - withNewExecutionId { - val iter = toArrowPayload.collect().iterator.map(_.asPythonSerializable) + withAction("collectAsArrowToPython", queryExecution) { plan => + val iter: Iterator[Array[Byte]] = + toArrowPayload(plan).collect().iterator.map(_.asPythonSerializable) PythonRDD.serveIterator(iter, "serve-Arrow") } } @@ -3311,14 +3312,19 @@ class Dataset[T] private[sql]( } /** Convert to an RDD of ArrowPayload byte arrays */ - private[sql] def toArrowPayload: RDD[ArrowPayload] = { + private[sql] def toArrowPayload(plan: SparkPlan): RDD[ArrowPayload] = { val schemaCaptured = this.schema val maxRecordsPerBatch = sparkSession.sessionState.conf.arrowMaxRecordsPerBatch val timeZoneId = sparkSession.sessionState.conf.sessionLocalTimeZone - queryExecution.toRdd.mapPartitionsInternal { iter => + plan.execute().mapPartitionsInternal { iter => val context = TaskContext.get() ArrowConverters.toPayloadIterator( iter, schemaCaptured, maxRecordsPerBatch, timeZoneId, context) } } + + // This is only used in tests, for now. + private[sql] def toArrowPayload: RDD[ArrowPayload] = { + toArrowPayload(queryExecution.executedPlan) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestQueryExecutionListener.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestQueryExecutionListener.scala new file mode 100644 index 0000000000000..d2a6358ee822b --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/TestQueryExecutionListener.scala @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import java.util.concurrent.atomic.AtomicBoolean + +import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.util.QueryExecutionListener + + +class TestQueryExecutionListener extends QueryExecutionListener { + override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = { + OnSuccessCall.isOnSuccessCalled.set(true) + } + + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { } +} + +/** + * This has a variable to check if `onSuccess` is actually called or not. Currently, this is for + * the test case in PySpark. See SPARK-23942. + */ +object OnSuccessCall { + val isOnSuccessCalled = new AtomicBoolean(false) + + def isCalled(): Boolean = isOnSuccessCalled.get() + + def clear(): Unit = isOnSuccessCalled.set(false) +} From 9857e249c20f842868cb4681ea374b8e316c3ead Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 17 Apr 2018 21:45:20 +0800 Subject: [PATCH 402/566] [SPARK-23835][SQL] Add not-null check to Tuples' arguments deserialization ## What changes were proposed in this pull request? There was no check on nullability for arguments of `Tuple`s. This could lead to have weird behavior when a null value had to be deserialized into a non-nullable Scala object: in those cases, the `null` got silently transformed in a valid value (like `-1` for `Int`), corresponding to the default value we are using in the SQL codebase. This situation was very likely to happen when deserializing to a Tuple of primitive Scala types (like Double, Int, ...). The PR adds the `AssertNotNull` to arguments of tuples which have been asked to be converted to non-nullable types. ## How was this patch tested? added UT Author: Marco Gaido Closes #20976 from mgaido91/SPARK-23835. 
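For a concrete picture of the behavior change, here is a spark-shell sketch built from the description above and the new DatasetSuite test added in this patch (the pre-fix output in the comments is illustrative):

```scala
scala> val ds = Seq[(Option[Int], Option[Int])]((Some(1), None)).toDS()

scala> ds.as[(Int, Int)].collect()
// before this patch: the null in _2 was silently replaced by the Int default value (-1),
// e.g. Array((1,-1)), instead of failing
// after this patch: AssertNotNull is injected for the non-nullable tuple argument and a
// NullPointerException is thrown
```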
(cherry picked from commit 0a9172a05e604a4a94adbb9208c8c02362afca00) Signed-off-by: Wenchen Fan --- .../sql/kafka010/KafkaContinuousSinkSuite.scala | 6 +++--- .../apache/spark/sql/kafka010/KafkaSinkSuite.scala | 2 +- .../spark/sql/catalyst/ScalaReflection.scala | 14 +++++++------- .../spark/sql/catalyst/ScalaReflectionSuite.scala | 12 +++++++++++- .../scala/org/apache/spark/sql/DatasetSuite.scala | 5 +++++ 5 files changed, 27 insertions(+), 12 deletions(-) diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala index fc890a0cfdac3..ddfc0c1a4be2d 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala @@ -79,7 +79,7 @@ class KafkaContinuousSinkSuite extends KafkaContinuousTest { val reader = createKafkaReader(topic) .selectExpr("CAST(key as STRING) key", "CAST(value as STRING) value") .selectExpr("CAST(key as INT) key", "CAST(value as INT) value") - .as[(Int, Int)] + .as[(Option[Int], Int)] .map(_._2) try { @@ -119,7 +119,7 @@ class KafkaContinuousSinkSuite extends KafkaContinuousTest { val reader = createKafkaReader(topic) .selectExpr("CAST(key as STRING) key", "CAST(value as STRING) value") .selectExpr("CAST(key as INT) key", "CAST(value as INT) value") - .as[(Int, Int)] + .as[(Option[Int], Int)] .map(_._2) try { @@ -167,7 +167,7 @@ class KafkaContinuousSinkSuite extends KafkaContinuousTest { val reader = createKafkaReader(topic) .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") .selectExpr("CAST(key AS INT)", "CAST(value AS INT)") - .as[(Int, Int)] + .as[(Option[Int], Int)] .map(_._2) try { diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala index 42f8b4c7657e2..7079ac6453ffc 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala @@ -138,7 +138,7 @@ class KafkaSinkSuite extends StreamTest with SharedSQLContext { val reader = createKafkaReader(topic) .selectExpr("CAST(key as STRING) key", "CAST(value as STRING) value") .selectExpr("CAST(key as INT) key", "CAST(value as INT) value") - .as[(Int, Int)] + .as[(Option[Int], Int)] .map(_._2) try { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 9a4bf0075a178..fabf8955330eb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -382,22 +382,22 @@ object ScalaReflection extends ScalaReflection { val clsName = getClassNameFromType(fieldType) val newTypePath = s"""- field (class: "$clsName", name: "$fieldName")""" +: walkedTypePath // For tuples, we based grab the inner fields by ordinal instead of name. 
- if (cls.getName startsWith "scala.Tuple") { + val constructor = if (cls.getName startsWith "scala.Tuple") { deserializerFor( fieldType, Some(addToPathOrdinal(i, dataType, newTypePath)), newTypePath) } else { - val constructor = deserializerFor( + deserializerFor( fieldType, Some(addToPath(fieldName, dataType, newTypePath)), newTypePath) + } - if (!nullable) { - AssertNotNull(constructor, newTypePath) - } else { - constructor - } + if (!nullable) { + AssertNotNull(constructor, newTypePath) + } else { + constructor } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala index 8c3db48a01f12..353b8344658f2 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala @@ -21,7 +21,7 @@ import java.sql.{Date, Timestamp} import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.expressions.{BoundReference, Literal, SpecificInternalRow, UpCast} +import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, Literal, SpecificInternalRow, UpCast} import org.apache.spark.sql.catalyst.expressions.objects.{AssertNotNull, NewInstance} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -365,4 +365,14 @@ class ScalaReflectionSuite extends SparkFunSuite { StructField("_2", NullType, nullable = true))), nullable = true)) } + + test("SPARK-23835: add null check to non-nullable types in Tuples") { + def numberOfCheckedArguments(deserializer: Expression): Int = { + assert(deserializer.isInstanceOf[NewInstance]) + deserializer.asInstanceOf[NewInstance].arguments.count(_.isInstanceOf[AssertNotNull]) + } + assert(numberOfCheckedArguments(deserializerFor[(Double, Double)]) == 2) + assert(numberOfCheckedArguments(deserializerFor[(java.lang.Double, Int)]) == 1) + assert(numberOfCheckedArguments(deserializerFor[(java.lang.Integer, java.lang.Integer)]) == 0) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 9b745befcb611..e0f4d2ba685e1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -1453,6 +1453,11 @@ class DatasetSuite extends QueryTest with SharedSQLContext { val group2 = cached.groupBy("x").agg(min(col("z")) as "value") checkAnswer(group1.union(group2), Row(4, 5) :: Row(1, 2) :: Row(4, 6) :: Row(1, 3) :: Nil) } + + test("SPARK-23835: null primitive data type should throw NullPointerException") { + val ds = Seq[(Option[Int], Option[Int])]((Some(1), None)).toDS() + intercept[NullPointerException](ds.as[(Int, Int)].collect()) + } } case class TestDataUnion(x: Int, y: Int, z: Int) From 564019b926eddec39d0485217e693a1e6b4b8e14 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Wed, 18 Apr 2018 00:35:44 +0800 Subject: [PATCH 403/566] [SPARK-23986][SQL] freshName can generate non-unique names ## What changes were proposed in this pull request? We are using `CodegenContext.freshName` to get a unique name for any new variable we are adding. Unfortunately, this method currently fails to create a unique name when we request more than one instance of variables with starting name `name1` and an instance with starting name `name11`. 
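The collision can be reproduced in a plain Scala REPL with a simplified copy of the old bookkeeping (adapted from the removed branch below, not a Spark API):

```scala
import scala.collection.mutable

val freshNameIds = mutable.HashMap.empty[String, Int]
def oldFreshName(fullName: String): String =
  if (freshNameIds.contains(fullName)) {
    val id = freshNameIds(fullName)
    freshNameIds(fullName) = id + 1
    s"$fullName$id"                     // appends the id with no separator
  } else {
    freshNameIds += fullName -> 1
    fullName                            // first occurrence keeps the bare name
  }

oldFreshName("name1")    // "name1"
oldFreshName("name1")    // "name11"
oldFreshName("name11")   // "name11"  <- duplicates the previous result
// the new scheme always appends "_<id>": "name1_0", "name1_1", "name11_0"
```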
The PR changes the way a new name is generated by `CodegenContext.freshName` so that we generate unique names in this scenario too. ## How was this patch tested? added UT Author: Marco Gaido Closes #21080 from mgaido91/SPARK-23986. (cherry picked from commit f39e82ce150b6a7ea038e6858ba7adbaba3cad88) Signed-off-by: Wenchen Fan --- .../catalyst/expressions/codegen/CodeGenerator.scala | 11 +++-------- .../catalyst/expressions/CodeGenerationSuite.scala | 10 ++++++++++ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 2631e7ee33951..504e851c5dbc4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -564,14 +564,9 @@ class CodegenContext { } else { s"${freshNamePrefix}_$name" } - if (freshNameIds.contains(fullName)) { - val id = freshNameIds(fullName) - freshNameIds(fullName) = id + 1 - s"$fullName$id" - } else { - freshNameIds += fullName -> 1 - fullName - } + val id = freshNameIds.getOrElse(fullName, 0) + freshNameIds(fullName) = id + 1 + s"${fullName}_$id" } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index e3ce5af5936a0..d0d6318b8dd0f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -486,4 +486,14 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { assert(!ctx.subExprEliminationExprs.contains(ref)) } } + + test("SPARK-23986: freshName can generate duplicated names") { + val ctx = new CodegenContext + val names1 = ctx.freshName("myName1") :: ctx.freshName("myName1") :: + ctx.freshName("myName11") :: Nil + assert(names1.distinct.length == 3) + val names2 = ctx.freshName("a") :: ctx.freshName("a") :: + ctx.freshName("a_1") :: ctx.freshName("a_0") :: Nil + assert(names2.distinct.length == 4) + } } From 6b99d5bc3f3898a0aff30468a623a3f64bb20b62 Mon Sep 17 00:00:00 2001 From: jinxing Date: Tue, 17 Apr 2018 15:53:29 -0500 Subject: [PATCH 404/566] [SPARK-23948] Trigger mapstage's job listener in submitMissingTasks ## What changes were proposed in this pull request? SparkContext submitted a map stage from `submitMapStage` to `DAGScheduler`, `markMapStageJobAsFinished` is called only in (https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala#L933 and https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala#L1314); But think about below scenario: 1. stage0 and stage1 are all `ShuffleMapStage` and stage1 depends on stage0; 2. We submit stage1 by `submitMapStage`; 3. When stage 1 running, `FetchFailed` happened, stage0 and stage1 got resubmitted as stage0_1 and stage1_1; 4. When stage0_1 running, speculated tasks in old stage1 come as succeeded, but stage1 is not inside `runningStages`. So even though all splits(including the speculated tasks) in stage1 succeeded, job listener in stage1 will not be called; 5. stage0_1 finished, stage1_1 starts running. 
When `submitMissingTasks`, there is no missing tasks. But in current code, job listener is not triggered. We should call the job listener for map stage in `5`. ## How was this patch tested? Not added yet. Author: jinxing (cherry picked from commit 3990daaf3b6ca2c5a9f7790030096262efb12cb2) Author: jinxing Closes #21085 from squito/cp. --- .../apache/spark/scheduler/DAGScheduler.scala | 33 ++++++------ .../spark/scheduler/DAGSchedulerSuite.scala | 52 +++++++++++++++++++ 2 files changed, 70 insertions(+), 15 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 8c46a84323392..78b6b34b5d2bb 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -1092,17 +1092,16 @@ class DAGScheduler( // the stage as completed here in case there are no tasks to run markStageAsFinished(stage, None) - val debugString = stage match { + stage match { case stage: ShuffleMapStage => - s"Stage ${stage} is actually done; " + - s"(available: ${stage.isAvailable}," + - s"available outputs: ${stage.numAvailableOutputs}," + - s"partitions: ${stage.numPartitions})" + logDebug(s"Stage ${stage} is actually done; " + + s"(available: ${stage.isAvailable}," + + s"available outputs: ${stage.numAvailableOutputs}," + + s"partitions: ${stage.numPartitions})") + markMapStageJobsAsFinished(stage) case stage : ResultStage => - s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})" + logDebug(s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})") } - logDebug(debugString) - submitWaitingChildStages(stage) } } @@ -1307,13 +1306,7 @@ class DAGScheduler( shuffleStage.findMissingPartitions().mkString(", ")) submitStage(shuffleStage) } else { - // Mark any map-stage jobs waiting on this stage as finished - if (shuffleStage.mapStageJobs.nonEmpty) { - val stats = mapOutputTracker.getStatistics(shuffleStage.shuffleDep) - for (job <- shuffleStage.mapStageJobs) { - markMapStageJobAsFinished(job, stats) - } - } + markMapStageJobsAsFinished(shuffleStage) submitWaitingChildStages(shuffleStage) } } @@ -1433,6 +1426,16 @@ class DAGScheduler( } } + private[scheduler] def markMapStageJobsAsFinished(shuffleStage: ShuffleMapStage): Unit = { + // Mark any map-stage jobs waiting on this stage as finished + if (shuffleStage.isAvailable && shuffleStage.mapStageJobs.nonEmpty) { + val stats = mapOutputTracker.getStatistics(shuffleStage.shuffleDep) + for (job <- shuffleStage.mapStageJobs) { + markMapStageJobAsFinished(job, stats) + } + } + } + /** * Responds to an executor being lost. This is called inside the event loop, so it assumes it can * modify the scheduler's internal state. Use executorLost() to post a loss event from outside. 
diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index d812b5bd92c1b..8b6ec37625eec 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -2146,6 +2146,58 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi assertDataStructuresEmpty() } + test("Trigger mapstage's job listener in submitMissingTasks") { + val rdd1 = new MyRDD(sc, 2, Nil) + val dep1 = new ShuffleDependency(rdd1, new HashPartitioner(2)) + val rdd2 = new MyRDD(sc, 2, List(dep1), tracker = mapOutputTracker) + val dep2 = new ShuffleDependency(rdd2, new HashPartitioner(2)) + + val listener1 = new SimpleListener + val listener2 = new SimpleListener + + submitMapStage(dep1, listener1) + submitMapStage(dep2, listener2) + + // Complete the stage0. + assert(taskSets(0).stageId === 0) + complete(taskSets(0), Seq( + (Success, makeMapStatus("hostA", rdd1.partitions.length)), + (Success, makeMapStatus("hostB", rdd1.partitions.length)))) + assert(mapOutputTracker.getMapSizesByExecutorId(dep1.shuffleId, 0).map(_._1).toSet === + HashSet(makeBlockManagerId("hostA"), makeBlockManagerId("hostB"))) + assert(listener1.results.size === 1) + + // When attempting stage1, trigger a fetch failure. + assert(taskSets(1).stageId === 1) + complete(taskSets(1), Seq( + (Success, makeMapStatus("hostC", rdd2.partitions.length)), + (FetchFailed(makeBlockManagerId("hostA"), dep1.shuffleId, 0, 0, "ignored"), null))) + scheduler.resubmitFailedStages() + // Stage1 listener should not have a result yet + assert(listener2.results.size === 0) + + // Speculative task succeeded in stage1. + runEvent(makeCompletionEvent( + taskSets(1).tasks(1), + Success, + makeMapStatus("hostD", rdd2.partitions.length))) + // stage1 listener still should not have a result, though there's no missing partitions + // in it. Because stage1 has been failed and is not inside `runningStages` at this moment. + assert(listener2.results.size === 0) + + // Stage0 should now be running as task set 2; make its task succeed + assert(taskSets(2).stageId === 0) + complete(taskSets(2), Seq( + (Success, makeMapStatus("hostC", rdd2.partitions.length)))) + assert(mapOutputTracker.getMapSizesByExecutorId(dep1.shuffleId, 0).map(_._1).toSet === + Set(makeBlockManagerId("hostC"), makeBlockManagerId("hostB"))) + + // After stage0 is finished, stage1 will be submitted and found there is no missing + // partitions in it. Then listener got triggered. + assert(listener2.results.size === 1) + assertDataStructuresEmpty() + } + /** * In this test, we run a map stage where one of the executors fails but we still receive a * "zombie" complete message from that executor. We want to make sure the stage is not reported From a1c56b66970a683e458e3f44fd6788110e869093 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Wed, 18 Apr 2018 08:22:05 -0700 Subject: [PATCH 405/566] [SPARK-24007][SQL] EqualNullSafe for FloatType and DoubleType might generate a wrong result by codegen. ## What changes were proposed in this pull request? `EqualNullSafe` for `FloatType` and `DoubleType` might generate a wrong result by codegen. 
```scala scala> val df = Seq((Some(-1.0d), None), (None, Some(-1.0d))).toDF() df: org.apache.spark.sql.DataFrame = [_1: double, _2: double] scala> df.show() +----+----+ | _1| _2| +----+----+ |-1.0|null| |null|-1.0| +----+----+ scala> df.filter("_1 <=> _2").show() +----+----+ | _1| _2| +----+----+ |-1.0|null| |null|-1.0| +----+----+ ``` The result should be empty but the result remains two rows. ## How was this patch tested? Added a test. Author: Takuya UESHIN Closes #21094 from ueshin/issues/SPARK-24007/equalnullsafe. (cherry picked from commit f09a9e9418c1697d198de18f340b1288f5eb025c) Signed-off-by: gatorsmile --- .../sql/catalyst/expressions/codegen/CodeGenerator.scala | 6 ++++-- .../spark/sql/catalyst/expressions/PredicateSuite.scala | 7 +++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 504e851c5dbc4..9cf5839ff1c91 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -764,8 +764,10 @@ class CodegenContext { */ def genEqual(dataType: DataType, c1: String, c2: String): String = dataType match { case BinaryType => s"java.util.Arrays.equals($c1, $c2)" - case FloatType => s"(java.lang.Float.isNaN($c1) && java.lang.Float.isNaN($c2)) || $c1 == $c2" - case DoubleType => s"(java.lang.Double.isNaN($c1) && java.lang.Double.isNaN($c2)) || $c1 == $c2" + case FloatType => + s"((java.lang.Float.isNaN($c1) && java.lang.Float.isNaN($c2)) || $c1 == $c2)" + case DoubleType => + s"((java.lang.Double.isNaN($c1) && java.lang.Double.isNaN($c2)) || $c1 == $c2)" case dt: DataType if isPrimitiveType(dt) => s"$c1 == $c2" case dt: DataType if dt.isInstanceOf[AtomicType] => s"$c1.equals($c2)" case array: ArrayType => genComp(array, c1, c2) + " == 0" diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala index 8a8f8e10225fa..1bfd180ae4393 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala @@ -442,4 +442,11 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { InSet(Literal(1), Set(1, 2, 3, 4)).genCode(ctx) assert(ctx.inlinedMutableStates.isEmpty) } + + test("SPARK-24007: EqualNullSafe for FloatType and DoubleType might generate a wrong result") { + checkEvaluation(EqualNullSafe(Literal(null, FloatType), Literal(-1.0f)), false) + checkEvaluation(EqualNullSafe(Literal(-1.0f), Literal(null, FloatType)), false) + checkEvaluation(EqualNullSafe(Literal(null, DoubleType), Literal(-1.0d)), false) + checkEvaluation(EqualNullSafe(Literal(-1.0d), Literal(null, DoubleType)), false) + } } From 5bcb7bdccf967ff5ad3d8c76f4ad8c9c4031e7c2 Mon Sep 17 00:00:00 2001 From: Bruce Robbins Date: Fri, 13 Apr 2018 14:05:04 -0700 Subject: [PATCH 406/566] [SPARK-23963][SQL] Properly handle large number of columns in query on text-based Hive table ## What changes were proposed in this pull request? TableReader would get disproportionately slower as the number of columns in the query increased. 
I fixed the way TableReader was looking up metadata for each column in the row. Previously, it had been looking up this data in linked lists, accessing each linked list by an index (column number). Now it looks up this data in arrays, where indexing by column number works better. ## How was this patch tested? Manual testing All sbt unit tests python sql tests Author: Bruce Robbins Closes #21043 from bersprockets/tabreadfix. --- .../src/main/scala/org/apache/spark/sql/hive/TableReader.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala index cc8907a0bbc93..b5444a4217924 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala @@ -381,7 +381,7 @@ private[hive] object HadoopTableReader extends HiveInspectors with Logging { val (fieldRefs, fieldOrdinals) = nonPartitionKeyAttrs.map { case (attr, ordinal) => soi.getStructFieldRef(attr.name) -> ordinal - }.unzip + }.toArray.unzip /** * Builds specific unwrappers ahead of time according to object inspector From 130641102ceecf2a795d7f0dc6412c7e56eb03a8 Mon Sep 17 00:00:00 2001 From: Gabor Somogyi Date: Wed, 18 Apr 2018 16:37:41 -0700 Subject: [PATCH 407/566] [SPARK-23775][TEST] Make DataFrameRangeSuite not flaky ## What changes were proposed in this pull request? DataFrameRangeSuite.test("Cancelling stage in a query with Range.") stays sometimes in an infinite loop and times out the build. There were multiple issues with the test: 1. The first valid stageId is zero when the test started alone and not in a suite and the following code waits until timeout: ``` eventually(timeout(10.seconds), interval(1.millis)) { assert(DataFrameRangeSuite.stageToKill > 0) } ``` 2. The `DataFrameRangeSuite.stageToKill` was overwritten by the task's thread after the reset which ended up in canceling the same stage 2 times. This caused the infinite wait. This PR solves this mentioned flakyness by removing the shared `DataFrameRangeSuite.stageToKill` and using `wait` and `CountDownLatch` for synhronization. ## How was this patch tested? Existing unit test. Author: Gabor Somogyi Closes #20888 from gaborgsomogyi/SPARK-23775. 
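A minimal, hypothetical sketch of the synchronization pattern the fix moves to (names are illustrative; the real test registers a SparkListener and cancels the stage it observes inside the callback):

```scala
import java.util.concurrent.{CountDownLatch, TimeUnit}

val latch = new CountDownLatch(1)
@volatile var observedStageId = -1

// stand-in for the SparkListener.onTaskStart callback
def onTaskStart(stageId: Int): Unit = {
  observedStageId = stageId        // act on the stage that was actually observed
  latch.countDown()                // signal the test thread
}

onTaskStart(0)                                 // stage id 0 is perfectly valid
assert(latch.await(20, TimeUnit.SECONDS))      // deterministic wait instead of polling a shared var
assert(observedStageId == 0)
```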
(cherry picked from commit 0c94e48bc50717e1627c0d2acd5382d9adc73c97) Signed-off-by: Marcelo Vanzin --- .../spark/sql/DataFrameRangeSuite.scala | 78 +++++++++++-------- 1 file changed, 45 insertions(+), 33 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala index 57a930dfaf320..a0fd74088ce8b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala @@ -17,14 +17,16 @@ package org.apache.spark.sql +import java.util.concurrent.{CountDownLatch, TimeUnit} + import scala.concurrent.duration._ import scala.math.abs import scala.util.Random import org.scalatest.concurrent.Eventually -import org.apache.spark.{SparkException, TaskContext} -import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} +import org.apache.spark.{SparkContext, SparkException} +import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext @@ -152,39 +154,53 @@ class DataFrameRangeSuite extends QueryTest with SharedSQLContext with Eventuall } test("Cancelling stage in a query with Range.") { - val listener = new SparkListener { - override def onJobStart(jobStart: SparkListenerJobStart): Unit = { - eventually(timeout(10.seconds), interval(1.millis)) { - assert(DataFrameRangeSuite.stageToKill > 0) + // Save and restore the value because SparkContext is shared + val savedInterruptOnCancel = sparkContext + .getLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL) + + try { + sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "true") + + for (codegen <- Seq(true, false)) { + // This countdown latch used to make sure with all the stages cancelStage called in listener + val latch = new CountDownLatch(2) + + val listener = new SparkListener { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { + sparkContext.cancelStage(taskStart.stageId) + latch.countDown() + } } - sparkContext.cancelStage(DataFrameRangeSuite.stageToKill) - } - } - sparkContext.addSparkListener(listener) - for (codegen <- Seq(true, false)) { - withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> codegen.toString()) { - DataFrameRangeSuite.stageToKill = -1 - val ex = intercept[SparkException] { - spark.range(0, 100000000000L, 1, 1).map { x => - DataFrameRangeSuite.stageToKill = TaskContext.get().stageId() - x - }.toDF("id").agg(sum("id")).collect() + sparkContext.addSparkListener(listener) + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> codegen.toString()) { + val ex = intercept[SparkException] { + sparkContext.range(0, 10000L, numSlices = 10).mapPartitions { x => + x.synchronized { + x.wait() + } + x + }.toDF("id").agg(sum("id")).collect() + } + ex.getCause() match { + case null => + assert(ex.getMessage().contains("cancelled")) + case cause: SparkException => + assert(cause.getMessage().contains("cancelled")) + case cause: Throwable => + fail("Expected the cause to be SparkException, got " + cause.toString() + " instead.") + } } - ex.getCause() match { - case null => - assert(ex.getMessage().contains("cancelled")) - case cause: SparkException => - assert(cause.getMessage().contains("cancelled")) - case cause: Throwable => - fail("Expected the cause to be SparkException, got " + cause.toString() + " instead.") + latch.await(20, 
TimeUnit.SECONDS) + eventually(timeout(20.seconds)) { + assert(sparkContext.statusTracker.getExecutorInfos.map(_.numRunningTasks()).sum == 0) } + sparkContext.removeSparkListener(listener) } - eventually(timeout(20.seconds)) { - assert(sparkContext.statusTracker.getExecutorInfos.map(_.numRunningTasks()).sum == 0) - } + } finally { + sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, + savedInterruptOnCancel) } - sparkContext.removeSparkListener(listener) } test("SPARK-20430 Initialize Range parameters in a driver side") { @@ -204,7 +220,3 @@ class DataFrameRangeSuite extends QueryTest with SharedSQLContext with Eventuall } } } - -object DataFrameRangeSuite { - @volatile var stageToKill = -1 -} From 32bec6ca3d9e47587c84f928d4166475fe29f596 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 19 Apr 2018 10:00:57 +0800 Subject: [PATCH 408/566] [SPARK-24014][PYSPARK] Add onStreamingStarted method to StreamingListener ## What changes were proposed in this pull request? The `StreamingListener` in PySpark side seems to be lack of `onStreamingStarted` method. This patch adds it and a test for it. This patch also includes a trivial doc improvement for `createDirectStream`. Original PR is #21057. ## How was this patch tested? Added test. Author: Liang-Chi Hsieh Closes #21098 from viirya/SPARK-24014. (cherry picked from commit 8bb0df2c65355dfdcd28e362ff661c6c7ebc99c0) Signed-off-by: jerryshao --- python/pyspark/streaming/kafka.py | 3 ++- python/pyspark/streaming/listener.py | 6 ++++++ python/pyspark/streaming/tests.py | 7 +++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/python/pyspark/streaming/kafka.py b/python/pyspark/streaming/kafka.py index fdb9308604489..ed2e0e7d10fa2 100644 --- a/python/pyspark/streaming/kafka.py +++ b/python/pyspark/streaming/kafka.py @@ -104,7 +104,8 @@ def createDirectStream(ssc, topics, kafkaParams, fromOffsets=None, :param topics: list of topic_name to consume. :param kafkaParams: Additional params for Kafka. :param fromOffsets: Per-topic/partition Kafka offsets defining the (inclusive) starting - point of the stream. + point of the stream (a dictionary mapping `TopicAndPartition` to + integers). :param keyDecoder: A function used to decode key (default is utf8_decoder). :param valueDecoder: A function used to decode value (default is utf8_decoder). :param messageHandler: A function used to convert KafkaMessageAndMetadata. You can assess diff --git a/python/pyspark/streaming/listener.py b/python/pyspark/streaming/listener.py index b830797f5c0a0..d4ecc215aea99 100644 --- a/python/pyspark/streaming/listener.py +++ b/python/pyspark/streaming/listener.py @@ -23,6 +23,12 @@ class StreamingListener(object): def __init__(self): pass + def onStreamingStarted(self, streamingStarted): + """ + Called when the streaming has been started. 
+ """ + pass + def onReceiverStarted(self, receiverStarted): """ Called when a receiver has been started diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index ca28c9bf3bee3..1ec418a7cea91 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -507,6 +507,10 @@ def __init__(self): self.batchInfosCompleted = [] self.batchInfosStarted = [] self.batchInfosSubmitted = [] + self.streamingStartedTime = [] + + def onStreamingStarted(self, streamingStarted): + self.streamingStartedTime.append(streamingStarted.time) def onBatchSubmitted(self, batchSubmitted): self.batchInfosSubmitted.append(batchSubmitted.batchInfo()) @@ -530,9 +534,12 @@ def func(dstream): batchInfosSubmitted = batch_collector.batchInfosSubmitted batchInfosStarted = batch_collector.batchInfosStarted batchInfosCompleted = batch_collector.batchInfosCompleted + streamingStartedTime = batch_collector.streamingStartedTime self.wait_for(batchInfosCompleted, 4) + self.assertEqual(len(streamingStartedTime), 1) + self.assertGreaterEqual(len(batchInfosSubmitted), 4) for info in batchInfosSubmitted: self.assertGreaterEqual(info.batchTime().milliseconds(), 0) From 7fb11176f285b4de47e61511c09acbbb79e5c44c Mon Sep 17 00:00:00 2001 From: wuyi Date: Thu, 19 Apr 2018 09:00:33 -0500 Subject: [PATCH 409/566] [SPARK-24021][CORE] fix bug in BlacklistTracker's updateBlacklistForFetchFailure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? There‘s a miswrite in BlacklistTracker's updateBlacklistForFetchFailure: ``` val blacklistedExecsOnNode = nodeToBlacklistedExecs.getOrElseUpdate(exec, HashSet[String]()) blacklistedExecsOnNode += exec ``` where first **exec** should be **host**. ## How was this patch tested? adjust existed test. Author: wuyi Closes #21104 from Ngone51/SPARK-24021. 
(cherry picked from commit 0deaa5251326a32a3d2d2b8851193ca926303972) Signed-off-by: Imran Rashid --- .../scala/org/apache/spark/scheduler/BlacklistTracker.scala | 2 +- .../org/apache/spark/scheduler/BlacklistTrackerSuite.scala | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index 952598f6de19d..30cf75d43ee09 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -210,7 +210,7 @@ private[scheduler] class BlacklistTracker ( updateNextExpiryTime() killBlacklistedExecutor(exec) - val blacklistedExecsOnNode = nodeToBlacklistedExecs.getOrElseUpdate(exec, HashSet[String]()) + val blacklistedExecsOnNode = nodeToBlacklistedExecs.getOrElseUpdate(host, HashSet[String]()) blacklistedExecsOnNode += exec } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala index 88a57c599e240..00867ef1308a2 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala @@ -574,6 +574,9 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M verify(allocationClientMock, never).killExecutors(any(), any(), any(), any()) verify(allocationClientMock, never).killExecutorsOnHost(any()) + assert(blacklist.nodeToBlacklistedExecs.contains("hostA")) + assert(blacklist.nodeToBlacklistedExecs("hostA").contains("1")) + // Enable auto-kill. Blacklist an executor and make sure killExecutors is called. conf.set(config.BLACKLIST_KILL_ENABLED, true) blacklist = new BlacklistTracker(listenerBusMock, conf, Some(allocationClientMock), clock) @@ -589,6 +592,8 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M 1000 + blacklist.BLACKLIST_TIMEOUT_MILLIS) assert(blacklist.nextExpiryTime === 1000 + blacklist.BLACKLIST_TIMEOUT_MILLIS) assert(blacklist.nodeIdToBlacklistExpiryTime.isEmpty) + assert(blacklist.nodeToBlacklistedExecs.contains("hostA")) + assert(blacklist.nodeToBlacklistedExecs("hostA").contains("1")) // Enable external shuffle service to see if all the executors on this node will be killed. conf.set(config.SHUFFLE_SERVICE_ENABLED, true) From fb968215ca014c5cf40097a3c4588bbee11e2c02 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 19 Apr 2018 17:54:53 +0200 Subject: [PATCH 410/566] [SPARK-23989][SQL] exchange should copy data before non-serialized shuffle ## What changes were proposed in this pull request? In Spark SQL, we usually reuse the `UnsafeRow` instance and need to copy the data when a place buffers non-serialized objects. Shuffle may buffer objects if we don't make it to the bypass merge shuffle or unsafe shuffle. `ShuffleExchangeExec.needToCopyObjectsBeforeShuffle` misses the case that, if `spark.sql.shuffle.partitions` is large enough, we could fail to run unsafe shuffle and go with the non-serialized shuffle. This bug is very hard to hit since users wouldn't set such a large number of partitions(16 million) for Spark SQL exchange. TODO: test ## How was this patch tested? todo. Author: Wenchen Fan Closes #21101 from cloud-fan/shuffle. 
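The hazard being guarded against is the usual reuse problem; a self-contained sketch with a stand-in for `UnsafeRow` (plain Scala, not Spark code) shows what happens when a buffering shuffle does not copy:

```scala
import scala.collection.mutable.ArrayBuffer

// stand-in for a reused, mutable row such as UnsafeRow
final class ReusedRow(var value: Int) {
  def copy(): ReusedRow = new ReusedRow(value)
}

val row = new ReusedRow(0)

val buffered = ArrayBuffer.empty[ReusedRow]
(1 to 3).foreach { v => row.value = v; buffered += row }        // buffered without copying
buffered.map(_.value)                                           // ArrayBuffer(3, 3, 3): every entry aliases one object

val copied = ArrayBuffer.empty[ReusedRow]
(1 to 3).foreach { v => row.value = v; copied += row.copy() }   // copy before buffering, as the exchange must do
copied.map(_.value)                                             // ArrayBuffer(1, 2, 3)
```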
(cherry picked from commit 6e19f7683fc73fabe7cdaac4eb1982d2e3e607b7) Signed-off-by: Herman van Hovell --- .../exchange/ShuffleExchangeExec.scala | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala index 4d95ee34f30de..b89203719541b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala @@ -153,12 +153,9 @@ object ShuffleExchangeExec { * See SPARK-2967, SPARK-4479, and SPARK-7375 for more discussion of this issue. * * @param partitioner the partitioner for the shuffle - * @param serializer the serializer that will be used to write rows * @return true if rows should be copied before being shuffled, false otherwise */ - private def needToCopyObjectsBeforeShuffle( - partitioner: Partitioner, - serializer: Serializer): Boolean = { + private def needToCopyObjectsBeforeShuffle(partitioner: Partitioner): Boolean = { // Note: even though we only use the partitioner's `numPartitions` field, we require it to be // passed instead of directly passing the number of partitions in order to guard against // corner-cases where a partitioner constructed with `numPartitions` partitions may output @@ -167,22 +164,24 @@ object ShuffleExchangeExec { val shuffleManager = SparkEnv.get.shuffleManager val sortBasedShuffleOn = shuffleManager.isInstanceOf[SortShuffleManager] val bypassMergeThreshold = conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200) + val numParts = partitioner.numPartitions if (sortBasedShuffleOn) { - val bypassIsSupported = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager] - if (bypassIsSupported && partitioner.numPartitions <= bypassMergeThreshold) { + if (numParts <= bypassMergeThreshold) { // If we're using the original SortShuffleManager and the number of output partitions is // sufficiently small, then Spark will fall back to the hash-based shuffle write path, which // doesn't buffer deserialized records. // Note that we'll have to remove this case if we fix SPARK-6026 and remove this bypass. false - } else if (serializer.supportsRelocationOfSerializedObjects) { + } else if (numParts <= SortShuffleManager.MAX_SHUFFLE_OUTPUT_PARTITIONS_FOR_SERIALIZED_MODE) { // SPARK-4550 and SPARK-7081 extended sort-based shuffle to serialize individual records // prior to sorting them. This optimization is only applied in cases where shuffle // dependency does not specify an aggregator or ordering and the record serializer has - // certain properties. If this optimization is enabled, we can safely avoid the copy. + // certain properties and the number of partitions doesn't exceed the limitation. If this + // optimization is enabled, we can safely avoid the copy. // - // Exchange never configures its ShuffledRDDs with aggregators or key orderings, so we only - // need to check whether the optimization is enabled and supported by our serializer. + // Exchange never configures its ShuffledRDDs with aggregators or key orderings, and the + // serializer in Spark SQL always satisfy the properties, so we only need to check whether + // the number of partitions exceeds the limitation. 
false } else { // Spark's SortShuffleManager uses `ExternalSorter` to buffer records in memory, so we must @@ -298,7 +297,7 @@ object ShuffleExchangeExec { rdd } - if (needToCopyObjectsBeforeShuffle(part, serializer)) { + if (needToCopyObjectsBeforeShuffle(part)) { newRdd.mapPartitionsInternal { iter => val getPartitionKey = getPartitionKeyExtractor() iter.map { row => (part.getPartition(getPartitionKey(row)), row.copy()) } From be184d16e86f96a748d6bf1642c1c319d2a09f5c Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Thu, 19 Apr 2018 09:48:34 -0700 Subject: [PATCH 411/566] [SPARK-23340][SQL][BRANCH-2.3] Upgrade Apache ORC to 1.4.3 ## What changes were proposed in this pull request? This PR updates Apache ORC dependencies to 1.4.3 released on February 9th. Apache ORC 1.4.2 release removes unnecessary dependencies and 1.4.3 has 5 more patches (https://s.apache.org/Fll8). Especially, the following ORC-285 is fixed at 1.4.3. ```scala scala> val df = Seq(Array.empty[Float]).toDF() scala> df.write.format("orc").save("/tmp/floatarray") scala> spark.read.orc("/tmp/floatarray") res1: org.apache.spark.sql.DataFrame = [value: array] scala> spark.read.orc("/tmp/floatarray").show() 18/02/12 22:09:10 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) java.io.IOException: Error reading file: file:/tmp/floatarray/part-00000-9c0b461b-4df1-4c23-aac1-3e4f349ac7d6-c000.snappy.orc at org.apache.orc.impl.RecordReaderImpl.nextBatch(RecordReaderImpl.java:1191) at org.apache.orc.mapreduce.OrcMapreduceRecordReader.ensureBatch(OrcMapreduceRecordReader.java:78) ... Caused by: java.io.EOFException: Read past EOF for compressed stream Stream for column 2 kind DATA position: 0 length: 0 range: 0 offset: 0 limit: 0 ``` ## How was this patch tested? Pass the Jenkins test. Author: Dongjoon Hyun Closes #21093 from dongjoon-hyun/SPARK-23340-2. 
--- dev/deps/spark-deps-hadoop-2.6 | 4 ++-- dev/deps/spark-deps-hadoop-2.7 | 4 ++-- pom.xml | 6 +----- .../sql/execution/datasources/orc/OrcSourceSuite.scala | 9 +++++++++ .../apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala | 10 ++++++++++ 5 files changed, 24 insertions(+), 9 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 3b2a88537ca50..3ded7aea11ebc 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -156,8 +156,8 @@ objenesis-2.1.jar okhttp-3.8.1.jar okio-1.13.0.jar opencsv-2.3.jar -orc-core-1.4.1-nohive.jar -orc-mapreduce-1.4.1-nohive.jar +orc-core-1.4.3-nohive.jar +orc-mapreduce-1.4.3-nohive.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 757da00788e54..344f22d2360e5 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -157,8 +157,8 @@ objenesis-2.1.jar okhttp-3.8.1.jar okio-1.13.0.jar opencsv-2.3.jar -orc-core-1.4.1-nohive.jar -orc-mapreduce-1.4.1-nohive.jar +orc-core-1.4.3-nohive.jar +orc-mapreduce-1.4.3-nohive.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar diff --git a/pom.xml b/pom.xml index 651ce7104c59f..9c2d931cab91e 100644 --- a/pom.xml +++ b/pom.xml @@ -130,7 +130,7 @@ 1.2.1 10.12.1.1 1.8.2 - 1.4.1 + 1.4.3 nohive 1.6.0 9.3.20.v20170531 @@ -1739,10 +1739,6 @@ org.apache.hive hive-storage-api - - io.airlift - slice - diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala index 6f5f2fd795f74..523f7cf77e103 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala @@ -160,6 +160,15 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll { } } } + + test("SPARK-23340 Empty float/double array columns raise EOFException") { + Seq(Seq(Array.empty[Float]).toDF(), Seq(Array.empty[Double]).toDF()).foreach { df => + withTempPath { path => + df.write.format("orc").save(path.getCanonicalPath) + checkAnswer(spark.read.orc(path.getCanonicalPath), df) + } + } + } } class OrcSourceSuite extends OrcSuite with SharedSQLContext { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala index 92b2f069cacd6..597b0f56a55e4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala @@ -208,4 +208,14 @@ class HiveOrcQuerySuite extends OrcQueryTest with TestHiveSingleton { } } } + + test("SPARK-23340 Empty float/double array columns raise EOFException") { + withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "false") { + withTable("spark_23340") { + sql("CREATE TABLE spark_23340(a array, b array) STORED AS ORC") + sql("INSERT INTO spark_23340 VALUES (array(), array())") + checkAnswer(spark.table("spark_23340"), Seq(Row(Array.empty[Float], Array.empty[Double]))) + } + } + } } From 9b562d6fef765cb8357dbc31390e60b5947a9069 Mon Sep 17 00:00:00 2001 From: Gabor Somogyi Date: Thu, 19 Apr 2018 15:06:27 -0700 Subject: [PATCH 412/566] [SPARK-24022][TEST] Make SparkContextSuite not flaky ## What changes were proposed in this pull request? 
SparkContextSuite.test("Cancelling stages/jobs with custom reasons.") could stay in an infinite loop because of the problem found and fixed in [SPARK-23775](https://issues.apache.org/jira/browse/SPARK-23775). This PR solves this mentioned flakyness by removing shared variable usages when cancel happens in a loop and using wait and CountDownLatch for synhronization. ## How was this patch tested? Existing unit test. Author: Gabor Somogyi Closes #21105 from gaborgsomogyi/SPARK-24022. (cherry picked from commit e55953b0bf2a80b34127ba123417ee54955a6064) Signed-off-by: Marcelo Vanzin --- .../org/apache/spark/SparkContextSuite.scala | 61 ++++++++----------- 1 file changed, 26 insertions(+), 35 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala index b30bd74812b36..ce9f2be1c02dd 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark import java.io.File import java.net.{MalformedURLException, URI} import java.nio.charset.StandardCharsets -import java.util.concurrent.{Semaphore, TimeUnit} +import java.util.concurrent.{CountDownLatch, Semaphore, TimeUnit} import scala.concurrent.duration._ @@ -498,45 +498,36 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu test("Cancelling stages/jobs with custom reasons.") { sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local")) + sc.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "true") val REASON = "You shall not pass" - val slices = 10 - val listener = new SparkListener { - override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { - if (SparkContextSuite.cancelStage) { - eventually(timeout(10.seconds)) { - assert(SparkContextSuite.isTaskStarted) + for (cancelWhat <- Seq("stage", "job")) { + // This countdown latch used to make sure stage or job canceled in listener + val latch = new CountDownLatch(1) + + val listener = cancelWhat match { + case "stage" => + new SparkListener { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { + sc.cancelStage(taskStart.stageId, REASON) + latch.countDown() + } } - sc.cancelStage(taskStart.stageId, REASON) - SparkContextSuite.cancelStage = false - SparkContextSuite.semaphore.release(slices) - } - } - - override def onJobStart(jobStart: SparkListenerJobStart): Unit = { - if (SparkContextSuite.cancelJob) { - eventually(timeout(10.seconds)) { - assert(SparkContextSuite.isTaskStarted) + case "job" => + new SparkListener { + override def onJobStart(jobStart: SparkListenerJobStart): Unit = { + sc.cancelJob(jobStart.jobId, REASON) + latch.countDown() + } } - sc.cancelJob(jobStart.jobId, REASON) - SparkContextSuite.cancelJob = false - SparkContextSuite.semaphore.release(slices) - } } - } - sc.addSparkListener(listener) - - for (cancelWhat <- Seq("stage", "job")) { - SparkContextSuite.semaphore.drainPermits() - SparkContextSuite.isTaskStarted = false - SparkContextSuite.cancelStage = (cancelWhat == "stage") - SparkContextSuite.cancelJob = (cancelWhat == "job") + sc.addSparkListener(listener) val ex = intercept[SparkException] { - sc.range(0, 10000L, numSlices = slices).mapPartitions { x => - SparkContextSuite.isTaskStarted = true - // Block waiting for the listener to cancel the stage or job. 
- SparkContextSuite.semaphore.acquire() + sc.range(0, 10000L, numSlices = 10).mapPartitions { x => + x.synchronized { + x.wait() + } x }.count() } @@ -550,9 +541,11 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu fail("Expected the cause to be SparkException, got " + cause.toString() + " instead.") } + latch.await(20, TimeUnit.SECONDS) eventually(timeout(20.seconds)) { assert(sc.statusTracker.getExecutorInfos.map(_.numRunningTasks()).sum == 0) } + sc.removeSparkListener(listener) } } @@ -637,8 +630,6 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu } object SparkContextSuite { - @volatile var cancelJob = false - @volatile var cancelStage = false @volatile var isTaskStarted = false @volatile var taskKilled = false @volatile var taskSucceeded = false From 8eb64a5e222137578ab14b6c91f08a24bd32a3e4 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 20 Apr 2018 10:22:14 -0700 Subject: [PATCH 413/566] Revert "[SPARK-23775][TEST] Make DataFrameRangeSuite not flaky" This reverts commit 130641102ceecf2a795d7f0dc6412c7e56eb03a8. --- .../spark/sql/DataFrameRangeSuite.scala | 78 ++++++++----------- 1 file changed, 33 insertions(+), 45 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala index a0fd74088ce8b..57a930dfaf320 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala @@ -17,16 +17,14 @@ package org.apache.spark.sql -import java.util.concurrent.{CountDownLatch, TimeUnit} - import scala.concurrent.duration._ import scala.math.abs import scala.util.Random import org.scalatest.concurrent.Eventually -import org.apache.spark.{SparkContext, SparkException} -import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart} +import org.apache.spark.{SparkException, TaskContext} +import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext @@ -154,53 +152,39 @@ class DataFrameRangeSuite extends QueryTest with SharedSQLContext with Eventuall } test("Cancelling stage in a query with Range.") { - // Save and restore the value because SparkContext is shared - val savedInterruptOnCancel = sparkContext - .getLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL) - - try { - sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "true") - - for (codegen <- Seq(true, false)) { - // This countdown latch used to make sure with all the stages cancelStage called in listener - val latch = new CountDownLatch(2) - - val listener = new SparkListener { - override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { - sparkContext.cancelStage(taskStart.stageId) - latch.countDown() - } + val listener = new SparkListener { + override def onJobStart(jobStart: SparkListenerJobStart): Unit = { + eventually(timeout(10.seconds), interval(1.millis)) { + assert(DataFrameRangeSuite.stageToKill > 0) } + sparkContext.cancelStage(DataFrameRangeSuite.stageToKill) + } + } - sparkContext.addSparkListener(listener) - withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> codegen.toString()) { - val ex = intercept[SparkException] { - sparkContext.range(0, 10000L, numSlices = 10).mapPartitions { x => - x.synchronized { - x.wait() - } - x - 
}.toDF("id").agg(sum("id")).collect() - } - ex.getCause() match { - case null => - assert(ex.getMessage().contains("cancelled")) - case cause: SparkException => - assert(cause.getMessage().contains("cancelled")) - case cause: Throwable => - fail("Expected the cause to be SparkException, got " + cause.toString() + " instead.") - } + sparkContext.addSparkListener(listener) + for (codegen <- Seq(true, false)) { + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> codegen.toString()) { + DataFrameRangeSuite.stageToKill = -1 + val ex = intercept[SparkException] { + spark.range(0, 100000000000L, 1, 1).map { x => + DataFrameRangeSuite.stageToKill = TaskContext.get().stageId() + x + }.toDF("id").agg(sum("id")).collect() } - latch.await(20, TimeUnit.SECONDS) - eventually(timeout(20.seconds)) { - assert(sparkContext.statusTracker.getExecutorInfos.map(_.numRunningTasks()).sum == 0) + ex.getCause() match { + case null => + assert(ex.getMessage().contains("cancelled")) + case cause: SparkException => + assert(cause.getMessage().contains("cancelled")) + case cause: Throwable => + fail("Expected the cause to be SparkException, got " + cause.toString() + " instead.") } - sparkContext.removeSparkListener(listener) } - } finally { - sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, - savedInterruptOnCancel) + eventually(timeout(20.seconds)) { + assert(sparkContext.statusTracker.getExecutorInfos.map(_.numRunningTasks()).sum == 0) + } } + sparkContext.removeSparkListener(listener) } test("SPARK-20430 Initialize Range parameters in a driver side") { @@ -220,3 +204,7 @@ class DataFrameRangeSuite extends QueryTest with SharedSQLContext with Eventuall } } } + +object DataFrameRangeSuite { + @volatile var stageToKill = -1 +} From d91410029331ff475843ea660b5f98ba15f23892 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Sat, 21 Apr 2018 10:45:12 -0700 Subject: [PATCH 414/566] [SPARK-24033][SQL] Fix Mismatched of Window Frame specifiedwindowframe(RowFrame, -1, -1) ## What changes were proposed in this pull request? When the OffsetWindowFunction's frame is `UnaryMinus(Literal(1))` but the specified window frame has been simplified to `Literal(-1)` by some optimizer rules e.g., `ConstantFolding`. Thus, they do not match and cause the following error: ``` org.apache.spark.sql.AnalysisException: Window Frame specifiedwindowframe(RowFrame, -1, -1) must match the required frame specifiedwindowframe(RowFrame, -1, -1); at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:41) at org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:91) at ``` ## How was this patch tested? Added a test Author: gatorsmile Closes #21115 from gatorsmile/fixLag. 
(cherry picked from commit 7bc853d08973a6bd839ad2222911eb0a0f413677) Signed-off-by: gatorsmile --- .../catalyst/expressions/windowExpressions.scala | 5 ++++- .../spark/sql/DataFrameWindowFramesSuite.scala | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala index dd13d9a3bba51..f2e23473af66e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala @@ -363,7 +363,10 @@ abstract class OffsetWindowFunction override lazy val frame: WindowFrame = { val boundary = direction match { case Ascending => offset - case Descending => UnaryMinus(offset) + case Descending => UnaryMinus(offset) match { + case e: Expression if e.foldable => Literal.create(e.eval(EmptyRow), e.dataType) + case o => o + } } SpecifiedWindowFrame(RowFrame, boundary, boundary) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala index 0ee9b0edc02b2..2a0b2b85e10a9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala @@ -402,4 +402,18 @@ class DataFrameWindowFramesSuite extends QueryTest with SharedSQLContext { Row(7, 3000) :: Row(8, 3000) :: Row(9, 5500) :: Row(10, 6000) :: Nil) } + + test("SPARK-24033: Analysis Failure of OffsetWindowFunction") { + val ds = Seq((1, 1), (1, 2), (1, 3), (2, 1), (2, 2)).toDF("n", "i") + val res = + Row(1, 1, null) :: Row (1, 2, 1) :: Row(1, 3, 2) :: Row(2, 1, null) :: Row(2, 2, 1) :: Nil + checkAnswer( + ds.withColumn("m", + lead("i", -1).over(Window.partitionBy("n").orderBy("i").rowsBetween(-1, -1))), + res) + checkAnswer( + ds.withColumn("m", + lag("i", 1).over(Window.partitionBy("n").orderBy("i").rowsBetween(-1, -1))), + res) + } } From c2f4ee7baf07501cc1f8a23dd21d14aea53606c7 Mon Sep 17 00:00:00 2001 From: Mykhailo Shtelma Date: Sat, 21 Apr 2018 23:33:57 -0700 Subject: [PATCH 415/566] [SPARK-23799][SQL] FilterEstimation.evaluateInSet produces devision by zero in a case of empty table with analyzed statistics >What changes were proposed in this pull request? During evaluation of IN conditions, if the source data frame, is represented by a plan, that uses hive table with columns, which were previously analysed, and the plan has conditions for these fields, that cannot be satisfied (which leads us to an empty data frame), FilterEstimation.evaluateInSet method produces NumberFormatException and ClassCastException. In order to fix this bug, method FilterEstimation.evaluateInSet at first checks, if distinct count is not zero, and also checks if colStat.min and colStat.max are defined, and only in this case proceeds with the calculation. If at least one of the conditions is not satisfied, zero is returned. >How was this patch tested? In order to test the PR two tests were implemented: one in FilterEstimationSuite, that tests the plan with the statistics that violates the conditions mentioned above, and another one in StatisticsCollectionSuite, that test the whole process of analysis/optimisation of the query, that leads to the problems, mentioned in the first section. 
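As a rough illustration of the guard this patch adds, the sketch below uses simplified stand-in types (not the actual Catalyst `ColumnStat`/`FilterEstimation` code), and the closing selectivity formula is only a placeholder; the point is the early return for zero distinct count or undefined min/max.

```scala
// ColStat is a simplified stand-in for Catalyst's ColumnStat.
case class ColStat(distinctCount: Option[BigInt], min: Option[Any], max: Option[Any])

def inSetSelectivity(colStat: ColStat, hSetSize: Int): Option[Double] = {
  val ndv = colStat.distinctCount.getOrElse(BigInt(0))
  // An analyzed but empty table produces distinctCount = 0 and undefined min/max; without
  // this early return the estimation below would divide by zero.
  if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty) {
    return Some(0.0)
  }
  // Placeholder for the real estimation: fraction of distinct values retained by the IN set.
  Some(math.min(hSetSize.toDouble / ndv.toDouble, 1.0))
}
```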
Author: Mykhailo Shtelma Author: smikesh Closes #21052 from mshtelma/filter_estimation_evaluateInSet_Bugs. (cherry picked from commit c48085aa91c60615a4de3b391f019f46f3fcdbe3) Signed-off-by: gatorsmile --- .../statsEstimation/FilterEstimation.scala | 4 +++ .../FilterEstimationSuite.scala | 11 ++++++++ .../spark/sql/StatisticsCollectionSuite.scala | 28 +++++++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala index 4cc32de2d32d7..5b8b698f8eb36 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala @@ -388,6 +388,10 @@ case class FilterEstimation(plan: Filter) extends Logging { val dataType = attr.dataType var newNdv = ndv + if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty) { + return Some(0.0) + } + // use [min, max] to filter the original hSet dataType match { case _: NumericType | BooleanType | DateType | TimestampType => diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala index 2b1fe987a7960..e14b75ec42b67 100755 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala @@ -355,6 +355,17 @@ class FilterEstimationSuite extends StatsEstimationTestBase { expectedRowCount = 3) } + test("evaluateInSet with all zeros") { + validateEstimatedStats( + Filter(InSet(attrString, Set(3, 4, 5)), + StatsTestPlan(Seq(attrString), 0, + AttributeMap(Seq(attrString -> + ColumnStat(distinctCount = Some(0), min = None, max = None, + nullCount = Some(0), avgLen = Some(0), maxLen = Some(0)))))), + Seq(attrString -> ColumnStat(distinctCount = Some(0))), + expectedRowCount = 0) + } + test("cint NOT IN (3, 4, 5)") { validateEstimatedStats( Filter(Not(InSet(attrInt, Set(3, 4, 5))), childStatsTestPlan(Seq(attrInt), 10L)), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala index b11e798532056..3f877749a50b3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala @@ -372,4 +372,32 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared } } } + + test("Simple queries must be working, if CBO is turned on") { + withSQLConf(SQLConf.CBO_ENABLED.key -> "true") { + withTable("TBL1", "TBL") { + import org.apache.spark.sql.functions._ + val df = spark.range(1000L).select('id, + 'id * 2 as "FLD1", + 'id * 12 as "FLD2", + lit("aaa") + 'id as "fld3") + df.write + .mode(SaveMode.Overwrite) + .bucketBy(10, "id", "FLD1", "FLD2") + .sortBy("id", "FLD1", "FLD2") + .saveAsTable("TBL") + sql("ANALYZE TABLE TBL COMPUTE STATISTICS ") + sql("ANALYZE TABLE TBL COMPUTE STATISTICS FOR COLUMNS ID, FLD1, FLD2, FLD3") + val df2 = spark.sql( + """ + |SELECT t1.id, t1.fld1, t1.fld2, t1.fld3 + |FROM tbl t1 + |JOIN tbl t2 on t1.id=t2.id + 
|WHERE t1.fld3 IN (-123.23,321.23) + """.stripMargin) + df2.createTempView("TBL2") + sql("SELECT * FROM tbl2 WHERE fld3 IN ('qqq', 'qwe') ").queryExecution.executedPlan + } + } + } } From 8eb9a411d89e43110e9553e0d19a16cdc37bf789 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Mon, 23 Apr 2018 13:20:32 -0700 Subject: [PATCH 416/566] [SPARK-23004][SS] Ensure StateStore.commit is called only once in a streaming aggregation task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? A structured streaming query with a streaming aggregation can throw the following error in rare cases.  ``` java.lang.IllegalStateException: Cannot commit after already committed or aborted at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider.org$apache$spark$sql$execution$streaming$state$HDFSBackedStateStoreProvider$$verify(HDFSBackedStateStoreProvider.scala:643) at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider$HDFSBackedStateStore.commit(HDFSBackedStateStoreProvider.scala:135) at org.apache.spark.sql.execution.streaming.StateStoreSaveExec$$anonfun$doExecute$3$$anon$2$$anonfun$hasNext$2.apply$mcV$sp(statefulOperators.scala:359) at org.apache.spark.sql.execution.streaming.StateStoreWriter$class.timeTakenMs(statefulOperators.scala:102) at org.apache.spark.sql.execution.streaming.StateStoreSaveExec.timeTakenMs(statefulOperators.scala:251) at org.apache.spark.sql.execution.streaming.StateStoreSaveExec$$anonfun$doExecute$3$$anon$2.hasNext(statefulOperators.scala:359) at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.processInputs(ObjectAggregationIterator.scala:188) at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.(ObjectAggregationIterator.scala:78) at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec$$anonfun$doExecute$1$$anonfun$2.apply(ObjectHashAggregateExec.scala:114) at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec$$anonfun$doExecute$1$$anonfun$2.apply(ObjectHashAggregateExec.scala:105) at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndexInternal$1$$anonfun$apply$24.apply(RDD.scala:830) at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndexInternal$1$$anonfun$apply$24.apply(RDD.scala:830) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:42) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:336) ``` This can happen when the following conditions are accidentally hit.  - Streaming aggregation with aggregation function that is a subset of [`TypedImperativeAggregation`](https://github.com/apache/spark/blob/76b8b840ddc951ee6203f9cccd2c2b9671c1b5e8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala#L473) (for example, `collect_set`, `collect_list`, `percentile`, etc.).  - Query running in `update}` mode - After the shuffle, a partition has exactly 128 records.  This causes StateStore.commit to be called twice. See the [JIRA](https://issues.apache.org/jira/browse/SPARK-23004) for a more detailed explanation. The solution is to use `NextIterator` or `CompletionIterator`, each of which has a flag to prevent the "onCompletion" task from being called more than once. In this PR, I chose to implement using `NextIterator`. ## How was this patch tested? Added unit test that I have confirm will fail without the fix. Author: Tathagata Das Closes #21124 from tdas/SPARK-23004. 
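To illustrate the single-shot completion guarantee this fix relies on, here is a trimmed-down, standalone sketch of the `NextIterator` idea (the real helper used by the patch is Spark's internal `NextIterator`; this rewrite only shows why `close()` cannot run twice even when `hasNext` is called again after the input is exhausted, which is the situation hit when a partition has exactly the fallback-threshold number of records).

```scala
// close() stands in for the StateStore.commit bookkeeping and runs at most once.
abstract class CloseOnceIterator[U] extends Iterator[U] {
  private var gotNext = false
  private var closed = false
  private var nextValue: U = _
  protected var finished = false

  /** Return the next value, or set `finished = true` when there is none. */
  protected def getNext(): U
  /** Completion logic; guaranteed to be invoked at most once. */
  protected def close(): Unit

  override def hasNext: Boolean = {
    if (!finished && !gotNext) {
      nextValue = getNext()
      if (finished && !closed) {
        closed = true
        close()
      }
      gotNext = true
    }
    !finished
  }

  override def next(): U = {
    if (!hasNext) throw new NoSuchElementException("End of stream")
    gotNext = false
    nextValue
  }
}
```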
(cherry picked from commit 770add81c3474e754867d7105031a5eaf27159bd) Signed-off-by: Tathagata Das --- .../streaming/statefulOperators.scala | 40 +++++++++---------- .../streaming/StreamingAggregationSuite.scala | 25 ++++++++++++ 2 files changed, 44 insertions(+), 21 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala index b9b07a2e688f9..c9354ac0ec78a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala @@ -340,37 +340,35 @@ case class StateStoreSaveExec( // Update and output modified rows from the StateStore. case Some(Update) => - val updatesStartTimeNs = System.nanoTime - - new Iterator[InternalRow] { - + new NextIterator[InternalRow] { // Filter late date using watermark if specified private[this] val baseIterator = watermarkPredicateForData match { case Some(predicate) => iter.filter((row: InternalRow) => !predicate.eval(row)) case None => iter } + private val updatesStartTimeNs = System.nanoTime - override def hasNext: Boolean = { - if (!baseIterator.hasNext) { - allUpdatesTimeMs += NANOSECONDS.toMillis(System.nanoTime - updatesStartTimeNs) - - // Remove old aggregates if watermark specified - allRemovalsTimeMs += timeTakenMs { removeKeysOlderThanWatermark(store) } - commitTimeMs += timeTakenMs { store.commit() } - setStoreMetrics(store) - false + override protected def getNext(): InternalRow = { + if (baseIterator.hasNext) { + val row = baseIterator.next().asInstanceOf[UnsafeRow] + val key = getKey(row) + store.put(key, row) + numOutputRows += 1 + numUpdatedStateRows += 1 + row } else { - true + finished = true + null } } - override def next(): InternalRow = { - val row = baseIterator.next().asInstanceOf[UnsafeRow] - val key = getKey(row) - store.put(key, row) - numOutputRows += 1 - numUpdatedStateRows += 1 - row + override protected def close(): Unit = { + allUpdatesTimeMs += NANOSECONDS.toMillis(System.nanoTime - updatesStartTimeNs) + + // Remove old aggregates if watermark specified + allRemovalsTimeMs += timeTakenMs { removeKeysOlderThanWatermark(store) } + commitTimeMs += timeTakenMs { store.commit() } + setStoreMetrics(store) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala index 1cae8cb8d47f1..382da13430781 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala @@ -536,6 +536,31 @@ class StreamingAggregationSuite extends StateStoreMetricsTest ) } + test("SPARK-23004: Ensure that TypedImperativeAggregate functions do not throw errors") { + // See the JIRA SPARK-23004 for more details. In short, this test reproduces the error + // by ensuring the following. + // - A streaming query with a streaming aggregation. + // - Aggregation function 'collect_list' that is a subclass of TypedImperativeAggregate. + // - Post shuffle partition has exactly 128 records (i.e. the threshold at which + // ObjectHashAggregateExec falls back to sort-based aggregation). This is done by having a + // micro-batch with 128 records that shuffle to a single partition. 
+ // This test throws the exact error reported in SPARK-23004 without the corresponding fix. + withSQLConf("spark.sql.shuffle.partitions" -> "1") { + val input = MemoryStream[Int] + val df = input.toDF().toDF("value") + .selectExpr("value as group", "value") + .groupBy("group") + .agg(collect_list("value")) + testStream(df, outputMode = OutputMode.Update)( + AddData(input, (1 to spark.sqlContext.conf.objectAggSortBasedFallbackThreshold): _*), + AssertOnQuery { q => + q.processAllAvailable() + true + } + ) + } + } + /** Add blocks of data to the `BlockRDDBackedSource`. */ case class AddBlockData(source: BlockRDDBackedSource, data: Seq[Int]*) extends AddData { override def addData(query: Option[StreamExecution]): (Source, Offset) = { From 1c3e8205d04d8e40a9d73633631534a728d7b1fe Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Mon, 23 Apr 2018 13:57:57 -0700 Subject: [PATCH 417/566] Revert "[SPARK-23799][SQL] FilterEstimation.evaluateInSet produces devision by zero in a case of empty table with analyzed statistics" This reverts commit c2f4ee7baf07501cc1f8a23dd21d14aea53606c7. --- .../statsEstimation/FilterEstimation.scala | 4 --- .../FilterEstimationSuite.scala | 11 -------- .../spark/sql/StatisticsCollectionSuite.scala | 28 ------------------- 3 files changed, 43 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala index 5b8b698f8eb36..4cc32de2d32d7 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala @@ -388,10 +388,6 @@ case class FilterEstimation(plan: Filter) extends Logging { val dataType = attr.dataType var newNdv = ndv - if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty) { - return Some(0.0) - } - // use [min, max] to filter the original hSet dataType match { case _: NumericType | BooleanType | DateType | TimestampType => diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala index e14b75ec42b67..2b1fe987a7960 100755 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala @@ -355,17 +355,6 @@ class FilterEstimationSuite extends StatsEstimationTestBase { expectedRowCount = 3) } - test("evaluateInSet with all zeros") { - validateEstimatedStats( - Filter(InSet(attrString, Set(3, 4, 5)), - StatsTestPlan(Seq(attrString), 0, - AttributeMap(Seq(attrString -> - ColumnStat(distinctCount = Some(0), min = None, max = None, - nullCount = Some(0), avgLen = Some(0), maxLen = Some(0)))))), - Seq(attrString -> ColumnStat(distinctCount = Some(0))), - expectedRowCount = 0) - } - test("cint NOT IN (3, 4, 5)") { validateEstimatedStats( Filter(Not(InSet(attrInt, Set(3, 4, 5))), childStatsTestPlan(Seq(attrInt), 10L)), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala index 3f877749a50b3..b11e798532056 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala @@ -372,32 +372,4 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared } } } - - test("Simple queries must be working, if CBO is turned on") { - withSQLConf(SQLConf.CBO_ENABLED.key -> "true") { - withTable("TBL1", "TBL") { - import org.apache.spark.sql.functions._ - val df = spark.range(1000L).select('id, - 'id * 2 as "FLD1", - 'id * 12 as "FLD2", - lit("aaa") + 'id as "fld3") - df.write - .mode(SaveMode.Overwrite) - .bucketBy(10, "id", "FLD1", "FLD2") - .sortBy("id", "FLD1", "FLD2") - .saveAsTable("TBL") - sql("ANALYZE TABLE TBL COMPUTE STATISTICS ") - sql("ANALYZE TABLE TBL COMPUTE STATISTICS FOR COLUMNS ID, FLD1, FLD2, FLD3") - val df2 = spark.sql( - """ - |SELECT t1.id, t1.fld1, t1.fld2, t1.fld3 - |FROM tbl t1 - |JOIN tbl t2 on t1.id=t2.id - |WHERE t1.fld3 IN (-123.23,321.23) - """.stripMargin) - df2.createTempView("TBL2") - sql("SELECT * FROM tbl2 WHERE fld3 IN ('qqq', 'qwe') ").queryExecution.executedPlan - } - } - } } From 096defdd7bb1d687edff06fffdc6cda2ccd022b3 Mon Sep 17 00:00:00 2001 From: seancxmao Date: Tue, 24 Apr 2018 16:16:07 +0800 Subject: [PATCH 418/566] [MINOR][DOCS] Fix comments of SQLExecution#withExecutionId ## What changes were proposed in this pull request? Fix comment. Change `BroadcastHashJoin.broadcastFuture` to `BroadcastExchangeExec.relationFuture`: https://github.com/apache/spark/blob/d28d5732ae205771f1f443b15b10e64dcffb5ff0/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala#L66 ## How was this patch tested? N/A Author: seancxmao Closes #21113 from seancxmao/SPARK-13136. (cherry picked from commit c303b1b6766a3dc5961713f98f62cd7d7ac7972a) Signed-off-by: hyukjinkwon --- .../scala/org/apache/spark/sql/execution/SQLExecution.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala index e991da7df0bde..2c5102b1e5ee7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala @@ -88,7 +88,7 @@ object SQLExecution { /** * Wrap an action with a known executionId. When running a different action in a different * thread from the original one, this method can be used to connect the Spark jobs in this action - * with the known executionId, e.g., `BroadcastHashJoin.broadcastFuture`. + * with the known executionId, e.g., `BroadcastExchangeExec.relationFuture`. */ def withExecutionId[T](sc: SparkContext, executionId: String)(body: => T): T = { val oldExecutionId = sc.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) From 07ec75ca00a8ad6608ec577e771fa4d2651a9661 Mon Sep 17 00:00:00 2001 From: jerryshao Date: Thu, 26 Apr 2018 13:27:33 +0800 Subject: [PATCH 419/566] [SPARK-24062][THRIFT SERVER] Fix SASL encryption cannot enabled issue in thrift server ## What changes were proposed in this pull request? For the details of the exception please see [SPARK-24062](https://issues.apache.org/jira/browse/SPARK-24062). The issue is: Spark on Yarn stores SASL secret in current UGI's credentials, this credentials will be distributed to AM and executors, so that executors and drive share the same secret to communicate. 
But STS/Hive library code will refresh the current UGI by UGI's loginFromKeytab() after Spark application is started, this will create a new UGI in the current driver's context with empty tokens and secret keys, so secret key is lost in the current context's UGI, that's why Spark driver throws secret key not found exception. In Spark 2.2 code, Spark also stores this secret key in SecurityManager's class variable, so even UGI is refreshed, the secret is still existed in the object, so STS with SASL can still be worked in Spark 2.2. But in Spark 2.3, we always search key from current UGI, which makes it fail to work in Spark 2.3. To fix this issue, there're two possible solutions: 1. Fix in STS/Hive library, when a new UGI is refreshed, copy the secret key from original UGI to the new one. The difficulty is that some codes to refresh the UGI is existed in Hive library, which makes us hard to change the code. 2. Roll back the logics in SecurityManager to match Spark 2.2, so that this issue can be fixed. 2nd solution seems a simple one. So I will propose a PR with 2nd solution. ## How was this patch tested? Verified in local cluster. CC vanzin tgravescs please help to review. Thanks! Author: jerryshao Closes #21138 from jerryshao/SPARK-24062. (cherry picked from commit ffaf0f9fd407aeba7006f3d785ea8a0e51187357) Signed-off-by: jerryshao --- .../main/scala/org/apache/spark/SecurityManager.scala | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SecurityManager.scala b/core/src/main/scala/org/apache/spark/SecurityManager.scala index 5b15a1c57779d..1d4a071f36506 100644 --- a/core/src/main/scala/org/apache/spark/SecurityManager.scala +++ b/core/src/main/scala/org/apache/spark/SecurityManager.scala @@ -227,6 +227,7 @@ private[spark] class SecurityManager( setViewAclsGroups(sparkConf.get("spark.ui.view.acls.groups", "")); setModifyAclsGroups(sparkConf.get("spark.modify.acls.groups", "")); + private var secretKey: String = _ logInfo("SecurityManager: authentication " + (if (authOn) "enabled" else "disabled") + "; ui acls " + (if (aclsOn) "enabled" else "disabled") + "; users with view permissions: " + viewAcls.toString() + @@ -504,6 +505,12 @@ private[spark] class SecurityManager( val creds = UserGroupInformation.getCurrentUser().getCredentials() Option(creds.getSecretKey(SECRET_LOOKUP_KEY)) .map { bytes => new String(bytes, UTF_8) } + // Secret key may not be found in current UGI's credentials. + // This happens when UGI is refreshed in the driver side by UGI's loginFromKeytab but not + // copy secret key from original UGI to the new one. This exists in ThriftServer's Hive + // logic. So as a workaround, storing secret key in a local variable to make it visible + // in different context. 
+ .orElse(Option(secretKey)) .orElse(Option(sparkConf.getenv(ENV_AUTH_SECRET))) .orElse(sparkConf.getOption(SPARK_AUTH_SECRET_CONF)) .getOrElse { @@ -541,8 +548,8 @@ private[spark] class SecurityManager( rnd.nextBytes(secretBytes) val creds = new Credentials() - val secretStr = HashCodes.fromBytes(secretBytes).toString() - creds.addSecretKey(SECRET_LOOKUP_KEY, secretStr.getBytes(UTF_8)) + secretKey = HashCodes.fromBytes(secretBytes).toString() + creds.addSecretKey(SECRET_LOOKUP_KEY, secretKey.getBytes(UTF_8)) UserGroupInformation.getCurrentUser().addCredentials(creds) } From 4a10df0f66f74ec2c995f9832d1ab74112bdeb16 Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Fri, 27 Apr 2018 11:43:29 -0700 Subject: [PATCH 420/566] [SPARK-24085][SQL] Query returns UnsupportedOperationException when scalar subquery is present in partitioning expression ## What changes were proposed in this pull request? In this case, the partition pruning happens before the planning phase of scalar subquery expressions. For scalar subquery expressions, the planning occurs late in the cycle (after the physical planning) in "PlanSubqueries" just before execution. Currently we try to execute the scalar subquery expression as part of partition pruning and fail as it implements Unevaluable. The fix attempts to ignore the Subquery expressions from partition pruning computation. Another option can be to somehow plan the subqueries before the partition pruning. Since this may not be a commonly occuring expression, i am opting for a simpler fix. Repro ``` SQL CREATE TABLE test_prc_bug ( id_value string ) partitioned by (id_type string) location '/tmp/test_prc_bug' stored as parquet; insert into test_prc_bug values ('1','a'); insert into test_prc_bug values ('2','a'); insert into test_prc_bug values ('3','b'); insert into test_prc_bug values ('4','b'); select * from test_prc_bug where id_type = (select 'b'); ``` ## How was this patch tested? Added test in SubquerySuite and hive/SQLQuerySuite Author: Dilip Biswal Closes #21174 from dilipbiswal/spark-24085. 
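To make the pruning change concrete, here is a toy model of the predicate selection (the case class below is a stand-in for Catalyst expressions, not real Spark API; the real code filters on `SubqueryExpression.hasSubquery` and `references.subsetOf(partitionSet)`).

```scala
case class SimplePredicate(
    references: Set[String],   // attributes the predicate touches
    hasSubquery: Boolean)      // contains a not-yet-planned scalar subquery

def partitionKeyFilters(
    normalizedFilters: Seq[SimplePredicate],
    partitionColumns: Set[String]): Seq[SimplePredicate] = {
  normalizedFilters
    .filterNot(_.hasSubquery)                          // new: subqueries are Unevaluable here
    .filter(_.references.subsetOf(partitionColumns))   // unchanged: partition-only predicates
}

// `id_type = (SELECT 'b')` references only the partition column but is skipped, so it is
// evaluated later as an ordinary filter instead of during partition pruning.
val preds = Seq(
  SimplePredicate(Set("id_type"), hasSubquery = true),
  SimplePredicate(Set("id_type"), hasSubquery = false))
assert(partitionKeyFilters(preds, Set("id_type")) == Seq(preds(1)))
```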
(cherry picked from commit 3fd297af6dc568357c97abf86760c570309d6597) Signed-off-by: gatorsmile --- .../datasources/FileSourceStrategy.scala | 5 ++- .../PruneFileSourcePartitions.scala | 4 ++- .../org/apache/spark/sql/SubquerySuite.scala | 15 +++++++++ .../sql/hive/execution/SQLQuerySuite.scala | 31 +++++++++++++++++++ 4 files changed, 53 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala index 16b22717b8d92..0a568d6b8adce 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala @@ -76,7 +76,10 @@ object FileSourceStrategy extends Strategy with Logging { fsRelation.partitionSchema, fsRelation.sparkSession.sessionState.analyzer.resolver) val partitionSet = AttributeSet(partitionColumns) val partitionKeyFilters = - ExpressionSet(normalizedFilters.filter(_.references.subsetOf(partitionSet))) + ExpressionSet(normalizedFilters + .filterNot(SubqueryExpression.hasSubquery(_)) + .filter(_.references.subsetOf(partitionSet))) + logInfo(s"Pruning directories with: ${partitionKeyFilters.mkString(",")}") val dataColumns = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala index 3b830accb83f0..16b2367bfdd5c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala @@ -55,7 +55,9 @@ private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] { partitionSchema, sparkSession.sessionState.analyzer.resolver) val partitionSet = AttributeSet(partitionColumns) val partitionKeyFilters = - ExpressionSet(normalizedFilters.filter(_.references.subsetOf(partitionSet))) + ExpressionSet(normalizedFilters + .filterNot(SubqueryExpression.hasSubquery(_)) + .filter(_.references.subsetOf(partitionSet))) if (partitionKeyFilters.nonEmpty) { val prunedFileIndex = catalogFileIndex.filterPartitions(partitionKeyFilters.toSeq) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index 31e8b0e8dede0..acef62d81ee12 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -955,4 +955,19 @@ class SubquerySuite extends QueryTest with SharedSQLContext { // before the fix this would throw AnalysisException spark.range(10).where("(id,id) in (select id, null from range(3))").count } + + test("SPARK-24085 scalar subquery in partitioning expression") { + withTable("parquet_part") { + Seq("1" -> "a", "2" -> "a", "3" -> "b", "4" -> "b") + .toDF("id_value", "id_type") + .write + .mode(SaveMode.Overwrite) + .partitionBy("id_type") + .format("parquet") + .saveAsTable("parquet_part") + checkAnswer( + sql("SELECT * FROM parquet_part WHERE id_type = (SELECT 'b')"), + Row("3", "b") :: Row("4", "b") :: Nil) + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 5e14f5397e9e6..081d854d771a7 
100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -2154,4 +2154,35 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } } + + test("SPARK-24085 scalar subquery in partitioning expression") { + Seq("orc", "parquet").foreach { format => + Seq(true, false).foreach { isConverted => + withSQLConf( + HiveUtils.CONVERT_METASTORE_ORC.key -> s"$isConverted", + HiveUtils.CONVERT_METASTORE_PARQUET.key -> s"$isConverted", + "hive.exec.dynamic.partition.mode" -> "nonstrict") { + withTable(format) { + withTempPath { tempDir => + sql( + s""" + |CREATE TABLE ${format} (id_value string) + |PARTITIONED BY (id_type string) + |LOCATION '${tempDir.toURI}' + |STORED AS ${format} + """.stripMargin) + sql(s"insert into $format values ('1','a')") + sql(s"insert into $format values ('2','a')") + sql(s"insert into $format values ('3','b')") + sql(s"insert into $format values ('4','b')") + checkAnswer( + sql(s"SELECT * FROM $format WHERE id_type = (SELECT 'b')"), + Row("3", "b") :: Row("4", "b") :: Nil) + } + } + } + } + } + } + } From df45ddb9dea9bf42d18c1164cf35067c7cac5d6f Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Fri, 27 Apr 2018 14:14:28 -0700 Subject: [PATCH 421/566] [SPARK-24104] SQLAppStatusListener overwrites metrics onDriverAccumUpdates instead of updating them ## What changes were proposed in this pull request? Event `SparkListenerDriverAccumUpdates` may happen multiple times in a query - e.g. every `FileSourceScanExec` and `BroadcastExchangeExec` call `postDriverMetricUpdates`. In Spark 2.2 `SQLListener` updated the map with new values. `SQLAppStatusListener` overwrites it. Unless `update` preserved it in the KV store (dependant on `exec.lastWriteTime`), only the metrics from the last operator that does `postDriverMetricUpdates` are preserved. ## How was this patch tested? Unit test added. Author: Juliusz Sompolski Closes #21171 from juliuszsompolski/SPARK-24104. 
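A plain-`Map` illustration of why merging matters when several operators post driver metric updates for the same execution (the values below are made up; the listener's real state is `exec.driverAccumUpdates` in `SQLAppStatusListener`).

```scala
// Accumulator id -> value, as posted by two different operators of one query.
val fromFileScan: Seq[(Long, Long)]  = Seq(1L -> 10L)   // e.g. posted by FileSourceScanExec
val fromBroadcast: Seq[(Long, Long)] = Seq(2L -> 20L)   // e.g. posted by BroadcastExchangeExec

// Old behaviour: each SparkListenerDriverAccumUpdates event overwrote the whole map.
var overwritten = Map.empty[Long, Long]
Seq(fromFileScan, fromBroadcast).foreach(updates => overwritten = updates.toMap)
assert(overwritten == Map(2L -> 20L))            // the scan's metric is lost

// Fixed behaviour: events are merged; later values win per accumulator id.
var merged = Map.empty[Long, Long]
Seq(fromFileScan, fromBroadcast).foreach(updates => merged = merged ++ updates)
assert(merged == Map(1L -> 10L, 2L -> 20L))
```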
(cherry picked from commit 8614edd445264007144caa6743a8c2ca2b5082e0) Signed-off-by: Marcelo Vanzin --- .../execution/ui/SQLAppStatusListener.scala | 2 +- .../ui/SQLAppStatusListenerSuite.scala | 24 +++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index 2b6bb48467eb3..d254af400a7cf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -289,7 +289,7 @@ class SQLAppStatusListener( private def onDriverAccumUpdates(event: SparkListenerDriverAccumUpdates): Unit = { val SparkListenerDriverAccumUpdates(executionId, accumUpdates) = event Option(liveExecutions.get(executionId)).foreach { exec => - exec.driverAccumUpdates = accumUpdates.toMap + exec.driverAccumUpdates = exec.driverAccumUpdates ++ accumUpdates update(exec) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala index f3f08839c1d3a..02df45d1b7989 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala @@ -443,7 +443,8 @@ class SQLAppStatusListenerSuite extends SparkFunSuite with SharedSQLContext with val oldCount = statusStore.executionsList().size val expectedAccumValue = 12345 - val physicalPlan = MyPlan(sqlContext.sparkContext, expectedAccumValue) + val expectedAccumValue2 = 54321 + val physicalPlan = MyPlan(sqlContext.sparkContext, expectedAccumValue, expectedAccumValue2) val dummyQueryExecution = new QueryExecution(spark, LocalRelation()) { override lazy val sparkPlan = physicalPlan override lazy val executedPlan = physicalPlan @@ -466,10 +467,14 @@ class SQLAppStatusListenerSuite extends SparkFunSuite with SharedSQLContext with val execId = statusStore.executionsList().last.executionId val metrics = statusStore.executionMetrics(execId) val driverMetric = physicalPlan.metrics("dummy") + val driverMetric2 = physicalPlan.metrics("dummy2") val expectedValue = SQLMetrics.stringValue(driverMetric.metricType, Seq(expectedAccumValue)) + val expectedValue2 = SQLMetrics.stringValue(driverMetric2.metricType, Seq(expectedAccumValue2)) assert(metrics.contains(driverMetric.id)) assert(metrics(driverMetric.id) === expectedValue) + assert(metrics.contains(driverMetric2.id)) + assert(metrics(driverMetric2.id) === expectedValue2) } test("roundtripping SparkListenerDriverAccumUpdates through JsonProtocol (SPARK-18462)") { @@ -562,20 +567,31 @@ class SQLAppStatusListenerSuite extends SparkFunSuite with SharedSQLContext with * A dummy [[org.apache.spark.sql.execution.SparkPlan]] that updates a [[SQLMetrics]] * on the driver. 
*/ -private case class MyPlan(sc: SparkContext, expectedValue: Long) extends LeafExecNode { +private case class MyPlan(sc: SparkContext, expectedValue: Long, expectedValue2: Long) + extends LeafExecNode { + override def sparkContext: SparkContext = sc override def output: Seq[Attribute] = Seq() override val metrics: Map[String, SQLMetric] = Map( - "dummy" -> SQLMetrics.createMetric(sc, "dummy")) + "dummy" -> SQLMetrics.createMetric(sc, "dummy"), + "dummy2" -> SQLMetrics.createMetric(sc, "dummy2")) override def doExecute(): RDD[InternalRow] = { longMetric("dummy") += expectedValue + longMetric("dummy2") += expectedValue2 + + // postDriverMetricUpdates may happen multiple time in a query. + // (normally from different operators, but for the sake of testing, from one operator) + SQLMetrics.postDriverMetricUpdates( + sc, + sc.getLocalProperty(SQLExecution.EXECUTION_ID_KEY), + Seq(metrics("dummy"))) SQLMetrics.postDriverMetricUpdates( sc, sc.getLocalProperty(SQLExecution.EXECUTION_ID_KEY), - metrics.values.toSeq) + Seq(metrics("dummy2"))) sc.emptyRDD } } From 235ec9ee74f80900c9bbfbe966856e96c947564e Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 30 Apr 2018 09:40:46 +0800 Subject: [PATCH 422/566] [MINOR][DOCS] Fix a broken link for Arrow's supported types in the programming guide ## What changes were proposed in this pull request? This PR fixes a broken link for Arrow's supported types in the programming guide. ## How was this patch tested? Manually tested via `SKIP_API=1 jekyll watch`. "Supported SQL Types" here in https://spark.apache.org/docs/latest/sql-programming-guide.html#enabling-for-conversion-tofrom-pandas is broken. It should be https://spark.apache.org/docs/latest/sql-programming-guide.html#supported-sql-types Author: hyukjinkwon Closes #21191 from HyukjinKwon/minor-arrow-link. (cherry picked from commit 56f501e1c0cec3be7d13008bd2c0182ec83ed2a2) Signed-off-by: hyukjinkwon --- docs/sql-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 06e5cedc53fe5..651e4408aecb5 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1699,7 +1699,7 @@ Using the above optimizations with Arrow will produce the same results as when A enabled. Note that even with Arrow, `toPandas()` results in the collection of all records in the DataFrame to the driver program and should be done on a small subset of the data. Not all Spark data types are currently supported and an error can be raised if a column has an unsupported type, -see [Supported SQL Types](#supported-sql-arrow-types). If an error occurs during `createDataFrame()`, +see [Supported SQL Types](#supported-sql-types). If an error occurs during `createDataFrame()`, Spark will fall back to create the DataFrame without Arrow. ## Pandas UDFs (a.k.a. Vectorized UDFs) From 52a420ff62fabcc649d7acf85a72a3c0cd6437e8 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 1 May 2018 09:06:23 +0800 Subject: [PATCH 423/566] [SPARK-23853][PYSPARK][TEST] Run Hive-related PySpark tests only for `-Phive` ## What changes were proposed in this pull request? When `PyArrow` or `Pandas` are not available, the corresponding PySpark tests are skipped automatically. Currently, PySpark tests fail when we are not using `-Phive`. This PR aims to skip Hive related PySpark tests when `-Phive` is not given. 
**BEFORE** ```bash $ build/mvn -DskipTests clean package $ python/run-tests.py --python-executables python2.7 --modules pyspark-sql File "/Users/dongjoon/spark/python/pyspark/sql/readwriter.py", line 295, in pyspark.sql.readwriter.DataFrameReader.table ... IllegalArgumentException: u"Error while instantiating 'org.apache.spark.sql.hive.HiveExternalCatalog':" ********************************************************************** 1 of 3 in pyspark.sql.readwriter.DataFrameReader.table ***Test Failed*** 1 failures. ``` **AFTER** ```bash $ build/mvn -DskipTests clean package $ python/run-tests.py --python-executables python2.7 --modules pyspark-sql ... Tests passed in 138 seconds Skipped tests in pyspark.sql.tests with python2.7: ... test_hivecontext (pyspark.sql.tests.HiveSparkSubmitTests) ... skipped 'Hive is not available.' ``` ## How was this patch tested? This is a test-only change. First, this should pass the Jenkins. Then, manually do the following. ```bash build/mvn -DskipTests clean package python/run-tests.py --python-executables python2.7 --modules pyspark-sql ``` Author: Dongjoon Hyun Closes #21141 from dongjoon-hyun/SPARK-23853. (cherry picked from commit b857fb549f3bf4e6f289ba11f3903db0a3696dec) Signed-off-by: hyukjinkwon --- python/pyspark/sql/readwriter.py | 2 +- python/pyspark/sql/tests.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index e70aa9ef4aa97..28c10aa0cad5f 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -956,7 +956,7 @@ def _test(): globs = pyspark.sql.readwriter.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') try: - spark = SparkSession.builder.enableHiveSupport().getOrCreate() + spark = SparkSession.builder.getOrCreate() except py4j.protocol.Py4JError: spark = SparkSession(sc) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index daa97e1627eec..f8da7d89c1fd0 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -2935,6 +2935,26 @@ def test_create_dateframe_from_pandas_with_dst(self): class HiveSparkSubmitTests(SparkSubmitTests): + @classmethod + def setUpClass(cls): + # get a SparkContext to check for availability of Hive + sc = SparkContext('local[4]', cls.__name__) + cls.hive_available = True + try: + sc._jvm.org.apache.hadoop.hive.conf.HiveConf() + except py4j.protocol.Py4JError: + cls.hive_available = False + except TypeError: + cls.hive_available = False + finally: + # we don't need this SparkContext for the test + sc.stop() + + def setUp(self): + super(HiveSparkSubmitTests, self).setUp() + if not self.hive_available: + self.skipTest("Hive is not available.") + def test_hivecontext(self): # This test checks that HiveContext is using Hive metastore (SPARK-16224). # It sets a metastore url and checks if there is a derby dir created by From 682f05d638a67b869d0e170f6c99683445e2449d Mon Sep 17 00:00:00 2001 From: Bounkong Khamphousone Date: Tue, 1 May 2018 08:28:21 -0700 Subject: [PATCH 424/566] [SPARK-23941][MESOS] Mesos task failed on specific spark app name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? Shell escaped the name passed to spark-submit and change how conf attributes are shell escaped. ## How was this patch tested? 
This test has been tested manually with Hive-on-spark with mesos or with the use case described in the issue with the sparkPi application with a custom name which contains illegal shell characters. With this PR, hive-on-spark on mesos works like a charm with hive 3.0.0-SNAPSHOT. I state that this contribution is my original work and that I license the work to the project under the project’s open source license Author: Bounkong Khamphousone Closes #21014 from tiboun/fix/SPARK-23941. (cherry picked from commit 6782359a04356e4cde32940861bf2410ef37f445) Signed-off-by: Marcelo Vanzin --- .../spark/scheduler/cluster/mesos/MesosClusterScheduler.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala index d224a7325820a..b36f46456f9a5 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala @@ -530,9 +530,9 @@ private[spark] class MesosClusterScheduler( .filter { case (key, _) => !replicatedOptionsBlacklist.contains(key) } .toMap (defaultConf ++ driverConf).foreach { case (key, value) => - options ++= Seq("--conf", s""""$key=${shellEscape(value)}"""".stripMargin) } + options ++= Seq("--conf", s"${key}=${value}") } - options + options.map(shellEscape) } /** From 88abf7b9bff7162648265490356bc3f999e86dd3 Mon Sep 17 00:00:00 2001 From: WangJinhai02 Date: Wed, 2 May 2018 22:40:14 +0800 Subject: [PATCH 425/566] [SPARK-24107][CORE] ChunkedByteBuffer.writeFully method has not reset the limit value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit JIRA Issue: https://issues.apache.org/jira/browse/SPARK-24107?jql=text%20~%20%22ChunkedByteBuffer%22 ChunkedByteBuffer.writeFully method has not reset the limit value. When chunks larger than bufferWriteChunkSize, such as 80 * 1024 * 1024 larger than config.BUFFER_WRITE_CHUNK_SIZE(64 * 1024 * 1024),only while once, will lost 16 * 1024 * 1024 byte Author: WangJinhai02 Closes #21175 from manbuyun/bugfix-ChunkedByteBuffer. 
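For reference, a standalone sketch of the corrected write loop follows (the buffers, channel, and write chunk size are caller-supplied here, unlike the real `ChunkedByteBuffer`, which reads the chunk size from the Spark conf): saving and restoring the chunk's original limit around every partial write is what keeps a chunk larger than the write chunk size from being truncated to its first slice.

```scala
import java.nio.ByteBuffer
import java.nio.channels.WritableByteChannel

def writeFully(chunks: Seq[ByteBuffer], channel: WritableByteChannel, writeChunkSize: Int): Unit = {
  for (bytes <- chunks) {
    val curChunkLimit = bytes.limit()
    while (bytes.hasRemaining) {
      try {
        val ioSize = math.min(bytes.remaining(), writeChunkSize)
        bytes.limit(bytes.position() + ioSize)   // temporarily cap the slice being written
        channel.write(bytes)
      } finally {
        bytes.limit(curChunkLimit)               // the reset that the original code skipped
      }
    }
  }
}
```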
(cherry picked from commit 152eaf6ae698cd0df7f5a5be3f17ee46e0be929d) Signed-off-by: Wenchen Fan --- .../spark/util/io/ChunkedByteBuffer.scala | 13 +++++++++---- .../spark/io/ChunkedByteBufferSuite.scala | 17 +++++++++++++++-- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala index 7367af7888bd8..3ae8dfcc1cb66 100644 --- a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala +++ b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala @@ -63,10 +63,15 @@ private[spark] class ChunkedByteBuffer(var chunks: Array[ByteBuffer]) { */ def writeFully(channel: WritableByteChannel): Unit = { for (bytes <- getChunks()) { - while (bytes.remaining() > 0) { - val ioSize = Math.min(bytes.remaining(), bufferWriteChunkSize) - bytes.limit(bytes.position() + ioSize) - channel.write(bytes) + val curChunkLimit = bytes.limit() + while (bytes.hasRemaining) { + try { + val ioSize = Math.min(bytes.remaining(), bufferWriteChunkSize) + bytes.limit(bytes.position() + ioSize) + channel.write(bytes) + } finally { + bytes.limit(curChunkLimit) + } } } } diff --git a/core/src/test/scala/org/apache/spark/io/ChunkedByteBufferSuite.scala b/core/src/test/scala/org/apache/spark/io/ChunkedByteBufferSuite.scala index 3b798e36b0499..2107559572d78 100644 --- a/core/src/test/scala/org/apache/spark/io/ChunkedByteBufferSuite.scala +++ b/core/src/test/scala/org/apache/spark/io/ChunkedByteBufferSuite.scala @@ -21,11 +21,12 @@ import java.nio.ByteBuffer import com.google.common.io.ByteStreams -import org.apache.spark.SparkFunSuite +import org.apache.spark.{SharedSparkContext, SparkFunSuite} +import org.apache.spark.internal.config import org.apache.spark.network.util.ByteArrayWritableChannel import org.apache.spark.util.io.ChunkedByteBuffer -class ChunkedByteBufferSuite extends SparkFunSuite { +class ChunkedByteBufferSuite extends SparkFunSuite with SharedSparkContext { test("no chunks") { val emptyChunkedByteBuffer = new ChunkedByteBuffer(Array.empty[ByteBuffer]) @@ -56,6 +57,18 @@ class ChunkedByteBufferSuite extends SparkFunSuite { assert(chunkedByteBuffer.getChunks().head.position() === 0) } + test("SPARK-24107: writeFully() write buffer which is larger than bufferWriteChunkSize") { + try { + sc.conf.set(config.BUFFER_WRITE_CHUNK_SIZE, 32L * 1024L * 1024L) + val chunkedByteBuffer = new ChunkedByteBuffer(Array(ByteBuffer.allocate(40 * 1024 * 1024))) + val byteArrayWritableChannel = new ByteArrayWritableChannel(chunkedByteBuffer.size.toInt) + chunkedByteBuffer.writeFully(byteArrayWritableChannel) + assert(byteArrayWritableChannel.length() === chunkedByteBuffer.size) + } finally { + sc.conf.remove(config.BUFFER_WRITE_CHUNK_SIZE) + } + } + test("toArray()") { val empty = ByteBuffer.wrap(Array.empty[Byte]) val bytes = ByteBuffer.wrap(Array.tabulate(8)(_.toByte)) From b3adb5300031b191411b84c9ab1d203949bf342c Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 2 May 2018 12:02:02 -0700 Subject: [PATCH 426/566] [SPARK-23971][BACKPORT-2.3] Should not leak Spark sessions across test suites This PR is to backport the PR https://github.com/apache/spark/pull/21058 to Apache 2.3. 
This should be the cause why we saw the test regressions in Apache 2.3 branches: https://amplab.cs.berkeley.edu/jenkins/job/spark-branch-2.3-test-sbt-hadoop-2.6/317/testReport/org.apache.spark.sql.execution.datasources.parquet/ParquetQuerySuite/SPARK_15678__not_use_cache_on_overwrite/history/ https://amplab.cs.berkeley.edu/jenkins/job/spark-branch-2.3-test-sbt-hadoop-2.7/318/testReport/junit/org.apache.spark.sql/DataFrameSuite/inputFiles/history/ --- ## What changes were proposed in this pull request? Many suites currently leak Spark sessions (sometimes with stopped SparkContexts) via the thread-local active Spark session and default Spark session. We should attempt to clean these up and detect when this happens to improve the reproducibility of tests. ## How was this patch tested? Existing tests Author: Eric Liang Closes #21197 from gatorsmile/backportSPARK-23971. --- .../org/apache/spark/SharedSparkSession.java | 9 ++++++-- .../org/apache/spark/sql/SparkSession.scala | 23 +++++++++++++++++-- .../apache/spark/sql/SessionStateSuite.scala | 2 ++ .../spark/sql/test/SharedSparkSession.scala | 22 ++++++++++++++---- 4 files changed, 47 insertions(+), 9 deletions(-) diff --git a/mllib/src/test/java/org/apache/spark/SharedSparkSession.java b/mllib/src/test/java/org/apache/spark/SharedSparkSession.java index 43779878890db..35a250955b282 100644 --- a/mllib/src/test/java/org/apache/spark/SharedSparkSession.java +++ b/mllib/src/test/java/org/apache/spark/SharedSparkSession.java @@ -42,7 +42,12 @@ public void setUp() throws IOException { @After public void tearDown() { - spark.stop(); - spark = null; + try { + spark.stop(); + spark = null; + } finally { + SparkSession.clearDefaultSession(); + SparkSession.clearActiveSession(); + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 6cd5de887b169..b699ccd08ff40 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -44,7 +44,7 @@ import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.streaming._ import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.util.ExecutionListenerManager -import org.apache.spark.util.Utils +import org.apache.spark.util.{CallSite, Utils} /** @@ -81,6 +81,9 @@ class SparkSession private( @transient private[sql] val extensions: SparkSessionExtensions) extends Serializable with Closeable with Logging { self => + // The call site where this SparkSession was constructed. + private val creationSite: CallSite = Utils.getCallSite() + private[sql] def this(sc: SparkContext) { this(sc, None, None, new SparkSessionExtensions) } @@ -763,7 +766,7 @@ class SparkSession private( @InterfaceStability.Stable -object SparkSession { +object SparkSession extends Logging { /** * Builder for [[SparkSession]]. @@ -1079,4 +1082,20 @@ object SparkSession { } } + private[spark] def cleanupAnyExistingSession(): Unit = { + val session = getActiveSession.orElse(getDefaultSession) + if (session.isDefined) { + logWarning( + s"""An existing Spark session exists as the active or default session. + |This probably means another suite leaked it. Attempting to stop it before continuing. 
+ |This existing Spark session was created at: + | + |${session.get.creationSite.longForm} + | + """.stripMargin) + session.get.stop() + SparkSession.clearActiveSession() + SparkSession.clearDefaultSession() + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala index 8386f324d8fed..77710fd277808 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala @@ -43,6 +43,8 @@ class SessionStateSuite extends SparkFunSuite { if (activeSession != null) { activeSession.stop() activeSession = null + SparkSession.clearActiveSession() + SparkSession.clearDefaultSession() } super.afterAll() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala index e758c865b908f..8968dbf36d507 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala @@ -60,6 +60,7 @@ trait SharedSparkSession protected implicit def sqlContext: SQLContext = _spark.sqlContext protected def createSparkSession: TestSparkSession = { + SparkSession.cleanupAnyExistingSession() new TestSparkSession(sparkConf) } @@ -92,11 +93,22 @@ trait SharedSparkSession * Stop the underlying [[org.apache.spark.SparkContext]], if any. */ protected override def afterAll(): Unit = { - super.afterAll() - if (_spark != null) { - _spark.sessionState.catalog.reset() - _spark.stop() - _spark = null + try { + super.afterAll() + } finally { + try { + if (_spark != null) { + try { + _spark.sessionState.catalog.reset() + } finally { + _spark.stop() + _spark = null + } + } + } finally { + SparkSession.clearActiveSession() + SparkSession.clearDefaultSession() + } } } From 0fe53b64e9ee68956e5f5cd454942af432f58fc1 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Thu, 3 May 2018 15:15:05 +0800 Subject: [PATCH 427/566] [SPARK-23489][SQL][TEST] HiveExternalCatalogVersionsSuite should verify the downloaded file ## What changes were proposed in this pull request? Although [SPARK-22654](https://issues.apache.org/jira/browse/SPARK-22654) made `HiveExternalCatalogVersionsSuite` download from Apache mirrors three times, it has been flaky because it didn't verify the downloaded file. Some Apache mirrors terminate the downloading abnormally, the *corrupted* file shows the following errors. ``` gzip: stdin: not in gzip format tar: Child returned status 1 tar: Error is not recoverable: exiting now 22:46:32.700 WARN org.apache.spark.sql.hive.HiveExternalCatalogVersionsSuite: ===== POSSIBLE THREAD LEAK IN SUITE o.a.s.sql.hive.HiveExternalCatalogVersionsSuite, thread names: Keep-Alive-Timer ===== *** RUN ABORTED *** java.io.IOException: Cannot run program "./bin/spark-submit" (in directory "/tmp/test-spark/spark-2.2.0"): error=2, No such file or directory ``` This has been reported weirdly in two ways. For example, the above case is reported as Case 2 `no failures`. - Case 1. [Test Result (1 failure / +1)](https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Test%20(Dashboard)/job/spark-master-test-sbt-hadoop-2.7/4389/) - Case 2. 
[Test Result (no failures)](https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Test%20(Dashboard)/job/spark-master-test-maven-hadoop-2.6/4811/) This PR aims to make `HiveExternalCatalogVersionsSuite` more robust by verifying the downloaded `tgz` file by extracting and checking the existence of `bin/spark-submit`. If it turns out that the file is empty or corrupted, `HiveExternalCatalogVersionsSuite` will do retry logic like the download failure. ## How was this patch tested? Pass the Jenkins. Author: Dongjoon Hyun Closes #21210 from dongjoon-hyun/SPARK-23489. (cherry picked from commit c9bfd1c6f8d16890ea1e5bc2bcb654a3afb32591) Signed-off-by: Wenchen Fan --- .../HiveExternalCatalogVersionsSuite.scala | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index 6ca58e68d31eb..ea86ab9772bc7 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -67,7 +67,21 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { logInfo(s"Downloading Spark $version from $url") try { getFileFromUrl(url, path, filename) - return + val downloaded = new File(sparkTestingDir, filename).getCanonicalPath + val targetDir = new File(sparkTestingDir, s"spark-$version").getCanonicalPath + + Seq("mkdir", targetDir).! + val exitCode = Seq("tar", "-xzf", downloaded, "-C", targetDir, "--strip-components=1").! + Seq("rm", downloaded).! + + // For a corrupted file, `tar` returns non-zero values. However, we also need to check + // the extracted file because `tar` returns 0 for empty file. + val sparkSubmit = new File(sparkTestingDir, s"spark-$version/bin/spark-submit") + if (exitCode == 0 && sparkSubmit.exists()) { + return + } else { + Seq("rm", "-rf", targetDir).! + } } catch { case ex: Exception => logWarning(s"Failed to download Spark $version from $url", ex) } @@ -75,20 +89,6 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { fail(s"Unable to download Spark $version") } - - private def downloadSpark(version: String): Unit = { - tryDownloadSpark(version, sparkTestingDir.getCanonicalPath) - - val downloaded = new File(sparkTestingDir, s"spark-$version-bin-hadoop2.7.tgz").getCanonicalPath - val targetDir = new File(sparkTestingDir, s"spark-$version").getCanonicalPath - - Seq("mkdir", targetDir).! - - Seq("tar", "-xzf", downloaded, "-C", targetDir, "--strip-components=1").! - - Seq("rm", downloaded).! - } - private def genDataDir(name: String): String = { new File(tmpDataDir, name).getCanonicalPath } @@ -161,7 +161,7 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { PROCESS_TABLES.testingVersions.zipWithIndex.foreach { case (version, index) => val sparkHome = new File(sparkTestingDir, s"spark-$version") if (!sparkHome.exists()) { - downloadSpark(version) + tryDownloadSpark(version, sparkTestingDir.getCanonicalPath) } val args = Seq( From 10e2f1fc02e1b5f794f7721466a1f755c8979e53 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 3 May 2018 19:56:30 +0800 Subject: [PATCH 428/566] [SPARK-24166][SQL] InMemoryTableScanExec should not access SQLConf at executor side ## What changes were proposed in this pull request? 
This PR is extracted from https://github.com/apache/spark/pull/21190 , to make it easier to backport. `InMemoryTableScanExec#createAndDecompressColumn` is executed inside `rdd.map`, we can't access `conf.offHeapColumnVectorEnabled` there. ## How was this patch tested? it's tested in #21190 Author: Wenchen Fan Closes #21223 from cloud-fan/minor1. (cherry picked from commit 991b526992bcf1dc1268578b650916569b12f583) Signed-off-by: Wenchen Fan --- .../execution/columnar/InMemoryTableScanExec.scala | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index e73e1378d52e3..08b2751ba5789 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -78,10 +78,12 @@ case class InMemoryTableScanExec( private lazy val columnarBatchSchema = new StructType(columnIndices.map(i => relationSchema(i))) - private def createAndDecompressColumn(cachedColumnarBatch: CachedBatch): ColumnarBatch = { + private def createAndDecompressColumn( + cachedColumnarBatch: CachedBatch, + offHeapColumnVectorEnabled: Boolean): ColumnarBatch = { val rowCount = cachedColumnarBatch.numRows val taskContext = Option(TaskContext.get()) - val columnVectors = if (!conf.offHeapColumnVectorEnabled || taskContext.isEmpty) { + val columnVectors = if (!offHeapColumnVectorEnabled || taskContext.isEmpty) { OnHeapColumnVector.allocateColumns(rowCount, columnarBatchSchema) } else { OffHeapColumnVector.allocateColumns(rowCount, columnarBatchSchema) @@ -101,10 +103,13 @@ case class InMemoryTableScanExec( private lazy val inputRDD: RDD[InternalRow] = { val buffers = filteredCachedBatches() + val offHeapColumnVectorEnabled = conf.offHeapColumnVectorEnabled if (supportsBatch) { // HACK ALERT: This is actually an RDD[ColumnarBatch]. // We're taking advantage of Scala's type erasure here to pass these batches along. - buffers.map(createAndDecompressColumn).asInstanceOf[RDD[InternalRow]] + buffers + .map(createAndDecompressColumn(_, offHeapColumnVectorEnabled)) + .asInstanceOf[RDD[InternalRow]] } else { val numOutputRows = longMetric("numOutputRows") From bfe50b6843938100e7bad59071b027689a22ab83 Mon Sep 17 00:00:00 2001 From: Ala Luszczak Date: Thu, 3 May 2018 16:52:40 +0200 Subject: [PATCH 429/566] [SPARK-24133][SQL] Backport [] Check for integer overflows when resizing WritableColumnVectors `ColumnVector`s store string data in one big byte array. Since the array size is capped at just under Integer.MAX_VALUE, a single `ColumnVector` cannot store more than 2GB of string data. But since the Parquet files commonly contain large blobs stored as strings, and `ColumnVector`s by default carry 4096 values, it's entirely possible to go past that limit. In such cases a negative capacity is requested from `WritableColumnVector.reserve()`. The call succeeds (requested capacity is smaller than already allocated capacity), and consequently `java.lang.ArrayIndexOutOfBoundsException` is thrown when the reader actually attempts to put the data into the array. This change introduces a simple check for integer overflow to `WritableColumnVector.reserve()` which should help catch the error earlier and provide more informative exception. 
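A standalone sketch of the failure mode and of the early check (illustrative names and sizes only, not the actual `WritableColumnVector` code):

```scala
object ReserveOverflowSketch {
  private var capacity: Int = 1 << 26                       // pretend ~64MB is already allocated

  def reserve(requiredCapacity: Int): Unit = {
    if (requiredCapacity < 0) {
      // An upstream Int overflow surfaces here as a negative request; failing fast
      // replaces a later ArrayIndexOutOfBoundsException with a descriptive error.
      throw new RuntimeException(
        s"Cannot reserve additional contiguous bytes (integer overflow): $requiredCapacity")
    } else if (requiredCapacity > capacity) {
      capacity = Math.min(Int.MaxValue.toLong, requiredCapacity * 2L).toInt
    }
  }

  def main(args: Array[String]): Unit = {
    val bytesUsedSoFar = Int.MaxValue - 100                 // close to the ~2GB array limit
    val nextValueSize = 1000
    try reserve(bytesUsedSoFar + nextValueSize)             // the sum wraps to a negative Int
    catch { case e: RuntimeException => println(e.getMessage) }
  }
}
```

Without the negative-request guard, the wrapped value is simply smaller than the current capacity, so `reserve` silently succeeds and the failure only appears later at write time.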
Additionally, the error message in `WritableColumnVector.throwUnsupportedException()` was corrected. New units tests were added. Author: Ala Luszczak Closes #21206 from ala/overflow-reserve. (cherry picked from commit 8bd27025b7cf0b44726b6f4020d294ef14dbbb7e) Signed-off-by: Ala Luszczak Author: Ala Luszczak Closes #21227 from ala/cherry-pick-overflow-reserve. --- .../execution/vectorized/WritableColumnVector.java | 13 ++++++++----- .../execution/vectorized/ColumnarBatchSuite.scala | 7 +++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index 48eebcf2f0746..36a92b66e988a 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -81,7 +81,9 @@ public void close() { } public void reserve(int requiredCapacity) { - if (requiredCapacity > capacity) { + if (requiredCapacity < 0) { + throwUnsupportedException(requiredCapacity, null); + } else if (requiredCapacity > capacity) { int newCapacity = (int) Math.min(MAX_CAPACITY, requiredCapacity * 2L); if (requiredCapacity <= newCapacity) { try { @@ -96,10 +98,11 @@ public void reserve(int requiredCapacity) { } private void throwUnsupportedException(int requiredCapacity, Throwable cause) { - String message = "Cannot reserve additional contiguous bytes in the vectorized reader " + - "(requested = " + requiredCapacity + " bytes). As a workaround, you can disable the " + - "vectorized reader by setting " + SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key() + - " to false."; + String message = "Cannot reserve additional contiguous bytes in the vectorized reader (" + + (requiredCapacity >= 0 ? "requested " + requiredCapacity + " bytes" : "integer overflow") + + "). As a workaround, you can disable the vectorized reader. For parquet file format, " + + "refer to " + SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key() + "; for orc file format," + + " refer to " + SQLConf.ORC_VECTORIZED_READER_ENABLED().key() + "."; throw new RuntimeException(message, cause); } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 772f687526008..f57f07b498261 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -1333,4 +1333,11 @@ class ColumnarBatchSuite extends SparkFunSuite { column.close() } + + testVector("WritableColumnVector.reserve(): requested capacity is negative", 1024, ByteType) { + column => + val ex = intercept[RuntimeException] { column.reserve(-1) } + assert(ex.getMessage.contains( + "Cannot reserve additional contiguous bytes in the vectorized reader (integer overflow)")) + } } From 61e7bc0c145b0da3129e1dac46d72cf0db5e1d94 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 3 May 2018 23:36:09 +0800 Subject: [PATCH 430/566] [SPARK-24169][SQL] JsonToStructs should not access SQLConf at executor side ## What changes were proposed in this pull request? This PR is extracted from #21190 , to make it easier to backport. 
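The next sentence describes the actual change; as a generic, hedged sketch of the driver-side capture pattern it applies (all names below are illustrative, not the `JsonToStructs` code):

```scala
import org.apache.spark.rdd.RDD

object DriverSideCaptureSketch {
  // `readConf` stands in for a SQLConf lookup that is only valid on the driver.
  def parse(records: RDD[String], readConf: () => Boolean): RDD[Int] = {
    val forceNullableSchema = readConf()      // evaluated once, on the driver
    records.map { rec =>
      // The task closure ships only the captured Boolean, never the config registry.
      if (forceNullableSchema) rec.length else rec.trim.length
    }
  }
}
```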
`JsonToStructs` can be serialized to executors and evaluate, we should not call `SQLConf.get.getConf(SQLConf.FROM_JSON_FORCE_NULLABLE_SCHEMA)` in the body. ## How was this patch tested? tested in #21190 Author: Wenchen Fan Closes #21226 from cloud-fan/minor4. (cherry picked from commit 96a50016bb0fb1cc57823a6706bff2467d671efd) Signed-off-by: Wenchen Fan --- .../catalyst/analysis/FunctionRegistry.scala | 4 +- .../expressions/jsonExpressions.scala | 16 ++-- .../expressions/JsonExpressionsSuite.scala | 76 +++++++++---------- .../org/apache/spark/sql/functions.scala | 2 +- .../sql-tests/results/json-functions.sql.out | 4 +- 5 files changed, 54 insertions(+), 48 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 747016beb06e7..f3dfd69d96529 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -526,7 +526,9 @@ object FunctionRegistry { // Otherwise, find a constructor method that matches the number of arguments, and use that. val params = Seq.fill(expressions.size)(classOf[Expression]) val f = constructors.find(_.getParameterTypes.toSeq == params).getOrElse { - val validParametersCount = constructors.map(_.getParameterCount).distinct.sorted + val validParametersCount = constructors + .filter(_.getParameterTypes.forall(_ == classOf[Expression])) + .map(_.getParameterCount).distinct.sorted val expectedNumberOfParameters = if (validParametersCount.length == 1) { validParametersCount.head.toString } else { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index fdd672c416a03..34161f0f03f4a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -514,11 +514,10 @@ case class JsonToStructs( schema: DataType, options: Map[String, String], child: Expression, - timeZoneId: Option[String] = None) + timeZoneId: Option[String], + forceNullableSchema: Boolean) extends UnaryExpression with TimeZoneAwareExpression with CodegenFallback with ExpectsInputTypes { - val forceNullableSchema = SQLConf.get.getConf(SQLConf.FROM_JSON_FORCE_NULLABLE_SCHEMA) - // The JSON input data might be missing certain fields. We force the nullability // of the user-provided schema to avoid data corruptions. In particular, the parquet-mr encoder // can generate incorrect files if values are missing in columns declared as non-nullable. 
@@ -532,14 +531,21 @@ case class JsonToStructs( schema = JsonExprUtils.validateSchemaLiteral(schema), options = Map.empty[String, String], child = child, - timeZoneId = None) + timeZoneId = None, + forceNullableSchema = SQLConf.get.getConf(SQLConf.FROM_JSON_FORCE_NULLABLE_SCHEMA)) def this(child: Expression, schema: Expression, options: Expression) = this( schema = JsonExprUtils.validateSchemaLiteral(schema), options = JsonExprUtils.convertToMapData(options), child = child, - timeZoneId = None) + timeZoneId = None, + forceNullableSchema = SQLConf.get.getConf(SQLConf.FROM_JSON_FORCE_NULLABLE_SCHEMA)) + + // Used in `org.apache.spark.sql.functions` + def this(schema: DataType, options: Map[String, String], child: Expression) = + this(schema, options, child, timeZoneId = None, + forceNullableSchema = SQLConf.get.getConf(SQLConf.FROM_JSON_FORCE_NULLABLE_SCHEMA)) override def checkInputDataTypes(): TypeCheckResult = nullableSchema match { case _: StructType | ArrayType(_: StructType, _) => diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala index 7812319756eae..00e97637eee7e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala @@ -392,7 +392,7 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with val jsonData = """{"a": 1}""" val schema = StructType(StructField("a", IntegerType) :: Nil) checkEvaluation( - JsonToStructs(schema, Map.empty, Literal(jsonData), gmtId), + JsonToStructs(schema, Map.empty, Literal(jsonData), gmtId, true), InternalRow(1) ) } @@ -401,13 +401,13 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with val jsonData = """{"a" 1}""" val schema = StructType(StructField("a", IntegerType) :: Nil) checkEvaluation( - JsonToStructs(schema, Map.empty, Literal(jsonData), gmtId), + JsonToStructs(schema, Map.empty, Literal(jsonData), gmtId, true), null ) // Other modes should still return `null`. 
checkEvaluation( - JsonToStructs(schema, Map("mode" -> PermissiveMode.name), Literal(jsonData), gmtId), + JsonToStructs(schema, Map("mode" -> PermissiveMode.name), Literal(jsonData), gmtId, true), null ) } @@ -416,62 +416,62 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with val input = """[{"a": 1}, {"a": 2}]""" val schema = ArrayType(StructType(StructField("a", IntegerType) :: Nil)) val output = InternalRow(1) :: InternalRow(2) :: Nil - checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId), output) + checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId, true), output) } test("from_json - input=object, schema=array, output=array of single row") { val input = """{"a": 1}""" val schema = ArrayType(StructType(StructField("a", IntegerType) :: Nil)) val output = InternalRow(1) :: Nil - checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId), output) + checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId, true), output) } test("from_json - input=empty array, schema=array, output=empty array") { val input = "[ ]" val schema = ArrayType(StructType(StructField("a", IntegerType) :: Nil)) val output = Nil - checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId), output) + checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId, true), output) } test("from_json - input=empty object, schema=array, output=array of single row with null") { val input = "{ }" val schema = ArrayType(StructType(StructField("a", IntegerType) :: Nil)) val output = InternalRow(null) :: Nil - checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId), output) + checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId, true), output) } test("from_json - input=array of single object, schema=struct, output=single row") { val input = """[{"a": 1}]""" val schema = StructType(StructField("a", IntegerType) :: Nil) val output = InternalRow(1) - checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId), output) + checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId, true), output) } test("from_json - input=array, schema=struct, output=null") { val input = """[{"a": 1}, {"a": 2}]""" val schema = StructType(StructField("a", IntegerType) :: Nil) val output = null - checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId), output) + checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId, true), output) } test("from_json - input=empty array, schema=struct, output=null") { val input = """[]""" val schema = StructType(StructField("a", IntegerType) :: Nil) val output = null - checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId), output) + checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId, true), output) } test("from_json - input=empty object, schema=struct, output=single row with null") { val input = """{ }""" val schema = StructType(StructField("a", IntegerType) :: Nil) val output = InternalRow(null) - checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId), output) + checkEvaluation(JsonToStructs(schema, Map.empty, Literal(input), gmtId, true), output) } test("from_json null input column") { val schema = StructType(StructField("a", IntegerType) :: Nil) checkEvaluation( - JsonToStructs(schema, Map.empty, Literal.create(null, StringType), gmtId), + JsonToStructs(schema, Map.empty, Literal.create(null, StringType), gmtId, true), null ) } @@ -479,7 +479,7 @@ class 
JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with test("SPARK-20549: from_json bad UTF-8") { val schema = StructType(StructField("a", IntegerType) :: Nil) checkEvaluation( - JsonToStructs(schema, Map.empty, Literal(badJson), gmtId), + JsonToStructs(schema, Map.empty, Literal(badJson), gmtId, true), null) } @@ -491,14 +491,14 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with c.set(2016, 0, 1, 0, 0, 0) c.set(Calendar.MILLISECOND, 123) checkEvaluation( - JsonToStructs(schema, Map.empty, Literal(jsonData1), gmtId), + JsonToStructs(schema, Map.empty, Literal(jsonData1), gmtId, true), InternalRow(c.getTimeInMillis * 1000L) ) // The result doesn't change because the json string includes timezone string ("Z" here), // which means the string represents the timestamp string in the timezone regardless of // the timeZoneId parameter. checkEvaluation( - JsonToStructs(schema, Map.empty, Literal(jsonData1), Option("PST")), + JsonToStructs(schema, Map.empty, Literal(jsonData1), Option("PST"), true), InternalRow(c.getTimeInMillis * 1000L) ) @@ -512,7 +512,8 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with schema, Map("timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss"), Literal(jsonData2), - Option(tz.getID)), + Option(tz.getID), + true), InternalRow(c.getTimeInMillis * 1000L) ) checkEvaluation( @@ -521,7 +522,8 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with Map("timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss", DateTimeUtils.TIMEZONE_OPTION -> tz.getID), Literal(jsonData2), - gmtId), + gmtId, + true), InternalRow(c.getTimeInMillis * 1000L) ) } @@ -530,7 +532,7 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with test("SPARK-19543: from_json empty input column") { val schema = StructType(StructField("a", IntegerType) :: Nil) checkEvaluation( - JsonToStructs(schema, Map.empty, Literal.create(" ", StringType), gmtId), + JsonToStructs(schema, Map.empty, Literal.create(" ", StringType), gmtId, true), null ) } @@ -685,27 +687,23 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with test("from_json missing fields") { for (forceJsonNullableSchema <- Seq(false, true)) { - withSQLConf(SQLConf.FROM_JSON_FORCE_NULLABLE_SCHEMA.key -> forceJsonNullableSchema.toString) { - val input = - """{ - | "a": 1, - | "c": "foo" - |} - |""".stripMargin - val jsonSchema = new StructType() - .add("a", LongType, nullable = false) - .add("b", StringType, nullable = false) - .add("c", StringType, nullable = false) - val output = InternalRow(1L, null, UTF8String.fromString("foo")) - checkEvaluation( - JsonToStructs(jsonSchema, Map.empty, Literal.create(input, StringType), gmtId), - output - ) - val schema = JsonToStructs(jsonSchema, Map.empty, Literal.create(input, StringType), gmtId) - .dataType - val schemaToCompare = if (forceJsonNullableSchema) jsonSchema.asNullable else jsonSchema - assert(schemaToCompare == schema) - } + val input = + """{ + | "a": 1, + | "c": "foo" + |} + |""".stripMargin + val jsonSchema = new StructType() + .add("a", LongType, nullable = false) + .add("b", StringType, nullable = false) + .add("c", StringType, nullable = false) + val output = InternalRow(1L, null, UTF8String.fromString("foo")) + val expr = JsonToStructs( + jsonSchema, Map.empty, Literal.create(input, StringType), gmtId, forceJsonNullableSchema) + checkEvaluation(expr, output) + val schema = expr.dataType + val schemaToCompare = if (forceJsonNullableSchema) 
jsonSchema.asNullable else jsonSchema + assert(schemaToCompare == schema) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index c9ca9a8996344..e31a9c8e3af34 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -3132,7 +3132,7 @@ object functions { * @since 2.2.0 */ def from_json(e: Column, schema: DataType, options: Map[String, String]): Column = withExpr { - JsonToStructs(schema, options, e.expr) + new JsonToStructs(schema, options, e.expr) } /** diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out index 581dddc89d0bb..14a69128ffb41 100644 --- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out @@ -129,7 +129,7 @@ select to_json() struct<> -- !query 12 output org.apache.spark.sql.AnalysisException -Invalid number of arguments for function to_json. Expected: one of 1, 2 and 3; Found: 0; line 1 pos 7 +Invalid number of arguments for function to_json. Expected: one of 1 and 2; Found: 0; line 1 pos 7 -- !query 13 @@ -225,7 +225,7 @@ select from_json() struct<> -- !query 21 output org.apache.spark.sql.AnalysisException -Invalid number of arguments for function from_json. Expected: one of 2, 3 and 4; Found: 0; line 1 pos 7 +Invalid number of arguments for function from_json. Expected: one of 2 and 3; Found: 0; line 1 pos 7 -- !query 22 From 8509284e1ec048d5afa87d41071c0429924e45c9 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Thu, 3 May 2018 10:59:18 -0500 Subject: [PATCH 431/566] [SPARK-23433][CORE] Late zombie task completions update all tasksets Fetch failure lead to multiple tasksets which are active for a given stage. While there is only one "active" version of the taskset, the earlier attempts can still have running tasks, which can complete successfully. So a task completion needs to update every taskset so that it knows the partition is completed. That way the final active taskset does not try to submit another task for the same partition, and so that it knows when it is completed and when it should be marked as a "zombie". Added a regression test. Author: Imran Rashid Closes #21131 from squito/SPARK-23433. (cherry picked from commit 94641fe6cc68e5977dd8663b8f232a287a783acb) Signed-off-by: Imran Rashid --- .../spark/scheduler/TaskSchedulerImpl.scala | 14 +++ .../spark/scheduler/TaskSetManager.scala | 20 +++- .../scheduler/TaskSchedulerImplSuite.scala | 104 ++++++++++++++++++ 3 files changed, 137 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 0c11806b3981b..8e97b3da33820 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -689,6 +689,20 @@ private[spark] class TaskSchedulerImpl( } } + /** + * Marks the task has completed in all TaskSetManagers for the given stage. + * + * After stage failure and retry, there may be multiple TaskSetManagers for the stage. + * If an earlier attempt of a stage completes a task, we should ensure that the later attempts + * do not also submit those same tasks. 
That also means that a task completion from an earlier + * attempt can lead to the entire stage getting marked as successful. + */ + private[scheduler] def markPartitionCompletedInAllTaskSets(stageId: Int, partitionId: Int) = { + taskSetsByStageIdAndAttempt.getOrElse(stageId, Map()).values.foreach { tsm => + tsm.markPartitionCompleted(partitionId) + } + } + } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index c3ed11bfe352a..b52e376e7b870 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -74,6 +74,8 @@ private[spark] class TaskSetManager( val ser = env.closureSerializer.newInstance() val tasks = taskSet.tasks + private[scheduler] val partitionToIndex = tasks.zipWithIndex + .map { case (t, idx) => t.partitionId -> idx }.toMap val numTasks = tasks.length val copiesRunning = new Array[Int](numTasks) @@ -154,7 +156,7 @@ private[spark] class TaskSetManager( private[scheduler] val speculatableTasks = new HashSet[Int] // Task index, start and finish time for each task attempt (indexed by task ID) - private val taskInfos = new HashMap[Long, TaskInfo] + private[scheduler] val taskInfos = new HashMap[Long, TaskInfo] // Use a MedianHeap to record durations of successful tasks so we know when to launch // speculative tasks. This is only used when speculation is enabled, to avoid the overhead @@ -755,6 +757,9 @@ private[spark] class TaskSetManager( logInfo("Ignoring task-finished event for " + info.id + " in stage " + taskSet.id + " because task " + index + " has already completed successfully") } + // There may be multiple tasksets for this stage -- we let all of them know that the partition + // was completed. This may result in some of the tasksets getting completed. + sched.markPartitionCompletedInAllTaskSets(stageId, tasks(index).partitionId) // This method is called by "TaskSchedulerImpl.handleSuccessfulTask" which holds the // "TaskSchedulerImpl" lock until exiting. To avoid the SPARK-7655 issue, we should not // "deserialize" the value when holding a lock to avoid blocking other threads. So we call @@ -765,6 +770,19 @@ private[spark] class TaskSetManager( maybeFinishTaskSet() } + private[scheduler] def markPartitionCompleted(partitionId: Int): Unit = { + partitionToIndex.get(partitionId).foreach { index => + if (!successful(index)) { + tasksSuccessful += 1 + successful(index) = true + if (tasksSuccessful == numTasks) { + isZombie = true + } + maybeFinishTaskSet() + } + } + } + /** * Marks the task as failed, re-adds it to the list of pending tasks, and notifies the * DAG Scheduler. 
diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index 6003899bb7bef..33f2ea1c94e75 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala @@ -917,4 +917,108 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B taskScheduler.initialize(new FakeSchedulerBackend) } } + + test("Completions in zombie tasksets update status of non-zombie taskset") { + val taskScheduler = setupSchedulerWithMockTaskSetBlacklist() + val valueSer = SparkEnv.get.serializer.newInstance() + + def completeTaskSuccessfully(tsm: TaskSetManager, partition: Int): Unit = { + val indexInTsm = tsm.partitionToIndex(partition) + val matchingTaskInfo = tsm.taskAttempts.flatten.filter(_.index == indexInTsm).head + val result = new DirectTaskResult[Int](valueSer.serialize(1), Seq()) + tsm.handleSuccessfulTask(matchingTaskInfo.taskId, result) + } + + // Submit a task set, have it fail with a fetch failed, and then re-submit the task attempt, + // two times, so we have three active task sets for one stage. (For this to really happen, + // you'd need the previous stage to also get restarted, and then succeed, in between each + // attempt, but that happens outside what we're mocking here.) + val zombieAttempts = (0 until 2).map { stageAttempt => + val attempt = FakeTask.createTaskSet(10, stageAttemptId = stageAttempt) + taskScheduler.submitTasks(attempt) + val tsm = taskScheduler.taskSetManagerForAttempt(0, stageAttempt).get + val offers = (0 until 10).map{ idx => WorkerOffer(s"exec-$idx", s"host-$idx", 1) } + taskScheduler.resourceOffers(offers) + assert(tsm.runningTasks === 10) + // fail attempt + tsm.handleFailedTask(tsm.taskAttempts.head.head.taskId, TaskState.FAILED, + FetchFailed(null, 0, 0, 0, "fetch failed")) + // the attempt is a zombie, but the tasks are still running (this could be true even if + // we actively killed those tasks, as killing is best-effort) + assert(tsm.isZombie) + assert(tsm.runningTasks === 9) + tsm + } + + // we've now got 2 zombie attempts, each with 9 tasks still active. Submit the 3rd attempt for + // the stage, but this time with insufficient resources so not all tasks are active. + + val finalAttempt = FakeTask.createTaskSet(10, stageAttemptId = 2) + taskScheduler.submitTasks(finalAttempt) + val finalTsm = taskScheduler.taskSetManagerForAttempt(0, 2).get + val offers = (0 until 5).map{ idx => WorkerOffer(s"exec-$idx", s"host-$idx", 1) } + val finalAttemptLaunchedPartitions = taskScheduler.resourceOffers(offers).flatten.map { task => + finalAttempt.tasks(task.index).partitionId + }.toSet + assert(finalTsm.runningTasks === 5) + assert(!finalTsm.isZombie) + + // We simulate late completions from our zombie tasksets, corresponding to all the pending + // partitions in our final attempt. This means we're only waiting on the tasks we've already + // launched. + val finalAttemptPendingPartitions = (0 until 10).toSet.diff(finalAttemptLaunchedPartitions) + finalAttemptPendingPartitions.foreach { partition => + completeTaskSuccessfully(zombieAttempts(0), partition) + } + + // If there is another resource offer, we shouldn't run anything. Though our final attempt + // used to have pending tasks, now those tasks have been completed by zombie attempts. The + // remaining tasks to compute are already active in the non-zombie attempt. 
+ assert( + taskScheduler.resourceOffers(IndexedSeq(WorkerOffer("exec-1", "host-1", 1))).flatten.isEmpty) + + val remainingTasks = finalAttemptLaunchedPartitions.toIndexedSeq.sorted + + // finally, if we finish the remaining partitions from a mix of tasksets, all attempts should be + // marked as zombie. + // for each of the remaining tasks, find the tasksets with an active copy of the task, and + // finish the task. + remainingTasks.foreach { partition => + val tsm = if (partition == 0) { + // we failed this task on both zombie attempts, this one is only present in the latest + // taskset + finalTsm + } else { + // should be active in every taskset. We choose a zombie taskset just to make sure that + // we transition the active taskset correctly even if the final completion comes + // from a zombie. + zombieAttempts(partition % 2) + } + completeTaskSuccessfully(tsm, partition) + } + + assert(finalTsm.isZombie) + + // no taskset has completed all of its tasks, so no updates to the blacklist tracker yet + verify(blacklist, never).updateBlacklistForSuccessfulTaskSet(anyInt(), anyInt(), anyObject()) + + // finally, lets complete all the tasks. We simulate failures in attempt 1, but everything + // else succeeds, to make sure we get the right updates to the blacklist in all cases. + (zombieAttempts ++ Seq(finalTsm)).foreach { tsm => + val stageAttempt = tsm.taskSet.stageAttemptId + tsm.runningTasksSet.foreach { index => + if (stageAttempt == 1) { + tsm.handleFailedTask(tsm.taskInfos(index).taskId, TaskState.FAILED, TaskResultLost) + } else { + val result = new DirectTaskResult[Int](valueSer.serialize(1), Seq()) + tsm.handleSuccessfulTask(tsm.taskInfos(index).taskId, result) + } + } + + // we update the blacklist for the stage attempts with all successful tasks. Even though + // some tasksets had failures, we still consider them all successful from a blacklisting + // perspective, as the failures weren't from a problem w/ the tasks themselves. + verify(blacklist).updateBlacklistForSuccessfulTaskSet(meq(0), meq(stageAttempt), anyObject()) + } + } } From d35eb2f9b0af1a625749ca8b7f12d8eceed28766 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 3 May 2018 17:27:13 -0700 Subject: [PATCH 432/566] [SPARK-24168][SQL] WindowExec should not access SQLConf at executor side ## What changes were proposed in this pull request? This PR is extracted from #21190 , to make it easier to backport. `WindowExec#createBoundOrdering` is called on executor side, so we can't use `conf.sessionLocalTimezone` there. ## How was this patch tested? tested in #21190 Author: Wenchen Fan Closes #21225 from cloud-fan/minor3. (cherry picked from commit e646ae67f2e793204bc819ab2b90815214c2bbf3) Signed-off-by: gatorsmile --- .../spark/sql/execution/window/WindowExec.scala | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala index 800a2ea3f3996..626f39d9e95cc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala @@ -112,9 +112,11 @@ case class WindowExec( * * @param frame to evaluate. This can either be a Row or Range frame. * @param bound with respect to the row. + * @param timeZone the session local timezone for time related calculations. * @return a bound ordering object. 
*/ - private[this] def createBoundOrdering(frame: FrameType, bound: Expression): BoundOrdering = { + private[this] def createBoundOrdering( + frame: FrameType, bound: Expression, timeZone: String): BoundOrdering = { (frame, bound) match { case (RowFrame, CurrentRow) => RowBoundOrdering(0) @@ -144,7 +146,7 @@ case class WindowExec( val boundExpr = (expr.dataType, boundOffset.dataType) match { case (DateType, IntegerType) => DateAdd(expr, boundOffset) case (TimestampType, CalendarIntervalType) => - TimeAdd(expr, boundOffset, Some(conf.sessionLocalTimeZone)) + TimeAdd(expr, boundOffset, Some(timeZone)) case (a, b) if a== b => Add(expr, boundOffset) } val bound = newMutableProjection(boundExpr :: Nil, child.output) @@ -197,6 +199,7 @@ case class WindowExec( // Map the groups to a (unbound) expression and frame factory pair. var numExpressions = 0 + val timeZone = conf.sessionLocalTimeZone framedFunctions.toSeq.map { case (key, (expressions, functionSeq)) => val ordinal = numExpressions @@ -237,7 +240,7 @@ case class WindowExec( new UnboundedPrecedingWindowFunctionFrame( target, processor, - createBoundOrdering(frameType, upper)) + createBoundOrdering(frameType, upper, timeZone)) } // Shrinking Frame. @@ -246,7 +249,7 @@ case class WindowExec( new UnboundedFollowingWindowFunctionFrame( target, processor, - createBoundOrdering(frameType, lower)) + createBoundOrdering(frameType, lower, timeZone)) } // Moving Frame. @@ -255,8 +258,8 @@ case class WindowExec( new SlidingWindowFunctionFrame( target, processor, - createBoundOrdering(frameType, lower), - createBoundOrdering(frameType, upper)) + createBoundOrdering(frameType, lower, timeZone), + createBoundOrdering(frameType, upper, timeZone)) } } From 3f78f60cc80d04ce762bd332dd2fa5f7d17c2b3c Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 4 May 2018 19:20:15 +0800 Subject: [PATCH 433/566] [SPARK-23697][CORE] LegacyAccumulatorWrapper should define isZero correctly ## What changes were proposed in this pull request? It's possible that Accumulators of Spark 1.x may no longer work with Spark 2.x. This is because `LegacyAccumulatorWrapper.isZero` may return wrong answer if `AccumulableParam` doesn't define equals/hashCode. This PR fixes this by using reference equality check in `LegacyAccumulatorWrapper.isZero`. ## How was this patch tested? a new test Author: Wenchen Fan Closes #21229 from cloud-fan/accumulator. 
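A hedged, standalone illustration of the pitfall described above (the class below is made up, not Spark code): without `equals`/`hashCode`, two separately created zero values never compare equal, whereas checking against a single cached zero instance by reference stays correct because `reset()` stores that exact instance back.

```scala
object ZeroCheckSketch extends App {
  class MyData(val i: Int)                    // no equals/hashCode defined

  // Value-based comparison falls back to reference equality and fails:
  assert(new MyData(0) != new MyData(0))

  // Reference-based check against one cached zero instance:
  private val _zero = new MyData(0)
  private var _value: MyData = new MyData(5)
  def reset(): Unit = { _value = _zero }
  def isZero: Boolean = _value eq _zero

  assert(!isZero)
  reset()
  assert(isZero)                              // reliable even without equals/hashCode
}
```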
(cherry picked from commit 4d5de4d303a773b1c18c350072344bd7efca9fc4) Signed-off-by: Wenchen Fan --- .../org/apache/spark/util/AccumulatorV2.scala | 6 ++++-- .../spark/util/AccumulatorV2Suite.scala | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala index 0f84ea9752cf5..2bc84953a56eb 100644 --- a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala +++ b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala @@ -486,7 +486,9 @@ class LegacyAccumulatorWrapper[R, T]( param: org.apache.spark.AccumulableParam[R, T]) extends AccumulatorV2[T, R] { private[spark] var _value = initialValue // Current value on driver - override def isZero: Boolean = _value == param.zero(initialValue) + @transient private lazy val _zero = param.zero(initialValue) + + override def isZero: Boolean = _value.asInstanceOf[AnyRef].eq(_zero.asInstanceOf[AnyRef]) override def copy(): LegacyAccumulatorWrapper[R, T] = { val acc = new LegacyAccumulatorWrapper(initialValue, param) @@ -495,7 +497,7 @@ class LegacyAccumulatorWrapper[R, T]( } override def reset(): Unit = { - _value = param.zero(initialValue) + _value = _zero } override def add(v: T): Unit = _value = param.addAccumulator(_value, v) diff --git a/core/src/test/scala/org/apache/spark/util/AccumulatorV2Suite.scala b/core/src/test/scala/org/apache/spark/util/AccumulatorV2Suite.scala index a04644d57ed88..fe0a9a471a651 100644 --- a/core/src/test/scala/org/apache/spark/util/AccumulatorV2Suite.scala +++ b/core/src/test/scala/org/apache/spark/util/AccumulatorV2Suite.scala @@ -18,6 +18,7 @@ package org.apache.spark.util import org.apache.spark._ +import org.apache.spark.serializer.JavaSerializer class AccumulatorV2Suite extends SparkFunSuite { @@ -162,4 +163,22 @@ class AccumulatorV2Suite extends SparkFunSuite { assert(acc3.isZero) assert(acc3.value === "") } + + test("LegacyAccumulatorWrapper with AccumulatorParam that has no equals/hashCode") { + class MyData(val i: Int) extends Serializable + val param = new AccumulatorParam[MyData] { + override def zero(initialValue: MyData): MyData = new MyData(0) + override def addInPlace(r1: MyData, r2: MyData): MyData = new MyData(r1.i + r2.i) + } + + val acc = new LegacyAccumulatorWrapper(new MyData(0), param) + acc.metadata = AccumulatorMetadata( + AccumulatorContext.newId(), + Some("test"), + countFailedValues = false) + AccumulatorContext.register(acc) + + val ser = new JavaSerializer(new SparkConf).newInstance() + ser.serialize(acc) + } } From f87785a76f443bcf996c60d190afc29aa2e3b6e4 Mon Sep 17 00:00:00 2001 From: Gabor Somogyi Date: Mon, 7 May 2018 14:45:14 +0800 Subject: [PATCH 434/566] [SPARK-23775][TEST] Make DataFrameRangeSuite not flaky ## What changes were proposed in this pull request? DataFrameRangeSuite.test("Cancelling stage in a query with Range.") stays sometimes in an infinite loop and times out the build. There were multiple issues with the test: 1. The first valid stageId is zero when the test started alone and not in a suite and the following code waits until timeout: ``` eventually(timeout(10.seconds), interval(1.millis)) { assert(DataFrameRangeSuite.stageToKill > 0) } ``` 2. The `DataFrameRangeSuite.stageToKill` was overwritten by the task's thread after the reset which ended up in canceling the same stage 2 times. This caused the infinite wait. 
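A hedged sketch of the listener-driven cancellation described next (a standalone helper, not the suite's code): the stage id is taken from the event itself, so no shared mutable field is needed.

```scala
import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart}

object CancelOnTaskStartSketch {
  def install(sc: SparkContext): Unit = {
    sc.addSparkListener(new SparkListener {
      override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = {
        // The event carries the stage to cancel -- no racy shared state required.
        sc.cancelStage(taskStart.stageId)
      }
    })
  }
}
```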
This PR solves this mentioned flakyness by removing the shared `DataFrameRangeSuite.stageToKill` and using `onTaskStart` where stage ID is provided. In order to make sure cancelStage called for all stages `waitUntilEmpty` is called on `ListenerBus`. In [PR20888](https://github.com/apache/spark/pull/20888) this tried to get solved by: * Stopping the executor thread with `wait` * Wait for all `cancelStage` called * Kill the executor thread by setting `SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL` but the thread killing left the shared `SparkContext` sometimes in a state where further jobs can't be submitted. As a result DataFrameRangeSuite.test("Cancelling stage in a query with Range.") test passed properly but the next test inside the suite was hanging. ## How was this patch tested? Existing unit test executed 10k times. Author: Gabor Somogyi Closes #21214 from gaborgsomogyi/SPARK-23775_1. (cherry picked from commit c5981976f1d514a3ad8a684b9a21cebe38b786fa) Signed-off-by: Wenchen Fan --- .../spark/sql/DataFrameRangeSuite.scala | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala index 57a930dfaf320..b0b46640ff317 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala @@ -23,8 +23,8 @@ import scala.util.Random import org.scalatest.concurrent.Eventually -import org.apache.spark.{SparkException, TaskContext} -import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} +import org.apache.spark.SparkException +import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext @@ -153,23 +153,17 @@ class DataFrameRangeSuite extends QueryTest with SharedSQLContext with Eventuall test("Cancelling stage in a query with Range.") { val listener = new SparkListener { - override def onJobStart(jobStart: SparkListenerJobStart): Unit = { - eventually(timeout(10.seconds), interval(1.millis)) { - assert(DataFrameRangeSuite.stageToKill > 0) - } - sparkContext.cancelStage(DataFrameRangeSuite.stageToKill) + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { + sparkContext.cancelStage(taskStart.stageId) } } sparkContext.addSparkListener(listener) for (codegen <- Seq(true, false)) { withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> codegen.toString()) { - DataFrameRangeSuite.stageToKill = -1 val ex = intercept[SparkException] { - spark.range(0, 100000000000L, 1, 1).map { x => - DataFrameRangeSuite.stageToKill = TaskContext.get().stageId() - x - }.toDF("id").agg(sum("id")).collect() + spark.range(0, 100000000000L, 1, 1) + .toDF("id").agg(sum("id")).collect() } ex.getCause() match { case null => @@ -180,6 +174,8 @@ class DataFrameRangeSuite extends QueryTest with SharedSQLContext with Eventuall fail("Expected the cause to be SparkException, got " + cause.toString() + " instead.") } } + // Wait until all ListenerBus events consumed to make sure cancelStage called for all stages + sparkContext.listenerBus.waitUntilEmpty(20.seconds.toMillis) eventually(timeout(20.seconds)) { assert(sparkContext.statusTracker.getExecutorInfos.map(_.numRunningTasks()).sum == 0) } @@ -204,7 +200,3 @@ class DataFrameRangeSuite extends QueryTest with SharedSQLContext with Eventuall } } } 
- -object DataFrameRangeSuite { - @volatile var stageToKill = -1 -} From 3a22feab4dc9f0cffe3aaec692e27ab277666507 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 7 May 2018 14:48:28 -0700 Subject: [PATCH 435/566] [SPARK-23291][SQL][R][BRANCH-2.3] R's substr should not reduce starting position by 1 when calling Scala API ## What changes were proposed in this pull request? This PR backports https://github.com/apache/spark/commit/24b5c69ee3feded439e5bb6390e4b63f503eeafe and https://github.com/apache/spark/pull/21249 There's no conflict but I opened this just to run the test and for sure. See the discussion in https://issues.apache.org/jira/browse/SPARK-23291 ## How was this patch tested? Jenkins tests. Author: hyukjinkwon Author: Liang-Chi Hsieh Closes #21250 from HyukjinKwon/SPARK-23291-backport. --- R/pkg/R/column.R | 10 ++++++++-- R/pkg/tests/fulltests/test_sparkSQL.R | 1 + docs/sparkr.md | 4 ++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 3095adb918b67..3d6d9f9746ee6 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -164,12 +164,18 @@ setMethod("alias", #' @aliases substr,Column-method #' #' @param x a Column. -#' @param start starting position. +#' @param start starting position. It should be 1-base. #' @param stop ending position. +#' @examples +#' \dontrun{ +#' df <- createDataFrame(list(list(a="abcdef"))) +#' collect(select(df, substr(df$a, 1, 4))) # the result is `abcd`. +#' collect(select(df, substr(df$a, 2, 4))) # the result is `bcd`. +#' } #' @note substr since 1.4.0 setMethod("substr", signature(x = "Column"), function(x, start, stop) { - jc <- callJMethod(x@jc, "substr", as.integer(start - 1), as.integer(stop - start + 1)) + jc <- callJMethod(x@jc, "substr", as.integer(start), as.integer(stop - start + 1)) column(jc) }) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 5197838eaac66..bed26ec6a3752 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1649,6 +1649,7 @@ test_that("string operators", { expect_false(first(select(df, startsWith(df$name, "m")))[[1]]) expect_true(first(select(df, endsWith(df$name, "el")))[[1]]) expect_equal(first(select(df, substr(df$name, 1, 2)))[[1]], "Mi") + expect_equal(first(select(df, substr(df$name, 4, 6)))[[1]], "hae") if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) { expect_true(startsWith("Hello World", "Hello")) expect_false(endsWith("Hello World", "a")) diff --git a/docs/sparkr.md b/docs/sparkr.md index 6685b585a393a..73f9424ebc1ac 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -663,3 +663,7 @@ You can inspect the search path in R with [`search()`](https://stat.ethz.ch/R-ma - The `stringsAsFactors` parameter was previously ignored with `collect`, for example, in `collect(createDataFrame(iris), stringsAsFactors = TRUE))`. It has been corrected. - For `summary`, option for statistics to compute has been added. Its output is changed from that from `describe`. - A warning can be raised if versions of SparkR package and the Spark JVM do not match. + +## Upgrading to SparkR 2.3.1 and above + + - In SparkR 2.3.0 and earlier, the `start` parameter of `substr` method was wrongly subtracted by one and considered as 0-based. This can lead to inconsistent substring results and also does not match with the behaviour with `substr` in R. In version 2.3.1 and later, it has been fixed so the `start` parameter of `substr` method is now 1-base. 
As an example, `substr(lit('abcdef'), 2, 4))` would result to `abc` in SparkR 2.3.0, and the result would be `bcd` in SparkR 2.3.1. From 4dc6719e9afdd0f0ba134bfc69f042f7796ff2cd Mon Sep 17 00:00:00 2001 From: Henry Robinson Date: Tue, 8 May 2018 12:21:33 +0800 Subject: [PATCH 436/566] [SPARK-24128][SQL] Mention configuration option in implicit CROSS JOIN error ## What changes were proposed in this pull request? Mention `spark.sql.crossJoin.enabled` in error message when an implicit `CROSS JOIN` is detected. ## How was this patch tested? `CartesianProductSuite` and `JoinSuite`. Author: Henry Robinson Closes #21201 from henryr/spark-24128. (cherry picked from commit cd12c5c3ecf28f7b04f566c2057f9b65eb456b7d) Signed-off-by: hyukjinkwon --- R/pkg/tests/fulltests/test_sparkSQL.R | 4 ++-- .../org/apache/spark/sql/catalyst/optimizer/Optimizer.scala | 6 ++++-- .../catalyst/optimizer/CheckCartesianProductsSuite.scala | 2 +- .../src/test/scala/org/apache/spark/sql/JoinSuite.scala | 4 ++-- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index bed26ec6a3752..a73811ed4a978 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -2186,8 +2186,8 @@ test_that("join(), crossJoin() and merge() on a DataFrame", { expect_equal(count(where(join(df, df2), df$name == df2$name)), 3) # cartesian join expect_error(tryCatch(count(join(df, df2)), error = function(e) { stop(e) }), - paste0(".*(org.apache.spark.sql.AnalysisException: Detected cartesian product for", - " INNER join between logical plans).*")) + paste0(".*(org.apache.spark.sql.AnalysisException: Detected implicit cartesian", + " product for INNER join between logical plans).*")) joined <- crossJoin(df, df2) expect_equal(names(joined), c("age", "name", "name", "test")) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index a28b6a0feb8f9..c77e0f82dc253 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -1122,12 +1122,14 @@ object CheckCartesianProducts extends Rule[LogicalPlan] with PredicateHelper { case j @ Join(left, right, Inner | LeftOuter | RightOuter | FullOuter, _) if isCartesianProduct(j) => throw new AnalysisException( - s"""Detected cartesian product for ${j.joinType.sql} join between logical plans + s"""Detected implicit cartesian product for ${j.joinType.sql} join between logical plans |${left.treeString(false).trim} |and |${right.treeString(false).trim} |Join condition is missing or trivial. 
- |Use the CROSS JOIN syntax to allow cartesian products between these relations.""" + |Either: use the CROSS JOIN syntax to allow cartesian products between these + |relations, or: enable implicit cartesian products by setting the configuration + |variable spark.sql.crossJoin.enabled=true""" .stripMargin) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CheckCartesianProductsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CheckCartesianProductsSuite.scala index 21220b38968e8..788fedb3c8e8e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CheckCartesianProductsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CheckCartesianProductsSuite.scala @@ -56,7 +56,7 @@ class CheckCartesianProductsSuite extends PlanTest { val thrownException = the [AnalysisException] thrownBy { performCartesianProductCheck(joinType) } - assert(thrownException.message.contains("Detected cartesian product")) + assert(thrownException.message.contains("Detected implicit cartesian product")) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index 771e1186e63ab..8fa747465cb1a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -239,7 +239,7 @@ class JoinSuite extends QueryTest with SharedSQLContext { Row(2, 2, 1, null) :: Row(2, 2, 2, 2) :: Nil) } - assert(e.getMessage.contains("Detected cartesian product for INNER join " + + assert(e.getMessage.contains("Detected implicit cartesian product for INNER join " + "between logical plans")) } } @@ -611,7 +611,7 @@ class JoinSuite extends QueryTest with SharedSQLContext { val e = intercept[Exception] { checkAnswer(sql(query), Nil); } - assert(e.getMessage.contains("Detected cartesian product")) + assert(e.getMessage.contains("Detected implicit cartesian product")) } cartesianQueries.foreach(checkCartesianDetection) From aba52f449730456d1a0fc885a80d0eec74b6e106 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 8 May 2018 14:32:04 +0800 Subject: [PATCH 437/566] [SPARK-24188][CORE] Restore "/version" API endpoint. It was missing the jax-rs annotation. Author: Marcelo Vanzin Closes #21245 from vanzin/SPARK-24188. 
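A small usage sketch, assuming a locally running application UI on port 4040 and the standard `/api/v1` REST prefix:

```scala
object VersionEndpointCheck extends App {
  // Adjust host/port for your deployment; 4040 is the default application UI port.
  val body = scala.io.Source.fromURL("http://localhost:4040/api/v1/version").mkString
  println(body)   // expected to contain the running Spark version
}
```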
Change-Id: Ib338e34b363d7c729cc92202df020dc51033b719 (cherry picked from commit 05eb19b6e09065265358eec2db2ff3b42806dfc9) Signed-off-by: jerryshao --- .../org/apache/spark/status/api/v1/ApiRootResource.scala | 1 + .../org/apache/spark/deploy/history/HistoryServerSuite.scala | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala index ed9bdc6e1e3c2..20108ac85c315 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala @@ -49,6 +49,7 @@ private[v1] class ApiRootResource extends ApiRequestContext { @Path("applications/{appId}") def application(): Class[OneApplicationResource] = classOf[OneApplicationResource] + @GET @Path("version") def version(): VersionInfo = new VersionInfo(org.apache.spark.SPARK_VERSION) diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala index 6856cfe12bae3..4c06193225368 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala @@ -294,6 +294,11 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers all (siteRelativeLinks) should startWith (uiRoot) } + test("/version api endpoint") { + val response = getUrl("version") + assert(response.contains(SPARK_VERSION)) + } + test("ajax rendered relative links are prefixed with uiRoot (spark.ui.proxyBase)") { val uiRoot = "/testwebproxybase" System.setProperty("spark.ui.proxyBase", uiRoot) From 8889d78643154a0eb5ce81363ba471a80a1e64f1 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Wed, 9 May 2018 11:32:17 -0700 Subject: [PATCH 438/566] [SPARK-24214][SS] Fix toJSON for StreamingRelationV2/StreamingExecutionRelation/ContinuousExecutionRelation ## What changes were proposed in this pull request? We should overwrite "otherCopyArgs" to provide the SparkSession parameter otherwise TreeNode.toJSON cannot get the full constructor parameter list. ## How was this patch tested? The new unit test. Author: Shixiong Zhu Closes #21275 from zsxwing/SPARK-24214. 
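A standalone sketch of the underlying Scala behaviour (a made-up class, not the streaming plan nodes): the second parameter list of a case class is invisible to `productIterator`, so any machinery that rebuilds or serializes a node from its constructor arguments has to receive that argument separately, for example through an `otherCopyArgs`-style hook.

```scala
object CurriedCaseClassSketch extends App {
  // Mimics a plan node that keeps its session in a second parameter list.
  case class Rel(name: String)(session: String) {
    def otherCopyArgs: Seq[AnyRef] = session :: Nil
  }

  val r = Rel("demo")("my-session")
  println(r.productArity)             // 1 -- only the first parameter list is visible
  println(r.productIterator.toList)   // List(demo)
  println(r.otherCopyArgs)            // List(my-session) -- must be exposed explicitly
}
```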
(cherry picked from commit fd1179c17273283d32f275d5cd5f97aaa2aca1f7) Signed-off-by: Shixiong Zhu --- .../execution/streaming/StreamingRelation.scala | 3 +++ .../spark/sql/streaming/StreamingQuerySuite.scala | 15 +++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala index f02d3a2c3733f..24195b5657e8a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala @@ -66,6 +66,7 @@ case class StreamingExecutionRelation( output: Seq[Attribute])(session: SparkSession) extends LeafNode with MultiInstanceRelation { + override def otherCopyArgs: Seq[AnyRef] = session :: Nil override def isStreaming: Boolean = true override def toString: String = source.toString @@ -97,6 +98,7 @@ case class StreamingRelationV2( output: Seq[Attribute], v1Relation: Option[StreamingRelation])(session: SparkSession) extends LeafNode with MultiInstanceRelation { + override def otherCopyArgs: Seq[AnyRef] = session :: Nil override def isStreaming: Boolean = true override def toString: String = sourceName @@ -116,6 +118,7 @@ case class ContinuousExecutionRelation( output: Seq[Attribute])(session: SparkSession) extends LeafNode with MultiInstanceRelation { + override def otherCopyArgs: Seq[AnyRef] = session :: Nil override def isStreaming: Boolean = true override def toString: String = source.toString diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index 2b0ab33f3e96c..e3429b58dceec 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -687,6 +687,21 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging wi CheckLastBatch(("A", 1))) } + test("StreamingRelationV2/StreamingExecutionRelation/ContinuousExecutionRelation.toJSON " + + "should not fail") { + val df = spark.readStream.format("rate").load() + assert(df.logicalPlan.toJSON.contains("StreamingRelationV2")) + + testStream(df)( + AssertOnQuery(_.logicalPlan.toJSON.contains("StreamingExecutionRelation")) + ) + + testStream(df, useV2Sink = true)( + StartStream(trigger = Trigger.Continuous(100)), + AssertOnQuery(_.logicalPlan.toJSON.contains("ContinuousExecutionRelation")) + ) + } + /** Create a streaming DF that only execute one batch in which it returns the given static DF */ private def createSingleTriggerStreamingDF(triggerDF: DataFrame): DataFrame = { require(!triggerDF.isStreaming) From eab10f9945c1d01daa45c233a39dedfd184f543c Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 11 May 2018 00:28:43 +0800 Subject: [PATCH 439/566] [SPARK-24068][BACKPORT-2.3] Propagating DataFrameReader's options to Text datasource on schema inferring ## What changes were proposed in this pull request? 
While reading CSV or JSON files, DataFrameReader's options are converted to Hadoop's parameters, for example there: https://github.com/apache/spark/blob/branch-2.3/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L302 but the options are not propagated to Text datasource on schema inferring, for instance: https://github.com/apache/spark/blob/branch-2.3/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala#L184-L188 The PR proposes propagation of user's options to Text datasource on scheme inferring in similar way as user's options are converted to Hadoop parameters if schema is specified. ## How was this patch tested? The changes were tested manually by using https://github.com/twitter/hadoop-lzo: ``` hadoop-lzo> mvn clean package hadoop-lzo> ln -s ./target/hadoop-lzo-0.4.21-SNAPSHOT.jar ./hadoop-lzo.jar ``` Create 2 test files in JSON and CSV format and compress them: ```shell $ cat test.csv col1|col2 a|1 $ lzop test.csv $ cat test.json {"col1":"a","col2":1} $ lzop test.json ``` Run `spark-shell` with hadoop-lzo: ``` bin/spark-shell --jars ~/hadoop-lzo/hadoop-lzo.jar ``` reading compressed CSV and JSON without schema: ```scala spark.read.option("io.compression.codecs", "com.hadoop.compression.lzo.LzopCodec").option("inferSchema",true).option("header",true).option("sep","|").csv("test.csv.lzo").show() +----+----+ |col1|col2| +----+----+ | a| 1| +----+----+ ``` ```scala spark.read.option("io.compression.codecs", "com.hadoop.compression.lzo.LzopCodec").option("multiLine", true).json("test.json.lzo").printSchema root |-- col1: string (nullable = true) |-- col2: long (nullable = true) ``` Author: Maxim Gekk Author: Maxim Gekk Closes #21292 from MaxGekk/text-options-backport-v2.3. --- .../spark/sql/catalyst/json/JSONOptions.scala | 2 +- .../datasources/csv/CSVDataSource.scala | 6 ++++-- .../execution/datasources/csv/CSVOptions.scala | 2 +- .../datasources/csv/UnivocityParser.scala | 2 -- .../datasources/json/JsonDataSource.scala | 16 ++++++++++------ 5 files changed, 16 insertions(+), 12 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala index 652412b34478a..190fcc605d043 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql.catalyst.util._ * Most of these map directly to Jackson's internal options, specified in [[JsonParser.Feature]]. 
*/ private[sql] class JSONOptions( - @transient private val parameters: CaseInsensitiveMap[String], + @transient val parameters: CaseInsensitiveMap[String], defaultTimeZoneId: String, defaultColumnNameOfCorruptRecord: String) extends Logging with Serializable { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 4870d75fc5f08..fffad1753513a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -184,7 +184,8 @@ object TextInputCSVDataSource extends CSVDataSource { DataSource.apply( sparkSession, paths = paths, - className = classOf[TextFileFormat].getName + className = classOf[TextFileFormat].getName, + options = options.parameters ).resolveRelation(checkFilesExist = false)) .select("value").as[String](Encoders.STRING) } else { @@ -248,7 +249,8 @@ object MultiLineCSVDataSource extends CSVDataSource { options: CSVOptions): RDD[PortableDataStream] = { val paths = inputPaths.map(_.getPath) val name = paths.mkString(",") - val job = Job.getInstance(sparkSession.sessionState.newHadoopConf()) + val job = Job.getInstance(sparkSession.sessionState.newHadoopConfWithOptions( + options.parameters)) FileInputFormat.setInputPaths(job, paths: _*) val conf = job.getConfiguration diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala index c16790630ce17..6347af619d46a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala @@ -27,7 +27,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.util._ class CSVOptions( - @transient private val parameters: CaseInsensitiveMap[String], + @transient val parameters: CaseInsensitiveMap[String], defaultTimeZoneId: String, defaultColumnNameOfCorruptRecord: String) extends Logging with Serializable { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index 3d6cc30f2ba83..99557a1ceb0c8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql.execution.datasources.csv import java.io.InputStream import java.math.BigDecimal -import java.text.NumberFormat -import java.util.Locale import scala.util.Try import scala.util.control.NonFatal diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala index 77e7edc8e7a20..8a0fe5374b912 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala @@ -92,7 +92,7 @@ object TextInputJsonDataSource extends JsonDataSource { sparkSession: SparkSession, inputPaths: Seq[FileStatus], parsedOptions: JSONOptions): StructType = { - 
val json: Dataset[String] = createBaseDataset(sparkSession, inputPaths) + val json: Dataset[String] = createBaseDataset(sparkSession, inputPaths, parsedOptions) inferFromDataset(json, parsedOptions) } @@ -104,13 +104,15 @@ object TextInputJsonDataSource extends JsonDataSource { private def createBaseDataset( sparkSession: SparkSession, - inputPaths: Seq[FileStatus]): Dataset[String] = { + inputPaths: Seq[FileStatus], + parsedOptions: JSONOptions): Dataset[String] = { val paths = inputPaths.map(_.getPath.toString) sparkSession.baseRelationToDataFrame( DataSource.apply( sparkSession, paths = paths, - className = classOf[TextFileFormat].getName + className = classOf[TextFileFormat].getName, + options = parsedOptions.parameters ).resolveRelation(checkFilesExist = false)) .select("value").as(Encoders.STRING) } @@ -144,16 +146,18 @@ object MultiLineJsonDataSource extends JsonDataSource { sparkSession: SparkSession, inputPaths: Seq[FileStatus], parsedOptions: JSONOptions): StructType = { - val json: RDD[PortableDataStream] = createBaseRdd(sparkSession, inputPaths) + val json: RDD[PortableDataStream] = createBaseRdd(sparkSession, inputPaths, parsedOptions) val sampled: RDD[PortableDataStream] = JsonUtils.sample(json, parsedOptions) JsonInferSchema.infer(sampled, parsedOptions, createParser) } private def createBaseRdd( sparkSession: SparkSession, - inputPaths: Seq[FileStatus]): RDD[PortableDataStream] = { + inputPaths: Seq[FileStatus], + parsedOptions: JSONOptions): RDD[PortableDataStream] = { val paths = inputPaths.map(_.getPath) - val job = Job.getInstance(sparkSession.sessionState.newHadoopConf()) + val job = Job.getInstance(sparkSession.sessionState.newHadoopConfWithOptions( + parsedOptions.parameters)) val conf = job.getConfiguration val name = paths.mkString(",") FileInputFormat.setInputPaths(job, paths: _*) From 323dc3ad02e63a7c99b5bd6da618d6020657ecba Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 13 Apr 2018 14:28:24 -0700 Subject: [PATCH 440/566] [PYSPARK] Update py4j to version 0.10.7. 
(cherry picked from commit cc613b552e753d03cb62661591de59e1c8d82c74) Signed-off-by: Marcelo Vanzin --- LICENSE | 2 +- bin/pyspark | 6 +- bin/pyspark2.cmd | 2 +- core/pom.xml | 2 +- .../org/apache/spark/SecurityManager.scala | 11 +- .../api/python/PythonGatewayServer.scala | 50 ++++++--- .../apache/spark/api/python/PythonRDD.scala | 29 +++-- .../apache/spark/api/python/PythonUtils.scala | 2 +- .../api/python/PythonWorkerFactory.scala | 21 ++-- .../apache/spark/deploy/PythonRunner.scala | 12 ++- .../spark/internal/config/package.scala | 5 + .../spark/security/SocketAuthHelper.scala | 101 ++++++++++++++++++ .../scala/org/apache/spark/util/Utils.scala | 12 +++ .../security/SocketAuthHelperSuite.scala | 97 +++++++++++++++++ dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- dev/run-pip-tests | 2 +- python/README.md | 2 +- python/docs/Makefile | 2 +- python/lib/py4j-0.10.6-src.zip | Bin 80352 -> 0 bytes python/lib/py4j-0.10.7-src.zip | Bin 0 -> 42437 bytes python/pyspark/context.py | 4 +- python/pyspark/daemon.py | 21 +++- python/pyspark/java_gateway.py | 93 +++++++++------- python/pyspark/rdd.py | 21 ++-- python/pyspark/sql/dataframe.py | 12 +-- python/pyspark/worker.py | 7 +- python/setup.py | 2 +- .../org/apache/spark/deploy/yarn/Client.scala | 2 +- .../spark/deploy/yarn/YarnClusterSuite.scala | 2 +- sbin/spark-config.sh | 2 +- .../scala/org/apache/spark/sql/Dataset.scala | 6 +- 32 files changed, 418 insertions(+), 116 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/security/SocketAuthHelper.scala create mode 100644 core/src/test/scala/org/apache/spark/security/SocketAuthHelperSuite.scala delete mode 100644 python/lib/py4j-0.10.6-src.zip create mode 100644 python/lib/py4j-0.10.7-src.zip diff --git a/LICENSE b/LICENSE index c2b0d72663b55..820f14dbdeed0 100644 --- a/LICENSE +++ b/LICENSE @@ -263,7 +263,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt. (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf) (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net) (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net) - (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.6 - http://py4j.sourceforge.net/) + (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.7 - http://py4j.sourceforge.net/) (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/) (BSD licence) sbt and sbt-launch-lib.bash (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE) diff --git a/bin/pyspark b/bin/pyspark index dd286277c1fc1..5d5affb1f97c3 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -25,14 +25,14 @@ source "${SPARK_HOME}"/bin/load-spark-env.sh export _SPARK_CMD_USAGE="Usage: ./bin/pyspark [options]" # In Spark 2.0, IPYTHON and IPYTHON_OPTS are removed and pyspark fails to launch if either option -# is set in the user's environment. Instead, users should set PYSPARK_DRIVER_PYTHON=ipython +# is set in the user's environment. Instead, users should set PYSPARK_DRIVER_PYTHON=ipython # to use IPython and set PYSPARK_DRIVER_PYTHON_OPTS to pass options when starting the Python driver # (e.g. PYSPARK_DRIVER_PYTHON_OPTS='notebook'). This supports full customization of the IPython # and executor Python executables. 
# Fail noisily if removed options are set if [[ -n "$IPYTHON" || -n "$IPYTHON_OPTS" ]]; then - echo "Error in pyspark startup:" + echo "Error in pyspark startup:" echo "IPYTHON and IPYTHON_OPTS are removed in Spark 2.0+. Remove these from the environment and set PYSPARK_DRIVER_PYTHON and PYSPARK_DRIVER_PYTHON_OPTS instead." exit 1 fi @@ -57,7 +57,7 @@ export PYSPARK_PYTHON # Add the PySpark classes to the Python path: export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH" -export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.6-src.zip:$PYTHONPATH" +export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.7-src.zip:$PYTHONPATH" # Load the PySpark shell.py script when ./pyspark is used interactively: export OLD_PYTHONSTARTUP="$PYTHONSTARTUP" diff --git a/bin/pyspark2.cmd b/bin/pyspark2.cmd index 663670f2fddaf..15fa910c277b3 100644 --- a/bin/pyspark2.cmd +++ b/bin/pyspark2.cmd @@ -30,7 +30,7 @@ if "x%PYSPARK_DRIVER_PYTHON%"=="x" ( ) set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH% -set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.6-src.zip;%PYTHONPATH% +set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.7-src.zip;%PYTHONPATH% set OLD_PYTHONSTARTUP=%PYTHONSTARTUP% set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py diff --git a/core/pom.xml b/core/pom.xml index c34c9df39063b..c7aa9a4aac409 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -344,7 +344,7 @@ net.sf.py4j py4j - 0.10.6 + 0.10.7 org.apache.spark diff --git a/core/src/main/scala/org/apache/spark/SecurityManager.scala b/core/src/main/scala/org/apache/spark/SecurityManager.scala index 1d4a071f36506..00cebc4e6bcbe 100644 --- a/core/src/main/scala/org/apache/spark/SecurityManager.scala +++ b/core/src/main/scala/org/apache/spark/SecurityManager.scala @@ -17,14 +17,12 @@ package org.apache.spark -import java.lang.{Byte => JByte} import java.net.{Authenticator, PasswordAuthentication} import java.nio.charset.StandardCharsets.UTF_8 -import java.security.{KeyStore, SecureRandom} +import java.security.KeyStore import java.security.cert.X509Certificate import javax.net.ssl._ -import com.google.common.hash.HashCodes import com.google.common.io.Files import org.apache.hadoop.io.Text import org.apache.hadoop.security.{Credentials, UserGroupInformation} @@ -542,13 +540,8 @@ private[spark] class SecurityManager( return } - val rnd = new SecureRandom() - val length = sparkConf.getInt("spark.authenticate.secretBitLength", 256) / JByte.SIZE - val secretBytes = new Array[Byte](length) - rnd.nextBytes(secretBytes) - + secretKey = Utils.createSecret(sparkConf) val creds = new Credentials() - secretKey = HashCodes.fromBytes(secretBytes).toString() creds.addSecretKey(SECRET_LOOKUP_KEY, secretKey.getBytes(UTF_8)) UserGroupInformation.getCurrentUser().addCredentials(creds) } diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonGatewayServer.scala b/core/src/main/scala/org/apache/spark/api/python/PythonGatewayServer.scala index 11f2432575d84..9ddc4a4910180 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonGatewayServer.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonGatewayServer.scala @@ -17,26 +17,39 @@ package org.apache.spark.api.python -import java.io.DataOutputStream -import java.net.Socket +import java.io.{DataOutputStream, File, FileOutputStream} +import java.net.InetAddress +import java.nio.charset.StandardCharsets.UTF_8 +import java.nio.file.Files import py4j.GatewayServer +import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.util.Utils /** - * Process that 
starts a Py4J GatewayServer on an ephemeral port and communicates the bound port - * back to its caller via a callback port specified by the caller. + * Process that starts a Py4J GatewayServer on an ephemeral port. * * This process is launched (via SparkSubmit) by the PySpark driver (see java_gateway.py). */ private[spark] object PythonGatewayServer extends Logging { initializeLogIfNecessary(true) - def main(args: Array[String]): Unit = Utils.tryOrExit { - // Start a GatewayServer on an ephemeral port - val gatewayServer: GatewayServer = new GatewayServer(null, 0) + def main(args: Array[String]): Unit = { + val secret = Utils.createSecret(new SparkConf()) + + // Start a GatewayServer on an ephemeral port. Make sure the callback client is configured + // with the same secret, in case the app needs callbacks from the JVM to the underlying + // python processes. + val localhost = InetAddress.getLoopbackAddress() + val gatewayServer: GatewayServer = new GatewayServer.GatewayServerBuilder() + .authToken(secret) + .javaPort(0) + .javaAddress(localhost) + .callbackClient(GatewayServer.DEFAULT_PYTHON_PORT, localhost, secret) + .build() + gatewayServer.start() val boundPort: Int = gatewayServer.getListeningPort if (boundPort == -1) { @@ -46,15 +59,24 @@ private[spark] object PythonGatewayServer extends Logging { logDebug(s"Started PythonGatewayServer on port $boundPort") } - // Communicate the bound port back to the caller via the caller-specified callback port - val callbackHost = sys.env("_PYSPARK_DRIVER_CALLBACK_HOST") - val callbackPort = sys.env("_PYSPARK_DRIVER_CALLBACK_PORT").toInt - logDebug(s"Communicating GatewayServer port to Python driver at $callbackHost:$callbackPort") - val callbackSocket = new Socket(callbackHost, callbackPort) - val dos = new DataOutputStream(callbackSocket.getOutputStream) + // Communicate the connection information back to the python process by writing the + // information in the requested file. This needs to match the read side in java_gateway.py. 
+ val connectionInfoPath = new File(sys.env("_PYSPARK_DRIVER_CONN_INFO_PATH")) + val tmpPath = Files.createTempFile(connectionInfoPath.getParentFile().toPath(), + "connection", ".info").toFile() + + val dos = new DataOutputStream(new FileOutputStream(tmpPath)) dos.writeInt(boundPort) + + val secretBytes = secret.getBytes(UTF_8) + dos.writeInt(secretBytes.length) + dos.write(secretBytes, 0, secretBytes.length) dos.close() - callbackSocket.close() + + if (!tmpPath.renameTo(connectionInfoPath)) { + logError(s"Unable to write connection information to $connectionInfoPath.") + System.exit(1) + } // Exit on EOF or broken pipe to ensure that this process dies when the Python driver dies: while (System.in.read() != -1) { diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index f6293c0dc5091..a1ee2f7d1b119 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -38,6 +38,7 @@ import org.apache.spark.broadcast.Broadcast import org.apache.spark.input.PortableDataStream import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD +import org.apache.spark.security.SocketAuthHelper import org.apache.spark.util._ @@ -107,6 +108,12 @@ private[spark] object PythonRDD extends Logging { // remember the broadcasts sent to each worker private val workerBroadcasts = new mutable.WeakHashMap[Socket, mutable.Set[Long]]() + // Authentication helper used when serving iterator data. + private lazy val authHelper = { + val conf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf()) + new SocketAuthHelper(conf) + } + def getWorkerBroadcasts(worker: Socket): mutable.Set[Long] = { synchronized { workerBroadcasts.getOrElseUpdate(worker, new mutable.HashSet[Long]()) @@ -129,12 +136,13 @@ private[spark] object PythonRDD extends Logging { * (effectively a collect()), but allows you to run on a certain subset of partitions, * or to enable local execution. * - * @return the port number of a local socket which serves the data collected from this job. + * @return 2-tuple (as a Java array) with the port number of a local socket which serves the + * data collected from this job, and the secret for authentication. */ def runJob( sc: SparkContext, rdd: JavaRDD[Array[Byte]], - partitions: JArrayList[Int]): Int = { + partitions: JArrayList[Int]): Array[Any] = { type ByteArray = Array[Byte] type UnrolledPartition = Array[ByteArray] val allPartitions: Array[UnrolledPartition] = @@ -147,13 +155,14 @@ private[spark] object PythonRDD extends Logging { /** * A helper function to collect an RDD as an iterator, then serve it via socket. * - * @return the port number of a local socket which serves the data collected from this job. + * @return 2-tuple (as a Java array) with the port number of a local socket which serves the + * data collected from this job, and the secret for authentication. */ - def collectAndServe[T](rdd: RDD[T]): Int = { + def collectAndServe[T](rdd: RDD[T]): Array[Any] = { serveIterator(rdd.collect().iterator, s"serve RDD ${rdd.id}") } - def toLocalIteratorAndServe[T](rdd: RDD[T]): Int = { + def toLocalIteratorAndServe[T](rdd: RDD[T]): Array[Any] = { serveIterator(rdd.toLocalIterator, s"serve toLocalIterator") } @@ -384,8 +393,11 @@ private[spark] object PythonRDD extends Logging { * and send them into this connection. * * The thread will terminate after all the data are sent or any exceptions happen. 
+ * + * @return 2-tuple (as a Java array) with the port number of a local socket which serves the + * data collected from this job, and the secret for authentication. */ - def serveIterator[T](items: Iterator[T], threadName: String): Int = { + def serveIterator(items: Iterator[_], threadName: String): Array[Any] = { val serverSocket = new ServerSocket(0, 1, InetAddress.getByName("localhost")) // Close the socket if no connection in 15 seconds serverSocket.setSoTimeout(15000) @@ -395,11 +407,14 @@ private[spark] object PythonRDD extends Logging { override def run() { try { val sock = serverSocket.accept() + authHelper.authClient(sock) + val out = new DataOutputStream(new BufferedOutputStream(sock.getOutputStream)) Utils.tryWithSafeFinally { writeIteratorToStream(items, out) } { out.close() + sock.close() } } catch { case NonFatal(e) => @@ -410,7 +425,7 @@ private[spark] object PythonRDD extends Logging { } }.start() - serverSocket.getLocalPort + Array(serverSocket.getLocalPort, authHelper.secret) } private def getMergedConf(confAsMap: java.util.HashMap[String, String], diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala index 92e228a9dd10c..27a5e19f96a14 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala @@ -32,7 +32,7 @@ private[spark] object PythonUtils { val pythonPath = new ArrayBuffer[String] for (sparkHome <- sys.env.get("SPARK_HOME")) { pythonPath += Seq(sparkHome, "python", "lib", "pyspark.zip").mkString(File.separator) - pythonPath += Seq(sparkHome, "python", "lib", "py4j-0.10.6-src.zip").mkString(File.separator) + pythonPath += Seq(sparkHome, "python", "lib", "py4j-0.10.7-src.zip").mkString(File.separator) } pythonPath ++= SparkContext.jarOfObject(this) pythonPath.mkString(File.pathSeparator) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala b/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala index f53c6178047f5..949aa445537a6 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala @@ -27,6 +27,7 @@ import scala.collection.mutable import org.apache.spark._ import org.apache.spark.internal.Logging +import org.apache.spark.security.SocketAuthHelper import org.apache.spark.util.{RedirectThread, Utils} private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String, String]) @@ -45,6 +46,9 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String !System.getProperty("os.name").startsWith("Windows") && useDaemonEnabled } + + private val authHelper = new SocketAuthHelper(SparkEnv.get.conf) + var daemon: Process = null val daemonHost = InetAddress.getByAddress(Array(127, 0, 0, 1)) var daemonPort: Int = 0 @@ -85,6 +89,8 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String if (pid < 0) { throw new IllegalStateException("Python daemon failed to launch worker with code " + pid) } + + authHelper.authToServer(socket) daemonWorkers.put(socket, pid) socket } @@ -122,25 +128,24 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String workerEnv.put("PYTHONPATH", pythonPath) // This is equivalent to setting the -u flag; we use it because ipython doesn't support -u: workerEnv.put("PYTHONUNBUFFERED", "YES") + 
workerEnv.put("PYTHON_WORKER_FACTORY_PORT", serverSocket.getLocalPort.toString) + workerEnv.put("PYTHON_WORKER_FACTORY_SECRET", authHelper.secret) val worker = pb.start() // Redirect worker stdout and stderr redirectStreamsToStderr(worker.getInputStream, worker.getErrorStream) - // Tell the worker our port - val out = new OutputStreamWriter(worker.getOutputStream, StandardCharsets.UTF_8) - out.write(serverSocket.getLocalPort + "\n") - out.flush() - - // Wait for it to connect to our socket + // Wait for it to connect to our socket, and validate the auth secret. serverSocket.setSoTimeout(10000) + try { val socket = serverSocket.accept() + authHelper.authClient(socket) simpleWorkers.put(socket, worker) return socket } catch { case e: Exception => - throw new SparkException("Python worker did not connect back in time", e) + throw new SparkException("Python worker failed to connect back.", e) } } finally { if (serverSocket != null) { @@ -163,6 +168,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String val workerEnv = pb.environment() workerEnv.putAll(envVars.asJava) workerEnv.put("PYTHONPATH", pythonPath) + workerEnv.put("PYTHON_WORKER_FACTORY_SECRET", authHelper.secret) // This is equivalent to setting the -u flag; we use it because ipython doesn't support -u: workerEnv.put("PYTHONUNBUFFERED", "YES") daemon = pb.start() @@ -172,7 +178,6 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String // Redirect daemon stdout and stderr redirectStreamsToStderr(in, daemon.getErrorStream) - } catch { case e: Exception => diff --git a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala index 7aca305783a7f..1b7e031ee0678 100644 --- a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala @@ -18,7 +18,7 @@ package org.apache.spark.deploy import java.io.File -import java.net.URI +import java.net.{InetAddress, URI} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer @@ -39,6 +39,7 @@ object PythonRunner { val pyFiles = args(1) val otherArgs = args.slice(2, args.length) val sparkConf = new SparkConf() + val secret = Utils.createSecret(sparkConf) val pythonExec = sparkConf.get(PYSPARK_DRIVER_PYTHON) .orElse(sparkConf.get(PYSPARK_PYTHON)) .orElse(sys.env.get("PYSPARK_DRIVER_PYTHON")) @@ -51,7 +52,13 @@ object PythonRunner { // Launch a Py4J gateway server for the process to connect to; this will let it see our // Java system properties and such - val gatewayServer = new py4j.GatewayServer(null, 0) + val localhost = InetAddress.getLoopbackAddress() + val gatewayServer = new py4j.GatewayServer.GatewayServerBuilder() + .authToken(secret) + .javaPort(0) + .javaAddress(localhost) + .callbackClient(py4j.GatewayServer.DEFAULT_PYTHON_PORT, localhost, secret) + .build() val thread = new Thread(new Runnable() { override def run(): Unit = Utils.logUncaughtExceptions { gatewayServer.start() @@ -82,6 +89,7 @@ object PythonRunner { // This is equivalent to setting the -u flag; we use it because ipython doesn't support -u: env.put("PYTHONUNBUFFERED", "YES") // value is needed to be set to a non-empty string env.put("PYSPARK_GATEWAY_PORT", "" + gatewayServer.getListeningPort) + env.put("PYSPARK_GATEWAY_SECRET", secret) // pass conf spark.pyspark.python to python process, the only way to pass info to // python process is through environment variable. 
sparkConf.get(PYSPARK_PYTHON).foreach(env.put("PYSPARK_PYTHON", _)) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index bbfcfbaa7363c..b180a40239f9c 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -342,6 +342,11 @@ package object config { .regexConf .createOptional + private[spark] val AUTH_SECRET_BIT_LENGTH = + ConfigBuilder("spark.authenticate.secretBitLength") + .intConf + .createWithDefault(256) + private[spark] val NETWORK_AUTH_ENABLED = ConfigBuilder("spark.authenticate") .booleanConf diff --git a/core/src/main/scala/org/apache/spark/security/SocketAuthHelper.scala b/core/src/main/scala/org/apache/spark/security/SocketAuthHelper.scala new file mode 100644 index 0000000000000..d15e7937b0523 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/security/SocketAuthHelper.scala @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.security + +import java.io.{DataInputStream, DataOutputStream, InputStream} +import java.net.Socket +import java.nio.charset.StandardCharsets.UTF_8 + +import org.apache.spark.SparkConf +import org.apache.spark.network.util.JavaUtils +import org.apache.spark.util.Utils + +/** + * A class that can be used to add a simple authentication protocol to socket-based communication. + * + * The protocol is simple: an auth secret is written to the socket, and the other side checks the + * secret and writes either "ok" or "err" to the output. If authentication fails, the socket is + * not expected to be valid anymore. + * + * There's no secrecy, so this relies on the sockets being either local or somehow encrypted. + */ +private[spark] class SocketAuthHelper(conf: SparkConf) { + + val secret = Utils.createSecret(conf) + + /** + * Read the auth secret from the socket and compare to the expected value. Write the reply back + * to the socket. + * + * If authentication fails, this method will close the socket. + * + * @param s The client socket. + * @throws IllegalArgumentException If authentication fails. + */ + def authClient(s: Socket): Unit = { + // Set the socket timeout while checking the auth secret. Reset it before returning. + val currentTimeout = s.getSoTimeout() + try { + s.setSoTimeout(10000) + val clientSecret = readUtf8(s) + if (secret == clientSecret) { + writeUtf8("ok", s) + } else { + writeUtf8("err", s) + JavaUtils.closeQuietly(s) + } + } finally { + s.setSoTimeout(currentTimeout) + } + } + + /** + * Authenticate with a server by writing the auth secret and checking the server's reply. + * + * If authentication fails, this method will close the socket. 
+ * + * @param s The socket connected to the server. + * @throws IllegalArgumentException If authentication fails. + */ + def authToServer(s: Socket): Unit = { + writeUtf8(secret, s) + + val reply = readUtf8(s) + if (reply != "ok") { + JavaUtils.closeQuietly(s) + throw new IllegalArgumentException("Authentication failed.") + } + } + + protected def readUtf8(s: Socket): String = { + val din = new DataInputStream(s.getInputStream()) + val len = din.readInt() + val bytes = new Array[Byte](len) + din.readFully(bytes) + new String(bytes, UTF_8) + } + + protected def writeUtf8(str: String, s: Socket): Unit = { + val bytes = str.getBytes(UTF_8) + val dout = new DataOutputStream(s.getOutputStream()) + dout.writeInt(bytes.length) + dout.write(bytes, 0, bytes.length) + dout.flush() + } + +} diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 5853302973140..12d0934be9204 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -18,6 +18,7 @@ package org.apache.spark.util import java.io._ +import java.lang.{Byte => JByte} import java.lang.management.{LockInfo, ManagementFactory, MonitorInfo, ThreadInfo} import java.lang.reflect.InvocationTargetException import java.math.{MathContext, RoundingMode} @@ -26,6 +27,7 @@ import java.nio.ByteBuffer import java.nio.channels.{Channels, FileChannel} import java.nio.charset.StandardCharsets import java.nio.file.{Files, Paths} +import java.security.SecureRandom import java.util.{Locale, Properties, Random, UUID} import java.util.concurrent._ import java.util.concurrent.atomic.AtomicBoolean @@ -44,6 +46,7 @@ import scala.util.matching.Regex import _root_.io.netty.channel.unix.Errors.NativeIoException import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache} +import com.google.common.hash.HashCodes import com.google.common.io.{ByteStreams, Files => GFiles} import com.google.common.net.InetAddresses import org.apache.commons.lang3.SystemUtils @@ -2805,6 +2808,15 @@ private[spark] object Utils extends Logging { s"k8s://$resolvedURL" } + + def createSecret(conf: SparkConf): String = { + val bits = conf.get(AUTH_SECRET_BIT_LENGTH) + val rnd = new SecureRandom() + val secretBytes = new Array[Byte](bits / JByte.SIZE) + rnd.nextBytes(secretBytes) + HashCodes.fromBytes(secretBytes).toString() + } + } private[util] object CallerContext extends Logging { diff --git a/core/src/test/scala/org/apache/spark/security/SocketAuthHelperSuite.scala b/core/src/test/scala/org/apache/spark/security/SocketAuthHelperSuite.scala new file mode 100644 index 0000000000000..e57cb701b6284 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/security/SocketAuthHelperSuite.scala @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.security + +import java.io.Closeable +import java.net._ + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.internal.config._ +import org.apache.spark.util.Utils + +class SocketAuthHelperSuite extends SparkFunSuite { + + private val conf = new SparkConf() + private val authHelper = new SocketAuthHelper(conf) + + test("successful auth") { + Utils.tryWithResource(new ServerThread()) { server => + Utils.tryWithResource(server.createClient()) { client => + authHelper.authToServer(client) + server.close() + server.join() + assert(server.error == null) + assert(server.authenticated) + } + } + } + + test("failed auth") { + Utils.tryWithResource(new ServerThread()) { server => + Utils.tryWithResource(server.createClient()) { client => + val badHelper = new SocketAuthHelper(new SparkConf().set(AUTH_SECRET_BIT_LENGTH, 128)) + intercept[IllegalArgumentException] { + badHelper.authToServer(client) + } + server.close() + server.join() + assert(server.error != null) + assert(!server.authenticated) + } + } + } + + private class ServerThread extends Thread with Closeable { + + private val ss = new ServerSocket() + ss.bind(new InetSocketAddress(InetAddress.getLoopbackAddress(), 0)) + + @volatile var error: Exception = _ + @volatile var authenticated = false + + setDaemon(true) + start() + + def createClient(): Socket = { + new Socket(InetAddress.getLoopbackAddress(), ss.getLocalPort()) + } + + override def run(): Unit = { + var clientConn: Socket = null + try { + clientConn = ss.accept() + authHelper.authClient(clientConn) + authenticated = true + } catch { + case e: Exception => + error = e + } finally { + Option(clientConn).foreach(_.close()) + } + } + + override def close(): Unit = { + try { + ss.close() + } finally { + interrupt() + } + } + + } + +} diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 3ded7aea11ebc..577bf43a1340a 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -169,7 +169,7 @@ parquet-hadoop-1.8.2.jar parquet-hadoop-bundle-1.6.0.jar parquet-jackson-1.8.2.jar protobuf-java-2.5.0.jar -py4j-0.10.6.jar +py4j-0.10.7.jar pyrolite-4.13.jar scala-compiler-2.11.8.jar scala-library-2.11.8.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 344f22d2360e5..304982e92e093 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -170,7 +170,7 @@ parquet-hadoop-1.8.2.jar parquet-hadoop-bundle-1.6.0.jar parquet-jackson-1.8.2.jar protobuf-java-2.5.0.jar -py4j-0.10.6.jar +py4j-0.10.7.jar pyrolite-4.13.jar scala-compiler-2.11.8.jar scala-library-2.11.8.jar diff --git a/dev/run-pip-tests b/dev/run-pip-tests index d51dde12a03c5..03fc83298dc2f 100755 --- a/dev/run-pip-tests +++ b/dev/run-pip-tests @@ -89,7 +89,7 @@ for python in "${PYTHON_EXECS[@]}"; do source "$VIRTUALENV_PATH"/bin/activate fi # Upgrade pip & friends if using virutal env - if [ ! -n "USE_CONDA" ]; then + if [ ! 
-n "$USE_CONDA" ]; then pip install --upgrade pip pypandoc wheel numpy fi diff --git a/python/README.md b/python/README.md index 3f17fdb98a081..61d2abf61d261 100644 --- a/python/README.md +++ b/python/README.md @@ -29,4 +29,4 @@ The Python packaging for Spark is not intended to replace all of the other use c ## Python Requirements -At its core PySpark depends on Py4J (currently version 0.10.6), but some additional sub-packages have their own extra requirements for some features (including numpy, pandas, and pyarrow). +At its core PySpark depends on Py4J (currently version 0.10.7), but some additional sub-packages have their own extra requirements for some features (including numpy, pandas, and pyarrow). diff --git a/python/docs/Makefile b/python/docs/Makefile index 09898f29950ed..b8e079483c90c 100644 --- a/python/docs/Makefile +++ b/python/docs/Makefile @@ -7,7 +7,7 @@ SPHINXBUILD ?= sphinx-build PAPER ?= BUILDDIR ?= _build -export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.10.6-src.zip) +export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.10.7-src.zip) # User-friendly check for sphinx-build ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) diff --git a/python/lib/py4j-0.10.6-src.zip b/python/lib/py4j-0.10.6-src.zip deleted file mode 100644 index 2f8edcc0c0b886669460642aa650a1b642367439..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 80352 zcmafZ1B@`;wq@J4ZM*wx+qP}nwr$(CZQC}!wmtuSnLBspB`;ISE+lnMcBQhk0v&QGOsvq7S?-^UssrKY{S?;d*)&wieEMdUW<4|230^y}AYwy4HW6Nl6u= zoaJ5-LR@-QR$A^vl6rDZB|J`!nwFAQGA2%Ke42Kgo=QPnnre2Al2#%n?2d5qNx#vg zLS!W4-9uaZ{$8F;zKm8{9bO_C`oE>MI*Aq63km=LhxLD@WoTezWpC%`{QpU7MO`Xx ziw&XoQw?Flz@#5B5@*v;0sWj>WS)p39@G;8D1fR(qEen@k{AUt3$zi z;{dihzth9IJM2krlN>HW3%ZU@f)zh#x}qz1I%iKBe-(PUFCT?dhT0aD|MeuiGxy4R>IH0`W_ZMf4)j+w+$g z&DpJ|DXf0EK#%-2Xw7m&$!gLmp+z{0+|j)9=N-Q-kysDt#|9;^#5;m^kYh*zh&4#| z4df7CH$H;4FfyEB7=l^*M|dV=3w8Kc42583#*=jtV%zL*ueu&U2Oo z&)ds{ur9liY+7?~5wuIlOP?FV65|eTeC@3ZEeknNdgMEe>Yr-~rnGgV#w^%xwuzI{{u3=pNbvv`QwzoNZN_kuvWq3t?@y$ z8MEXqc4>Y2dgo}^sFQb_jN``W;U}$r=9)Y3S(74`h~W{;EmYZKnY^q{u~4q{4VQf~ z79KOih$5^^(v5a|aY3h~Q(t8JFvuJMhg%~`X$_cd@_nb7q3Pt3mWyq?5X=1Cj-+d>H*(Nq>;;*OGB2~ z6}$*}Tnw0Z$!e_-f+JI8IeHLiZWRdIV|J<#VaX-l9)ZP_)z(gWnlBX7X-&o)PRc&< zKOTvBXrbPmD>-L= z`cZbtSy-iQOV||H$!Y_4TiFHsY&F^43qL=mKdxUEPMtU`d(E{95DdDFx%`kp<6>h6 z#S4|1euLb|Jb~|)-B;LQ%_k-gCC-eYUZluQgEXN2=B+z*FU4ld<*LphdGKbhOaGWD zJ*gL?wLTtS-$re89X)Dv*)w3Y=s!~ySv#&?=eaJ+n$M`ks-Q7b-^}BK#+ELLM*q4? 
zzks;u+q55ct(g4>p8kby<~YP%=coVxzPbPaDF22hBWnv2TW2Q|M^_Wae`CSF!0uo1 z-@xu#`y%P6HRZRq4`{)gA;nzTHCwqvc|~jGVl{f}!l+g2NTZH@q<}!kx?S`GNk;m;nclK2jw~ zLSc`rMZB#`+SWu@jR>*IxhIv>J77;TLtm|dR+R}ET4PTt10Lz#=z$QCuf$}BB%@tC zwatRA;3~_2F&aaf+Te*<7ur#?_x}v1l{r2iA|ziF@?!8en1 z0+yUUz;*&d6itxsh^nKNH%F=JwVFeo={PWkG99)Ej#hIr`S{v^z7Coe;@DNGtt95y z_)cN9NK%NtQ!>4}8q1?h5+aHZ`(SP9qpoAG zp|pO?h_NlcHeZ^L?@WKo@Xbs#0MhQ~q!?NYyOrcBY0sUy6RXp(QPdlD0?Ed>-U7(u zFB51^`qfQV+$imO-#`mYU4RnMoxcI>XkFPQ%(Bm#W;-?yH)njvkwS zb9KgUYiobmy1eW{%z_!8F=R9wi)_mqCoE8czP@cTC(LyM2ypy#DnqSW@i#M#x|%?EQ7L~}E!QUlDq)X4OqO7?x5 z9sCQHB{GA$z}DKWKFje3nu5Bv1+45-_3_2QJ*$^sIYjTKZZ~4;-}~~q*MQ$GjcOuX zC^TyH{mNsx$Nb4$j6J{;SX>b6lrm~!&7X~;@V+H~ZXsNRS;Ha*t1VXq#P^*s%|wt$ z6jU%>%TLBW&Kt)Yd~#`+FzrrSDP9@Xsh-AgS8)uRU={?oBCedNqePLlNcqXYzi$i7 z%2}`ZKc@EncP;Tu#(PgSNT<9E#W>*dMSoZQ^s-Rwp5Kd-h z-XWRydEzVVYG!ZK+8jO5_s~>wf+FX$_m29Zu+IP3U}LytElYKIfk-2acnT}62p3uw zv+t9RQ|gpS$?JU7w*C>bke}#Rn1cG=88$+ygmnt{ei-`8s9^}}LbWE5gb4F~l-KwB zCTSRa<462Kdc<>S(4K39~X>J1BqRJETL>by{2Hkqof4uyPm) z24-Q)&~AJc56Z?}XSFQG=bP#Vi7e;ErG}p2mu;LG@Dww*L zCwP;<%9KE8qz+f3AyB#6w(TMnJ|Vz?d9)(SU(op?8ead#DMy{;eYsSc;1F#x*lJ!C z2CR~rVu}{{HT-vnv;YQ6^+2Mb)9I^oz?f7b z17Iqfu6OUnRtXUWj{WcV6cbBNHwjTYWK4sh3atk=sFHQg>8KWmWO+&(HL#RK3nFJ6 z3(tj#_Yo-2Fa3HCuSEm&)CgMyfi%WHX0c?aqR$A)zb(S!ERUswrmB=4hb5?~Hkl0b;BN8)z0 zHg^Qs;|E>fZn)76JSB~DVuNgTmUfB#IqMeh z6vxngU2+c`kU4|#I`@W-iepJl3|@sN7P8zX-Pnj;akhHD2ZWwKWKU96v#dWF^cItU z%aS4PI>Gk_?AwvoAcbdU;83eb{q;5{LZ+Yez-lM#BO77aSw>%gSVP*LiSa=?`OhR6 z>Yx32=Jyj2=2ch8f%usJ%=l9_%I0$VyUwJ_TtFHV5AYsaL}fn%T zb7qzq5c42X49xd&DK~U-tqf0iQ@q`hQLK|iRzYzP!`ts*_GvTSFZ zH{BpjK;lBcWc4L{l?r4WXN~BNrG=KdD5fR;-mGYN8(J3Gr zt5B8|oMeIkWsEorBX$M%VcnsBgpR%0G_M0cSuf;tj*0v3B?XpXt zG}z7a+yG4#ehaCMCx%&G9LInwL!GSXQk3jY&)B|SKZEyM4ed6!dF|xK!SDgXrIBo9 zT$|HJa>vLydtmJHo^b?$>XSxd-4@FB}`L(=h^|tk}qSaat#V(=3aGd`WDp zy0f^*mHCrN0Ws>ML5{1X-EdZE7(sRG!86)6_!Cbtp+4iIeshH22(O&+#jA5)w4=V? zyBT#9VR+shhEWFwY1d8Ak#Cqfwy9yiYP(7%>IY9t;gjw){ukOIK1A&rr=9yx zBsm7GZiWRfdF^V7YaB!)d?i{Oem8?C2WirE z-_9scgUE^|FZ6M_n;rS}e#^bD;d4~B3jXFGye&AQSKut`<)c85S2i8nXB11FW%WU! zt|b#aI^_B|26Bohn`1G&kfxc{->1sIgOeH^uiU+YoJ(k>C+#*C->J)Afj^JXH1e#~ zTz~$kh4P+q44JH6tVhl<%&#yU@7t2mi=3FL2%5R@+;yr$s%pz{lWkhmD1D^J&+ww8 z-I6|{nD->^BDl%8SZM`#NI~_}IGeHAy!@LwVmS^6>nGe z3@cH_wsy`JZEuF&Tr7PzHLt*P{jASW1vXE8wMy5yfaF2L3{|XTd8<`hOQ7sfNA<{5k8_!YBj_7H`zuvV;uW7hTrtL_|DYLr2KQc#5WKD=BoA-g zFUPrd_}TR|9w&o8?OW(yB~dn~3p+=Hh~_l$graf_`UqbS;y>9%st6lM|RTj6uJGEp(Lhp}dmhT`Jz+OIkLg+f=*N8J$NAG`k?&t0&Rz^nd zPb|DWycx3q)@fAre`I=m-_s1n?e3(zt=;1Y`w+@PW+T39t&}UUZVfxE>mB{zw;Px4 zy;WQr6QfL#VYgzpi;5??@05ncs5S{rsao)2gM8KG&c0K~g^=9Z@yHYTqS)=T9Xs zvQi3j^>TD0jxz0zmngwhHUj{aoV@J-bSKE2Z)4UL8$>)8-%+zil)oOgez9fGUzBh! 
zB%(jAiHaXO9qZMibWBFJ4rd7?Gf6KY&T%)JR9RI9)RU>;2BEiy8mbh?YGxVJU&nl= zU}A!j$0kk9*4EV4?pI8_nY*}oj%m_?WQW6OC3JU?C0Xq1vgktW6;e<2clxCDd|zJ^ z#0FFM8IhF*3Wx>pY>t=Q34QyCe4QVh?R3Xit_kWO#N?1PSJk6+jBf}dA} zRCKBqRnN8Kg?>HHebxtOauS0t>aYxYoeUMsbpa19Tc0Z)Dr*FelymGA%mF$n#|wwN z=Y)g$@MQJ%^hVR0va^rXnHA+m*ds+*d_C+u-Rf4|UAumr?|g~7z_U@I^^IO!I%E!3 zGC?dBJW8|fCMcdH9X&d^U=MD)4>nz^V%q_(8n&o-)kD6-5^j$93@_nE?d~jm1#gG) z3?J~+8#$P})Wy~PLInnuDs4+!t)<2`43iKPp*L3-%q0nl&dpQD4 zVr)getD`+z5c($5Cs+&HJ$7viXN9o?zdj0fC+%UVAvxq78gZ=aIG1qh@c5g3@9%@6 zMk$G;^VXa-e&3)Hrl=>6~IE<&={1?0W&q7cV3WYMmHlZ?7wYpO$4TK0Us<66 zD!x|x=1%QiBAA$W_}eit8GG$AF-~5(S{+V-abENe)~KXw1K=B<%wv&uz+(qMwDjPz zfWOB5r8Xu}Gx|QiF0hk2SmCY^(#&GUOkV36XK9|xrqcMr1SSV$v*Y{y*0Wx6AAP+| zYQRKvJ{`IS1)7P6bL8uygFE_r=C;2Wlw-rkr-}^s<|WVDf>FdlmDCXkfHi^`{mIt- z{FdgAav&!}m4{g34Hx|2u`q!gKY=Vik;FDBCq(=$e(Y2bUq*<4Z1xoI1@_?%CX^S# zxr{KN-0kY2YxQ$c~tyfThY__XGrb&DzM|)YA(yI%w+a+YdZ#( z@9T%a?XiH8r~6tSsfJ5y=+MY_7{ptNMXx`c!s@fSz99Rd1+-zb+gYoQ-13DCSr0(swB40%H{K6nKqsta;wwM zB0BVw5CYXJE6ltzcbHf~h9aXS@;qy2AJgf!(k8&Uo{f?zQm3=AILmZECaOgR_A&T% zLUzUH&$s64jjze4Ekn_{L{5SIZY+|niknyt-37dQ&~;kVCA+YOoBg)h*yUCMcf#%R z=cv7gw^s(W_ciZ ziELhk>M-5fV}wPqe>P^Ddf8nqo-VGmwtJh(##=-2wlrO#6Yg&Q zXQFv(Dt~n=+|(;lEmvhdpk1FN^ZZ{dvov^Z*ez-0uNQx?jM^68nb(v)MP^@Mz<=TY zU)Go^*?s~87yuv<@*f59|FFh(HueVpM?3r<_~I7Tnb;);gx)hs(H*HyD#S~oU@A}b zf)HRNpnze(8Y1!x>z=I5ifij7^dnGr#!*+_EL-&W^MZ!H_tUYXIR`p|6ijqC>T>Cy zLm7swnLsy3Ev6x4U3(}0ZlsC>Mf35JK*_soBQQwYbz*V?fhIAOyLWvy#eSyBfC!k{ zLQbea)G7!S>UO&NQ25I=xrT(1Rg;O7Z9$x~lnRtl#ycCzlm^IhqG(EXD%Eq)df}-; zCJZl81QuyJ=ZFf6vX6Wp=_ZBoznPoydpHOnapf$uB&S4a^{sFV2!;KBU;^-~PH3yw z5a`hFJbR)1ed&Ze++FZVX?*Tn-e~p`c)Ue`?EwXE)cr$71Lkt4p&rZDleX#Y4rsOC z0WzMM+gA)$lMVc!Dx82C$Foic;;UvKSBAy(Z3aCh>xj)P(jz0dig#Ch=T+=%LW)!C zzKx8<(?xAi+)f9Kg)LulFs{vUj`ZUxN8qHei*lZ7u~*y|^$*40|N3H|5E9Gz*V=V5 z277Ko)JoWgMxY2GktT$~u9YPHu3-MYl9+POo~Yh?x5jdgTQCxLFc>hoq)`#SrCU>T z3CMclF?zer-eH0IRn!ZDZ(B=w%2o3qbz}_fbPw_P+0)EccJ6Nm=QRIHZuQjGI8yfc zM4w#c60a!Y%Z0ZniPg=*Wl|{OS0wzzk#gj3v^WcLmjIYGX7L$uo_MqUlIL(9>QOy+ zK>RATPvBdw4nXrC)YUyVOY|hqvA#wC#EiP-jB3%|f@$Je`mxXVdS%$9n}$YO+fr_h zadYV20oZ|_p}*tlpoeT<-5la8h;Q~UqDTQ_4$*Glzeq2{NCP2m3)>|6 zb{(!`9dazIA49j3u-VL=lssbJ=p);~<`wtDv0lzAiL&4mP?=!Pa*JneH`cqd`VJ4p zC_WW1K`AT>a`LN@R@$#Kc7l{Idn~K?YCFyUNy78KbO5@-cM16rxWoQWef`Tcp!87` zy!20fEg}E_ApX0)Of75;tSvnMBgy;^u7PW{ov1BQ1iv*sMs0jcbgg32c_NN;ohbnV z3&I}WltV2kVk@UAorbQ+qu{UGobPzjjfb6%wJ^a#axa#=zV{iJ53tldT0%9@v@DpP z?jDVwW0HxH5s?Q`)EfSJ)FyfT2~sS5`yo=~FJjAaP-ZZeOi_uUwPH;wnP2v8_2pxP zbQ*E!jAA~C4NS5LdaR5QpyXh7>7h92w}X*a`a}|=nwsR<{Y(q3f__i%zD8Kd(HOak z<;u{NN^6@chZZ1Rc1bOB(lR7_QVvklK^p&>7^IpJm3i^Nk{XJ7>K112PtB9pdy^)w zce)vs9r=!X){ZKfM3RC6$$Q+cOQIC;3n{d-eiX@NI!MM9UfB;Tl#D ze-*E8{iwgYT?^`kq8Z4!Il#coxrUN19xVZrsaui7e~=Al9zH!X-SOK{MPLHj<5XIK zT2Unjv!m0SO;NI+8~9U}IUrIz%y>m-U3Q^LKiA1z9PWZo)-AQu12XNXppo{HRz}K0*);O zph^{MDEO(Q4C(>VF7bq9CbzLrOkms^mjX9JQmva{58$@p@&nb-`rk%mnSu^^UPFuA}Jk2SHU+sCt1nZO;JHqxEK^rXX)% zVSfIzB1Ur{n3GP>ppnTyD?JO|kl6C*$ zLXb&c`d?T%#wEQV{ES5PLAKLv0*_l%uXu5T-6AlRv4TZ=S7d+IoIW<%0?Y}{(azb7 zJWfJ~)gT0#8Y}AwbR`J#Ag47e%uF*Lv1>xXBTio1?-UsV(FjN;d3gwNq08wDPq^8T zSc473#czC0?W;&Em1uQqxq}B;=wrRE4BZAN?>YuWDWph7?qCROlG=_S7c|)Zy@|I3 zE+ZCmxQK0mWr>u!stw+sc|IdN*G(4lD-R0!V*laS3FKasDe7kB)7E*|@N6%9XU z^mJ90I{Cy?h6E#%-fuD=8?<5GQlMfWi$6*KL*=_rc6;UH^|yBf_s@gdi%-_@&l9TG zaUiZ&=b*HHLT)IJ{fR~9?|<{|+%}|ns=xvO0Eq$sVElV^urzQr&@-~Lwl**@PwBM)bb!17sU4vUPClTMJd_zr(4rZClsb zCQB+r>U76UQYPG?w@ozQhfkWvpbYhEUztOh?J4Bhcqo|p{(Z&Z`n0aa4#H3Q>5z!W1)wA)Csb}b5HAFY2Fl2B(#IE zXk=@Oan0JlPYrw%Xi~aw#08;uc7U9vhiGqS+7E@}17Hm}bx)n4cdt-e#5RB z$2@GONPYgX1DM!=A#1pTZ}R&I59H@JL{mYItp|XR{`!DslA|*FWCU^1aZecY+9F+R 
za03z3MIHM1+VA-K1tb!#z|?@9&BD~x%_^KGe}bv1F=PXIVWO-ng88R?^ntF2Y)^?u z1ylz?R0T2z3?=G>YqLb~f$>WECj-*smfFJ$`6G>2jX>uXiWxZ+K6x@p8pLOw?v*x+ z`&ipdLl`Hsfxun5;2~ukpd|pOX}k)?#$T#fI-B*0cP=~zi_A| z7(B&mB4T5bQGKu|sxwL2lK@O70}0`{e&bH?Iegcw>|9tGtasyb<8qH>_GR4XT!h)9xO)w-enyzW0DaR9|N5DI!O z&sz~X5y`SPkLV=S`QYNBw^d;!10@$ovQ&x-O}&?b>JmjHcSDrM=y7KQQb20+Ket7( zjla{D)`Xr}f!p~Dk1Z)A_Ul%(03^>8ZzcsSMWzQrj5-C#R|N`xeq#dz%Ln94_6R?^ z6_j9iP6`0uxVi?*8SZrZN|^fZ&~;Tey<%Jw#G)96=@8j_>JUbVvkQvpjs7ABj4`23 zfVD(WqDTGJTxRJ-F8}BU{3C?$0^9#4G^DC_$C09yJ{|ufR(+L9eIrjC2^Fb+x&bPUHE(%vVSoybxsasJ|7Ai*`YCY7))oTGFazTEg3%fB7!Lr)D*RVzhKWW zsDo@C`#tSzOav%TG+AF|0^r61Bv2++6o3khwjr6= zu+t_A$(zGg2-56U)f~@v*PW2*Y58RN&dO9yhaRTSzq1jJTW&5t5@SI_&b>EYMwmZY zk(Grx;IVvcJ166XQf;x>w_O99z11AhRxaRLRck!9K2S^G!Md3Md!Vwtg_Mg}54w*Pr3p3}y#nCS12~LxFY= z4I7qvXcO7^@OpI^d}5! zw4!L#`vEczEn6u>JyACk(aGev`CJl|A)4oax*|C+Kwm_@3xq&+HCm2hO6&BVi0;se zqCLNS5>)M!shC~kCjw?pseaUxk|1Sm_2XmyoYsFNah7bcqTPiVoA5^%Og2iBza+o(ceM%ENcj z)Kn2);Ssx;nb=vM$eEZK@XME+oK@P5oVft!Zv>}2q8w(r`cvr#r;>k zYar73#j=a9Z8Bc}yl6;N(NsdoaYZ*`2Wv2Xnh&Ku1TD?{?^=I_r#CW1hRsNJ-me%_ z1i2{}$kh=11&1a35xksSEO?xeF9;N>YkD3P&$s%{F~Q*%SErd!4saA#YP|poj=!dg z+VRvRScT09M9{~FntlS6wJ>N4#nfFsE8ds>A@LJqX{G4j0|%X2aQNL$3!!{OYj`aG zWiVsY76AL99E4C5wRp!@^R_^ctyHoW8T=Ul&ixl$kvrHqib9HNQ$olJ^n-!QHqmhu z9+w}Gu2yD+2f7m-ed{m!dSPD-BZwpnze=h{WpEq;N*Zf*8g)L9x`GmbAmel5HX1C4(7tciiCf} z>&UdHzv82O;fEjjt?j7VU$NOx^p+@Z7Y{%%9m-}filb^48B{`|LLJI&8OZsd9r_W0 zO(V*fYfK$uwB`D7%QgPixX0p;mKMt}{0XD@ZVS`}@BoHU$Tu6>U74&jY!mxw!H1)Z zZtVP=xY>N11Ux+Z_EgJCcS?_t+EO`}x>N6M4zC>H#5b1`I9{M!`Rym`#W%zZtqou* zY4pR-w!iyhuY_F@2FZ{*V$9v|y0tO}QiaSea<3n+%qkpkE%_L?F{+7!@MIW4E&A5k zVcIbhCK}ju|0)Rc4)Axy}V+TtX)i;s(m4nwy$rqAxm5FA|4DXKQqtHMVXm;H$wgtbtM`w}w4es~`EAXwq_1mGEWPABXYRpB33mwlG#kd#t+< zdG-FnBF7>lx_s@`yOb-X82q^8`jIS^D8$?Miyq`e2SgQEAf{pvfiTk-c6t5zJP zu-Y$E1u46oUs^b)YeaN;RAQ3?a8oUfzr8N$sn6zWhdRJL#Q`r`C+yeQSt0BymRf0D zZySZjE5_K9y4TG!CG@_n)PDDmcSz~&_ysVda!mmiWP4KwqmFMsVlQRnLknmnj0qdS zpSNevdvjO#XWc(mG?6p#!9I{f?8``R8+2V`a5)D_s;-Nnk9=|+>tj|#)Q zM&b2!8rk?9$|5&I%Ltwc2frt$KFA#0PPUASx;2yA8T}_!=B{9NlGb%uQaK7R4)S3| zc_kD32`NNqnoKM%h8}5Z)-h=@q(X7AXR2Z=HlXd8>oU&^%9^w0NVb!nW#U|rcCs|S z3s$UC&}hH9Y-;W1@O^lwL4SGUshhfAOqY4wY7erebY|Y%-Ob2D?UGIux%F*p0GVS#1Zt;^8KweJAqgtvpq*K6X5<)C(?Jql zE7Lr*aqqXCIeKqh>jVB|vRh-}qt;E?D|zK_b!RlCh%|cKwT|zw8$()rSngJ}VxvtP z#+<+RG{xtm=bo>P>Nv{u5HG3m0o3Ry=%YF;{o+r?l36*?-qjf~=BFLwNqrFgh|NT(*6uKlx_c%~+G3 z3%&*~wd;2M(xyBIyo|l$<`0W4D;K;s6P7k!%&zvXm-FWt9b3hn-+xG)e_3mJYA4Ce z! 
zD$#2jTmYU@p|5=n$e{|JMjUczXuAtDjoDLq*mf5arEniAPA66Gb^NCNmA$Wn5_ zsnS=iEafNB_>NP|rq`6%b#i6F#)#{w8iWY4ue=5JEVX}+fa=zO&FDcPkR7=m(M5b8OY zfK^A&nI1HqbkrG>0q0@o%ppwfP4e1ibRfJ;dChGTA+!io%&W zf!Sq=$4REv393fQ01br3gIQdmkDAeoFGD$~JiFXyB?oZ!|5?!4M9&peF-;>D{%F}|=u)yiOsV&tPrVmH_*vI@v1 zg3x!56aSFhigdY&HmJU96J5=)VddU|9GtOc4a&JN^bG}^l`$8s5QefN6#jt*_Pdpv z;GdjKQ(n*;!DQDbSiZo%@YDMI+vCn1ixyq&KJSgbNHFr>l`!48*WZ`5Rfpe`h35+f zPEOw5OL(xm_V1?+zu?ciy`@$kFAoRSV6{2lx1Z13ouRz2I{Mk7sj8?v-xn=zH({@y zpA!bG-0Wxi+Jd1Ag}UF5!>4w?kfU=|+vMGai;JkD-MEE(zh|_!YPT>tzGgZ<-ybrp zFLy`GXF6HA&z^&ckw1(*&mvh{@O8BcyNW6UpD&sz*R|$#_j!>&l&Y2AfXz!(Du79G zp{b4Zk&{$YPWEA8o3{G7{TugY`Dufbx)d-Fn-}0b86-x3IxfNi5s_mt*30Lg`WJem zkn`&&AB|;V{a74n#fS-`-l4QTRzz-b%l2oMB9Jzu(*ZqOSrM>-0QXHJt$IluOEXRf z`+lum)TL1&%bYxlfQ=4!YdXfS_SpxiH4-FZ4t>UUC581j{0G8#pQ6dT=;Qs zk^6ov3Lol{KCaG|Wscb}-P6OS36nmd1bSUjJ4;^AAQFMemGgqDkQ zz>h-~x2;V708BJCezX=6qOxr9DU_p)ItkK%gh7s)17OTVS*M7xyD_+9o+H5Ld{{vC zs|k7G0LuF@p`vM}8Wt9?k|6-Bp)*u_NU;d07PD#(hxeldddY{1(w|s1@|&}9c(^$7 z9xDTT0w1RwJNF|we;{DRVxZ3eP={53r2nFaCt9qy<_DYjPkgk%9#zO$BF!wm0J{3i zXUs)?F}J8lRJ6VjMf8izApM~)C<>Y|xUKYK`73tz?G_I2=}%LrriQ+jmZu>;>YbUo z>L#IXuTlvC0AJn+6Rm;h?p6ubtp#OC=`59Sg5z$Q3^7pF#R2+B*&<8-H;Asn!X8oQ zxenY1gs1dYabnrnw5mAr5yU36ItbGrGnq00n;LK`?vaK>t$Sly`sMhi`61Q z*9TB<5~w*$0aKrj6itE+Ebo!QSb_%+M(_&9(#O;BfZG+61TIB_v=gWX%LnGzAj}4V zD&Qms2W#DT+t1!jA24~fkIf-2zbVD!j~K<*)sNW|do1Z4yHY*> zU6FoAh^A@V-%#nx$E&5eYro}Cc)(#AlNOQGS+uk})T7WsM#oRA)aSZ6UT9jKH~m>l zO;ff%0#FBP{8+a{Rk%fBRr=(RJV;}jKm;GMXq3_l{NL`Clof~8jBoB;(Y8h|YbzJT z{3(O9$;h6gfaKs>DNfyoBl!vUn$g;?h?WqZ2#ERg@xA;GQuzhVVqD`3 zhas51{QBX{d+|W}is2aQBZu9p5A|{^_}x_S+x5}`8qmrd@LeS4?Ujcl+#$lO_l#}?if zH1xi@yfAaJ4d}37YxP(*IfNX6N0rXEe#ACI^$mIS2(2lt3V#Ze!8mJHP_Ju}l*DDP6Hax(>OAHlcp5mMRjKRJnrQ(0%@)9j7c0 z4pI#ei7h9}19J5sRQ#8qM)VrQL2w-`$T4fKI;F;5!ohd_^pipuP2=GU#Qu>5PzBvr z*`U{`>lx;xAmzy|UAI0B`LNiN0WJlyO1b(xL6qce*a8Wn_J-m#vN!3M+WSXN4l6tW z_sIxNM1(OzulZS^hRDD87$aJLKy1ch?Jx)`diw`Q<3rOv)$4Ow8-OOT*5R0x*DTGV zCVLauCPQdY8tP>bTM%)_l2jD@0dG;M6dIMEbM^FFw1?URXAb51`@f6$kUX-P+rtQi zQ@h)V4lo*RE7NtL&?X$g3LN1Html6@gNLs{xr5-(qs${!$-2 zkKM>6y2Dr-q&_YsQMS+VT(e1;uGUS5yl-L)E*MtDEEZow)=YVipgFw05a6NpDO~#T zK=#E{;yzBUJGmGZLO6(ZfkIRF@enlQRS`_0Q?5tUs>5jlS-) zLCYLd<@M43+p6H(HNS351NF`T>lUAB0R#>}2Moz%iRVjh6`(X29A(tq%wKyMY!06;3J0DWe!gm5~4?Wt*V;Huf0Ne_Um26YPJRTkMY92ywM zKjp^F44rbke`KnU)u|dO$I3C<;@FP?hR6`VKQf$x#DZxf(eFbMzrk`;>TvO32faj1 zrBfn!0^Kq^BfW(72p1CpLvyy`v45AGYqCu-)K;KsRyl8M(jaP9@STtQA)7jyVMG^# z!8^0aQ>`px!f3)0>VYE7?YzdydI;Vo35y9S<;$joVK*`kI)}K*NwEZQkBZJ4-m_Vs z3Uy7e7TjzbHqxvof%|ISRY3Wo3SO?Efx>~w3@EB9u9Hq^ANgtamURx&a+I$t9;VztpKu; zWTZ(nlXzN#!p=z5S(*|m#LHmM?L;?&i`I|UygZK`#jmE!DNEl?chq>`Lck`oLVH6I zdI&U?;FkXR5E+_c^QoksE9j^^>=A<9r1)j1Q_?(}IN9v}2GBtOE{GOof>ahM?Im>W z>;?t9dsG!7YWGDBWrKs4)lXNzZbY7eH6o|0S`VbUZ~G(=w79BdE!QS$(*atenHC>0 zj>O|N1X2+k?HZ0sQOl_Wxop;15vmAMy3YH$F zfqIE-Wu+IPB_ib1&&%oQg~HRvVbzZ*Rw=NNL+++Q$VV9)R4F~MMjbkeS}ZC-ZYS*q zKwvnI8LZi-joj=$PA}>11zEjpaxcAp3lc9F5bh2;YYdE1n3{v!-|3d%oTEQb3-*w- z8thqu?orsqg77V%ObvBH_cmg;v3+XsmWo$NrTSrVfg|IBQ)q^JAjYv_H}O#bj~e+2 zKs0fRHl{3SU~?=-H+VLMfj>@kBrfFXR%{P3X>eK5phZS9;(cn;%UUjSW;6g z$JAmKj}vN>2kO;0OTDwvg^J-(@0D+>GofIaQ96fHT`_ZquO7qhF*2ZXgjrRjWKu&a zv^>r0A~L8-wYSX@5cg)o)wlO0;6Y*(@2)}j9PN+?|8Svn3>)_vUetn@)4?|(tPM1+ zhsV4XJl{dvn4M~8>PWlADm$~(L`B|lqe5l2%(6QZoby$X!h`@`z>T=N&Q@u)`EoU3 zJvc=D6XW5N)7-yxoP2B0o2N8?N+|EIO=N~``yCMalE9K%ZUx&YKVPLAy?%TWk($}Vc;wp+6B(R>0rYxcCvMl*dZJc<{z0^|M`q5=bmAqY*z&q78miz z2j#f&x{8m`P*#p=tM(23v`W~49KdN%lsSpeDsQNnAaI+8D}g)G%A~h>Ra5r4Hz)Bs%u)l=T$qjMf4 z2Irx???)idJ|`>m=q=@qBKT$yEdCHn5e=a6d8c;FBI1MA_EZa`#>tdJJ?A7{L60J= 
[GIT binary patch data omitted: base85-encoded contents of the preceding binary file]

diff --git a/python/lib/py4j-0.10.7-src.zip b/python/lib/py4j-0.10.7-src.zip
new file mode 100644
index 0000000000000000000000000000000000000000..128e321078793f41154613544ab7016878d5617e
GIT binary patch
literal 42437

[GIT binary patch data omitted: base85-encoded contents of py4j-0.10.7-src.zip]
ziVD%a23NjG1sqc1)%0JaUN0Op6{T&(IE*yzMxTo!rR;g9umA#vhaWO!#Ut<^HhOhH zatL=KmEo_PfQ_96P#jyBs0Vk4;O_434#C~s-GX}{!6jI54IbPzxI4jZaCaT_k*&LL z@2!2i_tjKSbx%!w{nu$ZJ^wlLf3H7`0C*fPp>f)rq@NK2Xl|62$$!Tp28~|64V&Vp z*|#T%vtR__J&XPhZ{BO1fEG&nz|@coRivDNTWmM$gS?x46J?SPmv;&MEX^~Y0@|v* zw7>QSeVzt*pe@KceXZ~eTVE;8rTAVEW}`JWN4Ux`3RSiCH7^XG2)B zjY3kpA>Zuuv7Q6HEPg!=_*}F=zK%z+&xkLc5nSkj(L|3<)kySns|N&^$^5}p%9S}A zCJlH-j2#!=JCf9Xhi+Cf%3$9gut%ha70r=!M5_5jY21EQpQ0NMhMuXps+bXLIMz#- z{A$jd6^u50ZFTgxuo+D3eeP3-Z&yL(E^<-L07fkKFRz*T!C(mfbN$O6* zn2&ugwZDQ}cS#-xi_xkA4$a+InG(d{z?2y2JSYcV#+A8DP-BqS^A3hd{s`3cA`<6xs7OA?@TER#2})$U>+fW=4P zM(db#zMpsI?RO&9joyWQfUGgT8m47OlPIci)@uQh+N<4wGzj39)rCq^q5-bJIzby5 zwo}hSlna$}WlV%^1;95{(ZmKsp6pS)P5I?7E83uwP$8Ct{Nc@YHZ8(_S^Gf&mPo)~n zX#unMoKIlY?gvj1TI#h>HW~C{$x@-xAjt^mVuSN70t23b7RmE4B@O`|R-I4Q3^lcg z%y35ps4ATcz75w0!^b6XLVD|nGdl(ZZjCwmdoI#WbMhjsrF!})xAl1>-&cCvp67F% z$En>q`wx;6zkl@}e~kQb!-;f=4d>^tg=-wn0p8-bhb^pXIp}-dx5sj!#`hhg?L0np zgzvk^-bhZ)56rR)lftqr!I&m&rP&kk$uDtKUU(3ls4wmSFcAdy2*`nWKN;Lq>&0xX z;5o2Cn!Amit@b_JZ0y}kf+wxz0GXX$Y?+(G6tI%)0l1g zj3WHD{K*kh7F3$6YfO!39cE{oyMnzVUu5czvt{_0u9Rn7y>azGmtm|6?9}m;qREt^ zWmJw#y|l`msLLoqXn+lBL{{gAdhOY#3)zj^)vxwrWt~CC%bz#J0_Gk`O)DK9)wsL4 z`buNK^aTUHJPBkQ-mBR%%*j9HUp}?X)gGF7A5(HRvZVJp_4D$%*PY8?0Z?=VNrW<3 zC9j1D@B9b!3BV`@aIKlaXGcCj>%)%IiCPoqw|%)v<|s#J#O+c^#LFE8Q_bZStzC;s z!}=vTB*=I1wlgQgE}()BgfS%jiOtG;VNrtp)ck#zSj;ajo}}bj5t5Vw@-Cp=^&YxD z$g|z{BoYSeEkBpTbE_>CeCBjjtkb+Cs~+|io!dtluhqty>U9shdGNx*s@kUf34wi# zVrBwbQVNPP$;KFNeIfZHxh(`ZTRg3=_HN1pR;K_5qCI#pQzff?l!`Yi-)SvDIS;ta zb|g~+P|A%TD8ioiTu5FkofOR|W+Y2bzOWGPzu>I=xg;tT2Mie++7jGYf*BU9+QEc} zBu?8q*kpde8mnW8>lgtjhHGP{8mCoB>DwhGlY`_ybQ^^hu6X5(@@lMh1$=B4u5tAF z?ddjHBKd$0AIuZ2Wn1skhqRJL6ijO?j!Hhi^$LWK}qJ8$i~+T zhOH5QDJO7)F}sgXYf+sivq&vHKqg`=cW#;$vW`oAyeQ0DZ>_AuUXS8*Jt8Z;tjc|M zuWYr>raY(m)Q1c~|0nQgQ-h^m?Hw7X%+vtb)zk}imwL56(@nk}_jiHL?|SucecfD( zAs4@xM~aBd%)?2)(o6{F8X>xw?U=X;QhrG?PewHaj{e7(Xvik$o|(sgs=mE5H3_Q4 zlN^OuDy%d~W=U0$ohb+m^y*}UGTF)NNBUUyK0$)rBeVDkOJRu4ydpYajV1IA<)1}hOCSZRG|&J5Ri3{oM4VlnJ~^2= z+5bxc*arUh0OelJ3-2fcpn-RsbzGTde*cpa(7i8^F+cie8YbHca_S@+9 z-(SeqaT1$aNZh+LQidf|3?Ch=66O4wm+^yf9-s_f`ovPo+@XI9#MtEda&{hO+LjeAkkhy{qk5zk2uW53j z&8mxe^YC+0^b$YlF*kakiU3~tPC8!kE-`(6w2T0;fH`m%VKdNJ_2QH__S)yc5@#JT z5M?kC%3Cx$OZaX7+9===1QOmI^CGj#J}5QXDZAXU6~JX)@A<($0W=?&o5oU01nxSs z24HBjD5Z<^!XYai^g&5|23dRIhZbn{;>NyS!XUn8LF#-71HT7CCib5=c=1C8^c?md z8`r}Qeuvd-nXu+d>+162gLAT)T`_L-vg?(ogkXUBg|zYY+W3wtb!ql4^LPxq>kZZ! 
zN^5%q4LAw5zki#7zt7f;w{E~J_Fz!SB-A{AVD5zhq7(Uw&9fBnvjm(GKUx1YY^H>b@>Nc>63EZi6=VHOhvum{HjW zLA$v#unM%{Y_jo@8J?>bt52Ugb-ak+iF%<0k4Z$`XGdxAdd(VhZ2!Ssf|v2k5(8*0 z6jlKQZcItMDeeJ-$_|v1ZAEFC5Ln>*aakpE!CFums&yGu#kAhI;kqv@pGnA$H?4i# z*h6!gOi?}EpsmQ+A&ZO|e*1h#Biglrk1hWsC9a$-R%*{7X0Npke5~bc=%(e-F_!Tp z#|*$AkqQEpxApJdG|84sC}WgK&^Sttr$JhE(NyhDJ4|5npx~!XENiP}`eNyCG+KJ_ zkpFDum2`nuJjigl(6hUOP=x*!_jyIp@{Y%!*HO-7ys$bIsyAbPm0({4M*P$44}qU# zg^1liIUc*~wy615hmJ51T>K26g{U{}4E&>Q?y>O4z69 zxP1uP$bm5s`2s!TdWKoYaXDv#)iXV+Zh+6s;CkH$NNz5}*F+S$^Z;i2p21 z>ou9|VW=!tv^<6q;Ri^S=^}`|Yy^=JR00^UT&T$)q3aWaavKpJI0Y@ck3zS@E+SOk z0|MVRGg?fLJd_e4wsW7kg@f^`54?AvsJ!}vTU;9^INDN7y=S&_vLU(M#wxLD&uXyK z;2amohE7hQ+#^?^n2suyzY8HDAGbH#T~ZC(grt9?@T}l5Z!hiYi53)rN~`g?#fC|^ zFWOu0*w;WHs&5ha?rgcwhBRJRZN9WzVP)A(6T~O0_<_OnQm|JBx)8!9VIs;4U;N8C z63?;n^~9o39$H9xx*%#Osj?Tga=zK1iF+s>p}!2A#eJ;(6~$%;KqydiADkb z`Yg(rV-l8EB9`~43m-u0vC(T<6Gut@eDf@PV;d~mjR{`lJpLsOXG}k?%2ak(_cxYk zX>z0Nkb!%N_X+=I)f~)+aEu;`Tkm^c9~;ugSXN?^T}S&T|h$MG6~7YykGQ8 zjDD}9)!8m>|1m$F%3{fr_7s)^GsLy#5VA=3=q!Fgh$unP?u$J%F1;6=Z%{jn*}Zca zf>Z(F0sIZ|bi(GIqc?HH^KS~WQgYpUWrytNtHp2AUD>V{?O#5?!;HNV!%NepM4}mE z5#&)KR^Rq~zE84`rd-sEtc%gBGub+Jl$j%|L-U84Tu8Cv@J))@>|^cwvF|`T*X__B zmi2+%;rc%d=+JfhVhi?q!F z8`Wrio^|GMr%5E7PG7(75!F~c?R$@2!hr%`a;7&zEcOVN1G?ju&g+Dn&5 z$ZEAdc2!?yhiF+&OK)5?Ajlj}@%8z4$x<=C>0u4(f@n}e0ZUes&RTPIlo zyXnjlRi#KS3AXcxB&UK|WLoJ)d_P<~5v^dS7M|b?&b3qYu+k$IXsg8Np&niNMSPBd zbZOTf9=-w&?4MoX&rxeOnpz|p43mD_5)Dl*6j5_b6>5!jMNfgtowD8K_;3hcz#fD% zq8$u>S6R*~XM`6I$<&6b*>WB0XiNe%Z#*Vsn=o$X`!Whui>D^tIt^@fVRg4eMP`+OeZw6%&fc%@AXeR}z|8#iZ2*!bKPDBc z#XQb(7lO`7j{Z(0&Lj!=g@?zly;<5;VCM$(<#zZW5iR)Z(3Zc#Lk07rV^W$}WlsWD zd27Cc=jsSI9b^RF6uby{u_AZlPcO)r+|^*HmaHq;2wClx_VlLwj71l6=R9LcmF3m8 zmftB2B6$)N1AWIxM0B&52ym%b-ibZs6$gdd2Z=V&77ozxG;)3tq;908IN7jR_Py{t zKzYHRsICWg`m)g|w5qM|^7;l^SjA~{ibY& zI-@BjQ4whJhb-cAp6P#XyS|^N5RPM6Ir-Art8q>_<2ea^o)I=zITZ{OLp;j5QnC2# zY$R$%H}=k8Q@_14KBxnK#jMf$_51>14x;&>n|NtIr}G*1Jn}aEo@>Ks7qkL8HNIf& z_DF7}W<*=H%{ z?G?KBJ`*}{j!@kqI7ws1^U;S&sO3?Rp3xhOFT*@eFb|{H1XX(!6MZ`gnj9bi?=qLR=vB^U@gxYKf3~K^d{}RhHAOeM&hr-efEkJ zU6dz!f0tvA*4?G;HDstF@1?0oDg%OHiz|_>(praSTHxdf5!#`wmdn%yvX zM0#~NgXu&F>vsRFeXJt5h~(99NN1!XB41O*(3SzNVJ!}u;8M9L6YQ_x+S2{-1qs(| zPMgD|LgtcUB1XIE1bR$FjUy7cd4BxjLVGYIp$hg2kOUhr49dtFnGequNSzR9F-+vg z11}He6F53dpgMVdNHMvc#1z+-9nN?P4X(3R+i%E2$~Q61cEJWJ{yG%3Vre#|O|I_g zHg_Kd==*1D#dk8qsfd;?iW{+BhC=hz2`QorHX}IDbnzGwF8T1PG5KT`k?IW514D;I zQgCQfp0%yzM_cK%Z8^Ogx#Q=jKV|mxb99uFzE6SonNitK?KP^F3iGbc|SZRHR=#<+vtZ*0nhXu+I;}F!O;IF>50iCbB)f`PJzo%p}D>0V5 zvU8t$6C(FI!S+L(`-1n-O5s%!;!z9vz(uB3i`pqqp ztJT#Zul69(K$S=C1--E1^-C-#$qY+DHXl}0Jscj4^im@*Ug=SMovG4ugt{%Vt7?Fc ztqI+-IoqOI{ck?Kt1}mX$052CWx$hZ2;S`i*HAA^DiZfG9vYu%yfM{8c@@Xw_r~S! 
zeWR6!Pz@CC%S(Pwd0DFYv_eC*nvJ?8p+R4t4XTWdUnX{uPC%$)E7|-re7`mvw*1*Z zN(j)PKrPr;1M6*EX{3twlna@MCV{!iA>vw1MBq8Mrm-AaW8Y%>Haha!madnAf-!9= zllMJAyDiNgs4+z|*?8f@5p#7=#{oGs8}5^_dnV8bRL^^o>;ms5z;WJ9tL3IYnt>)b zImDuz*0vp&d&r7i zR4&MsL|yWZyJ@SRwMwoE51W}3Mib1Yq}EC0Ci4yEztS^*{OaN@#10Km008WJiU#FB zRjO__R__Ane>137Xeq?4aG`b_YoQKH>hi1;`OqLfOpz(nh@=A)T;ai)8CAII&`T54 zxnA%6#lK-%@DG2eU`mmlSjgb+op!t1+Jh>0C!aIv6>1N>-NM$T%@*{%D|Q@EWbxdI zMUo2q7E>L~PX7cps!4=-&Tzc5?(C+E-T$=R&gZ-zsv@s@Cz*-?Hyn$<$8iG(t3#c{ z(&Xdxr8|NKz*((dFLf_Z#~;4Tx;P6h+=zWvl3_DT7z5n-v>H%x9A>(VuP7X7xX>Cl z(QMbu7oJapH(i@89G+!A;-T)|K#N1PM0E>d6O?L33r-R>?pV>8ZfG-TFug{9F)pQJ zi?Gh4S#S5hemXm&T%nqEsz+a>If~UAUH6{)*zp$a9AcAuf!|b%9A10)V^VS)%boyM zEabbc48U3kSF{0ZyUZk}9*`^K8WHh zTZJvEH=DPCz;#?I0HTmSYvSbDqF1NOju7A%I6#tZ!f*FZ4X5*$Vx30QNrsC1UmT(A zEV@%qLdtsh`4|OAvAK(kD>@Tq;h*f?-I1)htg2l(;RiCjp73}4I_1@o;RnmTAsh4I zk-VWs31@E0QlB~XprjNsgMKma%q2vZJE1SHzs>BJ?XZ5Zfzk);M}YDVX|K{3q%S=B z@5pmiXc!Iobr1s8{ce+BpwpN*?|cCTD-2*=_RL}qx9!LCa!q?7dUuzz1acM0pTUSn zGLmYO8mliMRQ$qd2?8TbUdpuZl z;Zm?whIT&eHc}i&}Ec-vdsFpsc4KYsqCQla%AEfvG}Yn9rD1Xb2=CWdbt++d-2 zx7kmLCEP6>1+Z6bl1aQ}YH?tG2;V!AD1G@+S6l^3wzhrwJqYzh4j*)v2}28t1hwR@ z*ghz7QV;G6H+&uMAumfT>6be%0b@Y$pcRd?cOz|FlY(K6^YKVujMygUqi33A8n2v= zNBgJ)z2@dtQr6<1tMW=xfw}CyHj3)@F zL~*5}*3jCjK4)S7a|Yx1`?;3j&}$>9Om01=8Q=*1o?=!4qEK4^VtaWMbl`h zqNiCTkDlo{0oV5M`rlY+!26diPe+Bz<_lJMCc>@4{3~iDY38vWZ>wlL2d-wOSaCLhZX#Y!Y z-pJ^$?-?192$E2(g*xq5J z)_`}j+~29N(0@_Q>}@O@Ke<`BzK8jr8MkdGBy-eEuuce`$n2*ni5||K14DRDWUr zp=$s4ru{?xr(6B+)U5xc{y$CY|4#q!O2EIi!Ph%lixZ_iONfiWKVoR0jay-(NQG LCzn+IpRNA`+u!Y8 literal 0 HcmV?d00001 diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 24905f1c97b21..d1f89e8d16dd6 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -998,8 +998,8 @@ def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False): # by runJob() in order to avoid having to pass a Python lambda into # SparkContext#runJob. mappedRDD = rdd.mapPartitions(partitionFunc) - port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions) - return list(_load_from_socket(port, mappedRDD._jrdd_deserializer)) + sock_info = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions) + return list(_load_from_socket(sock_info, mappedRDD._jrdd_deserializer)) def show_profiles(self): """ Print the profile stats to stdout """ diff --git a/python/pyspark/daemon.py b/python/pyspark/daemon.py index 7f06d4288c872..e7d1e718c934a 100644 --- a/python/pyspark/daemon.py +++ b/python/pyspark/daemon.py @@ -29,7 +29,7 @@ from signal import SIGHUP, SIGTERM, SIGCHLD, SIG_DFL, SIG_IGN, SIGINT from pyspark.worker import main as worker_main -from pyspark.serializers import read_int, write_int +from pyspark.serializers import read_int, write_int, write_with_length, UTF8Deserializer def compute_real_exit_code(exit_code): @@ -40,7 +40,7 @@ def compute_real_exit_code(exit_code): return 1 -def worker(sock): +def worker(sock, authenticated): """ Called by a worker process after the fork(). """ @@ -56,6 +56,18 @@ def worker(sock): # otherwise writes also cause a seek that makes us miss data on the read side. 
infile = os.fdopen(os.dup(sock.fileno()), "rb", 65536) outfile = os.fdopen(os.dup(sock.fileno()), "wb", 65536) + + if not authenticated: + client_secret = UTF8Deserializer().loads(infile) + if os.environ["PYTHON_WORKER_FACTORY_SECRET"] == client_secret: + write_with_length("ok".encode("utf-8"), outfile) + outfile.flush() + else: + write_with_length("err".encode("utf-8"), outfile) + outfile.flush() + sock.close() + return 1 + exit_code = 0 try: worker_main(infile, outfile) @@ -153,8 +165,11 @@ def handle_sigterm(*args): write_int(os.getpid(), outfile) outfile.flush() outfile.close() + authenticated = False while True: - code = worker(sock) + code = worker(sock, authenticated) + if code == 0: + authenticated = True if not reuse or code: # wait for closing try: diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index 3e704fe9bf6ec..0afbe9dc6aa3e 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -21,16 +21,19 @@ import select import signal import shlex +import shutil import socket import platform +import tempfile +import time from subprocess import Popen, PIPE if sys.version >= '3': xrange = range -from py4j.java_gateway import java_import, JavaGateway, GatewayClient +from py4j.java_gateway import java_import, JavaGateway, GatewayParameters from pyspark.find_spark_home import _find_spark_home -from pyspark.serializers import read_int +from pyspark.serializers import read_int, write_with_length, UTF8Deserializer def launch_gateway(conf=None): @@ -41,6 +44,7 @@ def launch_gateway(conf=None): """ if "PYSPARK_GATEWAY_PORT" in os.environ: gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"]) + gateway_secret = os.environ["PYSPARK_GATEWAY_SECRET"] else: SPARK_HOME = _find_spark_home() # Launch the Py4j gateway using Spark's run command so that we pick up the @@ -59,40 +63,40 @@ def launch_gateway(conf=None): ]) command = command + shlex.split(submit_args) - # Start a socket that will be used by PythonGatewayServer to communicate its port to us - callback_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - callback_socket.bind(('127.0.0.1', 0)) - callback_socket.listen(1) - callback_host, callback_port = callback_socket.getsockname() - env = dict(os.environ) - env['_PYSPARK_DRIVER_CALLBACK_HOST'] = callback_host - env['_PYSPARK_DRIVER_CALLBACK_PORT'] = str(callback_port) - - # Launch the Java gateway. 
- # We open a pipe to stdin so that the Java gateway can die when the pipe is broken - if not on_windows: - # Don't send ctrl-c / SIGINT to the Java gateway: - def preexec_func(): - signal.signal(signal.SIGINT, signal.SIG_IGN) - proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env) - else: - # preexec_fn not supported on Windows - proc = Popen(command, stdin=PIPE, env=env) - - gateway_port = None - # We use select() here in order to avoid blocking indefinitely if the subprocess dies - # before connecting - while gateway_port is None and proc.poll() is None: - timeout = 1 # (seconds) - readable, _, _ = select.select([callback_socket], [], [], timeout) - if callback_socket in readable: - gateway_connection = callback_socket.accept()[0] - # Determine which ephemeral port the server started on: - gateway_port = read_int(gateway_connection.makefile(mode="rb")) - gateway_connection.close() - callback_socket.close() - if gateway_port is None: - raise Exception("Java gateway process exited before sending the driver its port number") + # Create a temporary directory where the gateway server should write the connection + # information. + conn_info_dir = tempfile.mkdtemp() + try: + fd, conn_info_file = tempfile.mkstemp(dir=conn_info_dir) + os.close(fd) + os.unlink(conn_info_file) + + env = dict(os.environ) + env["_PYSPARK_DRIVER_CONN_INFO_PATH"] = conn_info_file + + # Launch the Java gateway. + # We open a pipe to stdin so that the Java gateway can die when the pipe is broken + if not on_windows: + # Don't send ctrl-c / SIGINT to the Java gateway: + def preexec_func(): + signal.signal(signal.SIGINT, signal.SIG_IGN) + proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env) + else: + # preexec_fn not supported on Windows + proc = Popen(command, stdin=PIPE, env=env) + + # Wait for the file to appear, or for the process to exit, whichever happens first. + while not proc.poll() and not os.path.isfile(conn_info_file): + time.sleep(0.1) + + if not os.path.isfile(conn_info_file): + raise Exception("Java gateway process exited before sending its port number") + + with open(conn_info_file, "rb") as info: + gateway_port = read_int(info) + gateway_secret = UTF8Deserializer().loads(info) + finally: + shutil.rmtree(conn_info_dir) # In Windows, ensure the Java child processes do not linger after Python has exited. # In UNIX-based systems, the child process can kill itself on broken pipe (i.e. when @@ -111,7 +115,9 @@ def killChild(): atexit.register(killChild) # Connect to the gateway - gateway = JavaGateway(GatewayClient(port=gateway_port), auto_convert=True) + gateway = JavaGateway( + gateway_parameters=GatewayParameters(port=gateway_port, auth_token=gateway_secret, + auto_convert=True)) # Import the classes used by PySpark java_import(gateway.jvm, "org.apache.spark.SparkConf") @@ -126,3 +132,16 @@ def killChild(): java_import(gateway.jvm, "scala.Tuple2") return gateway + + +def do_server_auth(conn, auth_secret): + """ + Performs the authentication protocol defined by the SocketAuthHelper class on the given + file-like object 'conn'. 
+ """ + write_with_length(auth_secret.encode("utf-8"), conn) + conn.flush() + reply = UTF8Deserializer().loads(conn) + if reply != "ok": + conn.close() + raise Exception("Unexpected reply from iterator server.") diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 9a6010e87679c..acd0fa28958ed 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -39,9 +39,11 @@ else: from itertools import imap as map, ifilter as filter +from pyspark.java_gateway import do_server_auth from pyspark.serializers import NoOpSerializer, CartesianDeserializer, \ BatchedSerializer, CloudPickleSerializer, PairDeserializer, \ - PickleSerializer, pack_long, AutoBatchedSerializer + PickleSerializer, pack_long, AutoBatchedSerializer, write_with_length, \ + UTF8Deserializer from pyspark.join import python_join, python_left_outer_join, \ python_right_outer_join, python_full_outer_join, python_cogroup from pyspark.statcounter import StatCounter @@ -135,7 +137,8 @@ def _parse_memory(s): return int(float(s[:-1]) * units[s[-1].lower()]) -def _load_from_socket(port, serializer): +def _load_from_socket(sock_info, serializer): + port, auth_secret = sock_info sock = None # Support for both IPv4 and IPv6. # On most of IPv6-ready systems, IPv6 will take precedence. @@ -155,8 +158,12 @@ def _load_from_socket(port, serializer): # The RDD materialization time is unpredicable, if we set a timeout for socket reading # operation, it will very possibly fail. See SPARK-18281. sock.settimeout(None) + + sockfile = sock.makefile("rwb", 65536) + do_server_auth(sockfile, auth_secret) + # The socket will be automatically closed when garbage-collected. - return serializer.load_stream(sock.makefile("rb", 65536)) + return serializer.load_stream(sockfile) def ignore_unicode_prefix(f): @@ -821,8 +828,8 @@ def collect(self): to be small, as all the data is loaded into the driver's memory. 
""" with SCCallSiteSync(self.context) as css: - port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd()) - return list(_load_from_socket(port, self._jrdd_deserializer)) + sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd()) + return list(_load_from_socket(sock_info, self._jrdd_deserializer)) def reduce(self, f): """ @@ -2379,8 +2386,8 @@ def toLocalIterator(self): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] """ with SCCallSiteSync(self.context) as css: - port = self.ctx._jvm.PythonRDD.toLocalIteratorAndServe(self._jrdd.rdd()) - return _load_from_socket(port, self._jrdd_deserializer) + sock_info = self.ctx._jvm.PythonRDD.toLocalIteratorAndServe(self._jrdd.rdd()) + return _load_from_socket(sock_info, self._jrdd_deserializer) def _prepare_for_python_RDD(sc, command): diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 930d177fa5b8f..9bb0dca95788c 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -463,8 +463,8 @@ def collect(self): [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] """ with SCCallSiteSync(self._sc) as css: - port = self._jdf.collectToPython() - return list(_load_from_socket(port, BatchedSerializer(PickleSerializer()))) + sock_info = self._jdf.collectToPython() + return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer()))) @ignore_unicode_prefix @since(2.0) @@ -477,8 +477,8 @@ def toLocalIterator(self): [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] """ with SCCallSiteSync(self._sc) as css: - port = self._jdf.toPythonIterator() - return _load_from_socket(port, BatchedSerializer(PickleSerializer())) + sock_info = self._jdf.toPythonIterator() + return _load_from_socket(sock_info, BatchedSerializer(PickleSerializer())) @ignore_unicode_prefix @since(1.3) @@ -1999,8 +1999,8 @@ def _collectAsArrow(self): .. note:: Experimental. """ with SCCallSiteSync(self._sc) as css: - port = self._jdf.collectAsArrowToPython() - return list(_load_from_socket(port, ArrowSerializer())) + sock_info = self._jdf.collectAsArrowToPython() + return list(_load_from_socket(sock_info, ArrowSerializer())) ########################################################################################## # Pandas compatibility diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 8c41c3d1d6183..dec06420dac39 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -27,6 +27,7 @@ from pyspark.accumulators import _accumulatorRegistry from pyspark.broadcast import Broadcast, _broadcastRegistry +from pyspark.java_gateway import do_server_auth from pyspark.taskcontext import TaskContext from pyspark.files import SparkFiles from pyspark.rdd import PythonEvalType @@ -260,9 +261,11 @@ def process(): if __name__ == '__main__': - # Read a local port to connect to from stdin - java_port = int(sys.stdin.readline()) + # Read information about how to connect back to the JVM from the environment. 
+ java_port = int(os.environ["PYTHON_WORKER_FACTORY_PORT"]) + auth_secret = os.environ["PYTHON_WORKER_FACTORY_SECRET"] sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(("127.0.0.1", java_port)) sock_file = sock.makefile("rwb", 65536) + do_server_auth(sock_file, auth_secret) main(sock_file, sock_file) diff --git a/python/setup.py b/python/setup.py index 6a98401941d8d..80cf0d7b329bd 100644 --- a/python/setup.py +++ b/python/setup.py @@ -201,7 +201,7 @@ def _supports_symlinks(): 'pyspark.examples.src.main.python': ['*.py', '*/*.py']}, scripts=scripts, license='http://www.apache.org/licenses/LICENSE-2.0', - install_requires=['py4j==0.10.6'], + install_requires=['py4j==0.10.7'], setup_requires=['pypandoc'], extras_require={ 'ml': ['numpy>=1.7'], diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 28087dee831d1..c3ba48efe7de8 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -1182,7 +1182,7 @@ private[spark] class Client( val pyArchivesFile = new File(pyLibPath, "pyspark.zip") require(pyArchivesFile.exists(), s"$pyArchivesFile not found; cannot run pyspark application in YARN mode.") - val py4jFile = new File(pyLibPath, "py4j-0.10.6-src.zip") + val py4jFile = new File(pyLibPath, "py4j-0.10.7-src.zip") require(py4jFile.exists(), s"$py4jFile not found; cannot run pyspark application in YARN mode.") Seq(pyArchivesFile.getAbsolutePath(), py4jFile.getAbsolutePath()) diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index 33d400a5b1b2e..6a974d558403f 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -257,7 +257,7 @@ class YarnClusterSuite extends BaseYarnClusterSuite { // needed locations. 
val sparkHome = sys.props("spark.test.home") val pythonPath = Seq( - s"$sparkHome/python/lib/py4j-0.10.6-src.zip", + s"$sparkHome/python/lib/py4j-0.10.7-src.zip", s"$sparkHome/python") val extraEnvVars = Map( "PYSPARK_ARCHIVES_PATH" -> pythonPath.map("local:" + _).mkString(File.pathSeparator), diff --git a/sbin/spark-config.sh b/sbin/spark-config.sh index bac154e10ae62..bf3da18c3706e 100755 --- a/sbin/spark-config.sh +++ b/sbin/spark-config.sh @@ -28,6 +28,6 @@ export SPARK_CONF_DIR="${SPARK_CONF_DIR:-"${SPARK_HOME}/conf"}" # Add the PySpark classes to the PYTHONPATH: if [ -z "${PYSPARK_PYTHONPATH_SET}" ]; then export PYTHONPATH="${SPARK_HOME}/python:${PYTHONPATH}" - export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.6-src.zip:${PYTHONPATH}" + export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.7-src.zip:${PYTHONPATH}" export PYSPARK_PYTHONPATH_SET=1 fi diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 917168162b236..0058e6a25d911 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -3187,7 +3187,7 @@ class Dataset[T] private[sql]( EvaluatePython.javaToPython(rdd) } - private[sql] def collectToPython(): Int = { + private[sql] def collectToPython(): Array[Any] = { EvaluatePython.registerPicklers() withAction("collectToPython", queryExecution) { plan => val toJava: (Any) => Any = EvaluatePython.toJava(_, schema) @@ -3200,7 +3200,7 @@ class Dataset[T] private[sql]( /** * Collect a Dataset as ArrowPayload byte arrays and serve to PySpark. */ - private[sql] def collectAsArrowToPython(): Int = { + private[sql] def collectAsArrowToPython(): Array[Any] = { withAction("collectAsArrowToPython", queryExecution) { plan => val iter: Iterator[Array[Byte]] = toArrowPayload(plan).collect().iterator.map(_.asPythonSerializable) @@ -3208,7 +3208,7 @@ class Dataset[T] private[sql]( } } - private[sql] def toPythonIterator(): Int = { + private[sql] def toPythonIterator(): Array[Any] = { withNewExecutionId { PythonRDD.toLocalIteratorAndServe(javaToPython.rdd) } From 16cd9ac5264831e061c033b26fe1173ebc88e5d1 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 17 Apr 2018 13:29:43 -0700 Subject: [PATCH 441/566] [SPARKR] Match pyspark features in SparkR communication protocol. 
(cherry picked from commit 628c7b517969c4a7ccb26ea67ab3dd61266073ca) Signed-off-by: Marcelo Vanzin --- R/pkg/R/client.R | 4 +- R/pkg/R/deserialize.R | 10 +++- R/pkg/R/sparkR.R | 39 +++++++++++-- R/pkg/inst/worker/daemon.R | 4 +- R/pkg/inst/worker/worker.R | 5 +- .../org/apache/spark/api/r/RAuthHelper.scala | 38 +++++++++++++ .../org/apache/spark/api/r/RBackend.scala | 43 +++++++++++++-- .../spark/api/r/RBackendAuthHandler.scala | 55 +++++++++++++++++++ .../org/apache/spark/api/r/RRunner.scala | 35 ++++++++---- .../org/apache/spark/deploy/RRunner.scala | 6 +- 10 files changed, 210 insertions(+), 29 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/api/r/RAuthHelper.scala create mode 100644 core/src/main/scala/org/apache/spark/api/r/RBackendAuthHandler.scala diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R index 9d82814211bc5..7244cc9f9e38e 100644 --- a/R/pkg/R/client.R +++ b/R/pkg/R/client.R @@ -19,7 +19,7 @@ # Creates a SparkR client connection object # if one doesn't already exist -connectBackend <- function(hostname, port, timeout) { +connectBackend <- function(hostname, port, timeout, authSecret) { if (exists(".sparkRcon", envir = .sparkREnv)) { if (isOpen(.sparkREnv[[".sparkRCon"]])) { cat("SparkRBackend client connection already exists\n") @@ -29,7 +29,7 @@ connectBackend <- function(hostname, port, timeout) { con <- socketConnection(host = hostname, port = port, server = FALSE, blocking = TRUE, open = "wb", timeout = timeout) - + doServerAuth(con, authSecret) assign(".sparkRCon", con, envir = .sparkREnv) con } diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index a90f7d381026b..cb03f1667629f 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -60,14 +60,18 @@ readTypedObject <- function(con, type) { stop(paste("Unsupported type for deserialization", type))) } -readString <- function(con) { - stringLen <- readInt(con) - raw <- readBin(con, raw(), stringLen, endian = "big") +readStringData <- function(con, len) { + raw <- readBin(con, raw(), len, endian = "big") string <- rawToChar(raw) Encoding(string) <- "UTF-8" string } +readString <- function(con) { + stringLen <- readInt(con) + readStringData(con, stringLen) +} + readInt <- function(con) { readBin(con, integer(), n = 1, endian = "big") } diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 965471f3b07a0..7430d849cba89 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -161,6 +161,10 @@ sparkR.sparkContext <- function( " please use the --packages commandline instead", sep = ",")) } backendPort <- existingPort + authSecret <- Sys.getenv("SPARKR_BACKEND_AUTH_SECRET") + if (nchar(authSecret) == 0) { + stop("Auth secret not provided in environment.") + } } else { path <- tempfile(pattern = "backend_port") submitOps <- getClientModeSparkSubmitOpts( @@ -189,16 +193,27 @@ sparkR.sparkContext <- function( monitorPort <- readInt(f) rLibPath <- readString(f) connectionTimeout <- readInt(f) + + # Don't use readString() so that we can provide a useful + # error message if the R and Java versions are mismatched. + authSecretLen = readInt(f) + if (length(authSecretLen) == 0 || authSecretLen == 0) { + stop("Unexpected EOF in JVM connection data. 
Mismatched versions?") + } + authSecret <- readStringData(f, authSecretLen) close(f) file.remove(path) if (length(backendPort) == 0 || backendPort == 0 || length(monitorPort) == 0 || monitorPort == 0 || - length(rLibPath) != 1) { + length(rLibPath) != 1 || length(authSecret) == 0) { stop("JVM failed to launch") } - assign(".monitorConn", - socketConnection(port = monitorPort, timeout = connectionTimeout), - envir = .sparkREnv) + + monitorConn <- socketConnection(port = monitorPort, blocking = TRUE, + timeout = connectionTimeout, open = "wb") + doServerAuth(monitorConn, authSecret) + + assign(".monitorConn", monitorConn, envir = .sparkREnv) assign(".backendLaunched", 1, envir = .sparkREnv) if (rLibPath != "") { assign(".libPath", rLibPath, envir = .sparkREnv) @@ -208,7 +223,7 @@ sparkR.sparkContext <- function( .sparkREnv$backendPort <- backendPort tryCatch({ - connectBackend("localhost", backendPort, timeout = connectionTimeout) + connectBackend("localhost", backendPort, timeout = connectionTimeout, authSecret = authSecret) }, error = function(err) { stop("Failed to connect JVM\n") @@ -694,3 +709,17 @@ sparkCheckInstall <- function(sparkHome, master, deployMode) { NULL } } + +# Utility function for sending auth data over a socket and checking the server's reply. +doServerAuth <- function(con, authSecret) { + if (nchar(authSecret) == 0) { + stop("Auth secret not provided.") + } + writeString(con, authSecret) + flush(con) + reply <- readString(con) + if (reply != "ok") { + close(con) + stop("Unexpected reply from server.") + } +} diff --git a/R/pkg/inst/worker/daemon.R b/R/pkg/inst/worker/daemon.R index 2e31dc5f728cd..fb9db63b07cd0 100644 --- a/R/pkg/inst/worker/daemon.R +++ b/R/pkg/inst/worker/daemon.R @@ -28,7 +28,9 @@ suppressPackageStartupMessages(library(SparkR)) port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT")) inputCon <- socketConnection( - port = port, open = "rb", blocking = TRUE, timeout = connectionTimeout) + port = port, open = "wb", blocking = TRUE, timeout = connectionTimeout) + +SparkR:::doServerAuth(inputCon, Sys.getenv("SPARKR_WORKER_SECRET")) # Waits indefinitely for a socket connecion by default. selectTimeout <- NULL diff --git a/R/pkg/inst/worker/worker.R b/R/pkg/inst/worker/worker.R index 00789d815bba8..ba458d2b9ddfb 100644 --- a/R/pkg/inst/worker/worker.R +++ b/R/pkg/inst/worker/worker.R @@ -100,9 +100,12 @@ suppressPackageStartupMessages(library(SparkR)) port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT")) inputCon <- socketConnection( - port = port, blocking = TRUE, open = "rb", timeout = connectionTimeout) + port = port, blocking = TRUE, open = "wb", timeout = connectionTimeout) +SparkR:::doServerAuth(inputCon, Sys.getenv("SPARKR_WORKER_SECRET")) + outputCon <- socketConnection( port = port, blocking = TRUE, open = "wb", timeout = connectionTimeout) +SparkR:::doServerAuth(outputCon, Sys.getenv("SPARKR_WORKER_SECRET")) # read the index of the current partition inside the RDD partition <- SparkR:::readInt(inputCon) diff --git a/core/src/main/scala/org/apache/spark/api/r/RAuthHelper.scala b/core/src/main/scala/org/apache/spark/api/r/RAuthHelper.scala new file mode 100644 index 0000000000000..ac6826a9ec774 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/api/r/RAuthHelper.scala @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.api.r + +import java.io.{DataInputStream, DataOutputStream} +import java.net.Socket + +import org.apache.spark.SparkConf +import org.apache.spark.security.SocketAuthHelper + +private[spark] class RAuthHelper(conf: SparkConf) extends SocketAuthHelper(conf) { + + override protected def readUtf8(s: Socket): String = { + SerDe.readString(new DataInputStream(s.getInputStream())) + } + + override protected def writeUtf8(str: String, s: Socket): Unit = { + val out = s.getOutputStream() + SerDe.writeString(new DataOutputStream(out), str) + out.flush() + } + +} diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackend.scala b/core/src/main/scala/org/apache/spark/api/r/RBackend.scala index 2d1152a036449..3b2e809408e0f 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RBackend.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RBackend.scala @@ -17,8 +17,8 @@ package org.apache.spark.api.r -import java.io.{DataOutputStream, File, FileOutputStream, IOException} -import java.net.{InetAddress, InetSocketAddress, ServerSocket} +import java.io.{DataInputStream, DataOutputStream, File, FileOutputStream, IOException} +import java.net.{InetAddress, InetSocketAddress, ServerSocket, Socket} import java.util.concurrent.TimeUnit import io.netty.bootstrap.ServerBootstrap @@ -32,6 +32,8 @@ import io.netty.handler.timeout.ReadTimeoutHandler import org.apache.spark.SparkConf import org.apache.spark.internal.Logging +import org.apache.spark.network.util.JavaUtils +import org.apache.spark.util.Utils /** * Netty-based backend server that is used to communicate between R and Java. @@ -45,7 +47,7 @@ private[spark] class RBackend { /** Tracks JVM objects returned to R for this RBackend instance. 
*/ private[r] val jvmObjectTracker = new JVMObjectTracker - def init(): Int = { + def init(): (Int, RAuthHelper) = { val conf = new SparkConf() val backendConnectionTimeout = conf.getInt( "spark.r.backendConnectionTimeout", SparkRDefaults.DEFAULT_CONNECTION_TIMEOUT) @@ -53,6 +55,7 @@ private[spark] class RBackend { conf.getInt("spark.r.numRBackendThreads", SparkRDefaults.DEFAULT_NUM_RBACKEND_THREADS)) val workerGroup = bossGroup val handler = new RBackendHandler(this) + val authHelper = new RAuthHelper(conf) bootstrap = new ServerBootstrap() .group(bossGroup, workerGroup) @@ -71,13 +74,16 @@ private[spark] class RBackend { new LengthFieldBasedFrameDecoder(Integer.MAX_VALUE, 0, 4, 0, 4)) .addLast("decoder", new ByteArrayDecoder()) .addLast("readTimeoutHandler", new ReadTimeoutHandler(backendConnectionTimeout)) + .addLast(new RBackendAuthHandler(authHelper.secret)) .addLast("handler", handler) } }) channelFuture = bootstrap.bind(new InetSocketAddress("localhost", 0)) channelFuture.syncUninterruptibly() - channelFuture.channel().localAddress().asInstanceOf[InetSocketAddress].getPort() + + val port = channelFuture.channel().localAddress().asInstanceOf[InetSocketAddress].getPort() + (port, authHelper) } def run(): Unit = { @@ -116,7 +122,7 @@ private[spark] object RBackend extends Logging { val sparkRBackend = new RBackend() try { // bind to random port - val boundPort = sparkRBackend.init() + val (boundPort, authHelper) = sparkRBackend.init() val serverSocket = new ServerSocket(0, 1, InetAddress.getByName("localhost")) val listenPort = serverSocket.getLocalPort() // Connection timeout is set by socket client. To make it configurable we will pass the @@ -133,6 +139,7 @@ private[spark] object RBackend extends Logging { dos.writeInt(listenPort) SerDe.writeString(dos, RUtils.rPackages.getOrElse("")) dos.writeInt(backendConnectionTimeout) + SerDe.writeString(dos, authHelper.secret) dos.close() f.renameTo(new File(path)) @@ -144,12 +151,35 @@ private[spark] object RBackend extends Logging { val buf = new Array[Byte](1024) // shutdown JVM if R does not connect back in 10 seconds serverSocket.setSoTimeout(10000) + + // Wait for the R process to connect back, ignoring any failed auth attempts. Allow + // a max number of connection attempts to avoid looping forever. try { - val inSocket = serverSocket.accept() + var remainingAttempts = 10 + var inSocket: Socket = null + while (inSocket == null) { + inSocket = serverSocket.accept() + try { + authHelper.authClient(inSocket) + } catch { + case e: Exception => + remainingAttempts -= 1 + if (remainingAttempts == 0) { + val msg = "Too many failed authentication attempts." + logError(msg) + throw new IllegalStateException(msg) + } + logInfo("Client connection failed authentication.") + inSocket = null + } + } + serverSocket.close() + // wait for the end of socket, closed if R process die inSocket.getInputStream().read(buf) } finally { + serverSocket.close() sparkRBackend.close() System.exit(0) } @@ -165,4 +195,5 @@ private[spark] object RBackend extends Logging { } System.exit(0) } + } diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackendAuthHandler.scala b/core/src/main/scala/org/apache/spark/api/r/RBackendAuthHandler.scala new file mode 100644 index 0000000000000..4162e4a6c7476 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/api/r/RBackendAuthHandler.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.api.r + +import java.io.{ByteArrayOutputStream, DataOutputStream} +import java.nio.charset.StandardCharsets.UTF_8 + +import io.netty.channel.{Channel, ChannelHandlerContext, SimpleChannelInboundHandler} + +import org.apache.spark.internal.Logging +import org.apache.spark.util.Utils + +/** + * Authentication handler for connections from the R process. + */ +private class RBackendAuthHandler(secret: String) + extends SimpleChannelInboundHandler[Array[Byte]] with Logging { + + override def channelRead0(ctx: ChannelHandlerContext, msg: Array[Byte]): Unit = { + // The R code adds a null terminator to serialized strings, so ignore it here. + val clientSecret = new String(msg, 0, msg.length - 1, UTF_8) + try { + require(secret == clientSecret, "Auth secret mismatch.") + ctx.pipeline().remove(this) + writeReply("ok", ctx.channel()) + } catch { + case e: Exception => + logInfo("Authentication failure.", e) + writeReply("err", ctx.channel()) + ctx.close() + } + } + + private def writeReply(reply: String, chan: Channel): Unit = { + val out = new ByteArrayOutputStream() + SerDe.writeString(new DataOutputStream(out), reply) + chan.writeAndFlush(out.toByteArray()) + } + +} diff --git a/core/src/main/scala/org/apache/spark/api/r/RRunner.scala b/core/src/main/scala/org/apache/spark/api/r/RRunner.scala index 88118392003e8..e7fdc3963945a 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRunner.scala @@ -74,14 +74,19 @@ private[spark] class RRunner[U]( // the socket used to send out the input of task serverSocket.setSoTimeout(10000) - val inSocket = serverSocket.accept() - startStdinThread(inSocket.getOutputStream(), inputIterator, partitionIndex) - - // the socket used to receive the output of task - val outSocket = serverSocket.accept() - val inputStream = new BufferedInputStream(outSocket.getInputStream) - dataStream = new DataInputStream(inputStream) - serverSocket.close() + dataStream = try { + val inSocket = serverSocket.accept() + RRunner.authHelper.authClient(inSocket) + startStdinThread(inSocket.getOutputStream(), inputIterator, partitionIndex) + + // the socket used to receive the output of task + val outSocket = serverSocket.accept() + RRunner.authHelper.authClient(outSocket) + val inputStream = new BufferedInputStream(outSocket.getInputStream) + new DataInputStream(inputStream) + } finally { + serverSocket.close() + } try { return new Iterator[U] { @@ -315,6 +320,11 @@ private[r] object RRunner { private[this] var errThread: BufferedStreamThread = _ private[this] var daemonChannel: DataOutputStream = _ + private lazy val authHelper = { + val conf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf()) + new RAuthHelper(conf) + } + /** * Start a thread to print the process's stderr to ours */ @@ -349,6 
+359,7 @@ private[r] object RRunner { pb.environment().put("SPARKR_BACKEND_CONNECTION_TIMEOUT", rConnectionTimeout.toString) pb.environment().put("SPARKR_SPARKFILES_ROOT_DIR", SparkFiles.getRootDirectory()) pb.environment().put("SPARKR_IS_RUNNING_ON_WORKER", "TRUE") + pb.environment().put("SPARKR_WORKER_SECRET", authHelper.secret) pb.redirectErrorStream(true) // redirect stderr into stdout val proc = pb.start() val errThread = startStdoutThread(proc) @@ -370,8 +381,12 @@ private[r] object RRunner { // the socket used to send out the input of task serverSocket.setSoTimeout(10000) val sock = serverSocket.accept() - daemonChannel = new DataOutputStream(new BufferedOutputStream(sock.getOutputStream)) - serverSocket.close() + try { + authHelper.authClient(sock) + daemonChannel = new DataOutputStream(new BufferedOutputStream(sock.getOutputStream)) + } finally { + serverSocket.close() + } } try { daemonChannel.writeInt(port) diff --git a/core/src/main/scala/org/apache/spark/deploy/RRunner.scala b/core/src/main/scala/org/apache/spark/deploy/RRunner.scala index 6eb53a8252205..e86b362639e57 100644 --- a/core/src/main/scala/org/apache/spark/deploy/RRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/RRunner.scala @@ -68,10 +68,13 @@ object RRunner { // Java system properties etc. val sparkRBackend = new RBackend() @volatile var sparkRBackendPort = 0 + @volatile var sparkRBackendSecret: String = null val initialized = new Semaphore(0) val sparkRBackendThread = new Thread("SparkR backend") { override def run() { - sparkRBackendPort = sparkRBackend.init() + val (port, authHelper) = sparkRBackend.init() + sparkRBackendPort = port + sparkRBackendSecret = authHelper.secret initialized.release() sparkRBackend.run() } @@ -91,6 +94,7 @@ object RRunner { env.put("SPARKR_PACKAGE_DIR", rPackageDir.mkString(",")) env.put("R_PROFILE_USER", Seq(rPackageDir(0), "SparkR", "profile", "general.R").mkString(File.separator)) + env.put("SPARKR_BACKEND_AUTH_SECRET", sparkRBackendSecret) builder.redirectErrorStream(true) // Ugly but needed for stdout and stderr to synchronize val process = builder.start() From 4c49b12da512ae29e2e4b773a334abbf6a4f08f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cattilapiros=E2=80=9D?= Date: Thu, 10 May 2018 14:26:38 -0700 Subject: [PATCH 442/566] [SPARK-19181][CORE] Fixing flaky "SparkListenerSuite.local metrics" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? Sometimes "SparkListenerSuite.local metrics" test fails because the average of executorDeserializeTime is too short. As squito suggested to avoid these situations in one of the task a reference introduced to an object implementing a custom Externalizable.readExternal which sleeps 1ms before returning. ## How was this patch tested? With unit tests (and checking the effect of this change to the average with a much larger sleep time). Author: “attilapiros” Author: Attila Zsolt Piros <2017933+attilapiros@users.noreply.github.com> Closes #21280 from attilapiros/SPARK-19181. 
(cherry picked from commit 3e2600538ee477ffe3f23fba57719e035219550b) Signed-off-by: Marcelo Vanzin --- .../spark/scheduler/SparkListenerSuite.scala | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala index da6ecb82c7e42..fa47a52bbbc47 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.scheduler +import java.io.{Externalizable, IOException, ObjectInput, ObjectOutput} import java.util.concurrent.Semaphore import scala.collection.JavaConverters._ @@ -294,10 +295,13 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match val listener = new SaveStageAndTaskInfo sc.addSparkListener(listener) sc.addSparkListener(new StatsReportListener) - // just to make sure some of the tasks take a noticeable amount of time + // just to make sure some of the tasks and their deserialization take a noticeable + // amount of time + val slowDeserializable = new SlowDeserializable val w = { i: Int => if (i == 0) { Thread.sleep(100) + slowDeserializable.use() } i } @@ -583,3 +587,12 @@ private class FirehoseListenerThatAcceptsSparkConf(conf: SparkConf) extends Spar case _ => } } + +private class SlowDeserializable extends Externalizable { + + override def writeExternal(out: ObjectOutput): Unit = { } + + override def readExternal(in: ObjectInput): Unit = Thread.sleep(1) + + def use(): Unit = { } +} From 414e4e3d70caa950a63fab1c8cac3314fb961b0c Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 10 May 2018 14:41:55 -0700 Subject: [PATCH 443/566] [SPARK-10878][CORE] Fix race condition when multiple clients resolves artifacts at the same time ## What changes were proposed in this pull request? When multiple clients attempt to resolve artifacts via the `--packages` parameter, they could run into race condition when they each attempt to modify the dummy `org.apache.spark-spark-submit-parent-default.xml` file created in the default ivy cache dir. This PR changes the behavior to encode UUID in the dummy module descriptor so each client will operate on a different resolution file in the ivy cache dir. In addition, this patch changes the behavior of when and which resolution files are cleaned to prevent accumulation of resolution files in the default ivy cache dir. Since this PR is a successor of #18801, close #18801. Many codes were ported from #18801. **Many efforts were put here. I think this PR should credit to Victsm .** ## How was this patch tested? added UT into `SparkSubmitUtilsSuite` Author: Kazuaki Ishizaki Closes #21251 from kiszk/SPARK-10878. 
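As an illustrative sketch (not part of the patch; the snippet below only mirrors the naming scheme that the patched `getModuleDescriptor` introduces, with the file names taken from the new cleanup comment), the race disappears because every launch resolves against a descriptor whose name embeds a fresh UUID, so its resolution files in the ivy cache never collide with those of a concurrent client:

```scala
// Sketch only: the per-launch module name now looks like this.
import java.util.UUID

val moduleName = s"spark-submit-parent-${UUID.randomUUID().toString}"

// Resolution files written (and cleaned after resolution) for this launch:
//   org.apache.spark-<moduleName>-default.xml
//   resolved-org.apache.spark-<moduleName>-1.0.xml
//   resolved-org.apache.spark-<moduleName>-1.0.properties
```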
(cherry picked from commit d3c426a5b02abdec49ff45df12a8f11f9e473a88) Signed-off-by: Marcelo Vanzin --- .../org/apache/spark/deploy/SparkSubmit.scala | 42 ++++++++++++++----- .../spark/deploy/SparkSubmitUtilsSuite.scala | 15 +++++++ 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index deb52a4ff20e9..d1347f1fdd2be 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -22,6 +22,7 @@ import java.lang.reflect.{InvocationTargetException, Modifier, UndeclaredThrowab import java.net.URL import java.security.PrivilegedExceptionAction import java.text.ParseException +import java.util.UUID import scala.annotation.tailrec import scala.collection.mutable.{ArrayBuffer, HashMap, Map} @@ -1209,7 +1210,33 @@ private[spark] object SparkSubmitUtils { /** A nice function to use in tests as well. Values are dummy strings. */ def getModuleDescriptor: DefaultModuleDescriptor = DefaultModuleDescriptor.newDefaultInstance( - ModuleRevisionId.newInstance("org.apache.spark", "spark-submit-parent", "1.0")) + // Include UUID in module name, so multiple clients resolving maven coordinate at the same time + // do not modify the same resolution file concurrently. + ModuleRevisionId.newInstance("org.apache.spark", + s"spark-submit-parent-${UUID.randomUUID.toString}", + "1.0")) + + /** + * Clear ivy resolution from current launch. The resolution file is usually at + * ~/.ivy2/org.apache.spark-spark-submit-parent-$UUID-default.xml, + * ~/.ivy2/resolved-org.apache.spark-spark-submit-parent-$UUID-1.0.xml, and + * ~/.ivy2/resolved-org.apache.spark-spark-submit-parent-$UUID-1.0.properties. + * Since each launch will have its own resolution files created, delete them after + * each resolution to prevent accumulation of these files in the ivy cache dir. + */ + private def clearIvyResolutionFiles( + mdId: ModuleRevisionId, + ivySettings: IvySettings, + ivyConfName: String): Unit = { + val currentResolutionFiles = Seq( + s"${mdId.getOrganisation}-${mdId.getName}-$ivyConfName.xml", + s"resolved-${mdId.getOrganisation}-${mdId.getName}-${mdId.getRevision}.xml", + s"resolved-${mdId.getOrganisation}-${mdId.getName}-${mdId.getRevision}.properties" + ) + currentResolutionFiles.foreach { filename => + new File(ivySettings.getDefaultCache, filename).delete() + } + } /** * Resolves any dependencies that were supplied through maven coordinates @@ -1260,14 +1287,6 @@ private[spark] object SparkSubmitUtils { // A Module descriptor must be specified. Entries are dummy strings val md = getModuleDescriptor - // clear ivy resolution from previous launches. The resolution file is usually at - // ~/.ivy2/org.apache.spark-spark-submit-parent-default.xml. 
In between runs, this file - // leads to confusion with Ivy when the files can no longer be found at the repository - // declared in that file/ - val mdId = md.getModuleRevisionId - val previousResolution = new File(ivySettings.getDefaultCache, - s"${mdId.getOrganisation}-${mdId.getName}-$ivyConfName.xml") - if (previousResolution.exists) previousResolution.delete md.setDefaultConf(ivyConfName) @@ -1288,7 +1307,10 @@ private[spark] object SparkSubmitUtils { packagesDirectory.getAbsolutePath + File.separator + "[organization]_[artifact]-[revision](-[classifier]).[ext]", retrieveOptions.setConfs(Array(ivyConfName))) - resolveDependencyPaths(rr.getArtifacts.toArray, packagesDirectory) + val paths = resolveDependencyPaths(rr.getArtifacts.toArray, packagesDirectory) + val mdId = md.getModuleRevisionId + clearIvyResolutionFiles(mdId, ivySettings, ivyConfName) + paths } finally { System.setOut(sysOut) } diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala index eb8c203ae7751..a0f09891787e0 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala @@ -256,4 +256,19 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll { assert(jarPath.indexOf("mydep") >= 0, "should find dependency") } } + + test("SPARK-10878: test resolution files cleaned after resolving artifact") { + val main = new MavenCoordinate("my.great.lib", "mylib", "0.1") + + IvyTestUtils.withRepository(main, None, None) { repo => + val ivySettings = SparkSubmitUtils.buildIvySettings(Some(repo), Some(tempIvyPath)) + val jarPath = SparkSubmitUtils.resolveMavenCoordinates( + main.toString, + ivySettings, + isTest = true) + val r = """.*org.apache.spark-spark-submit-parent-.*""".r + assert(!ivySettings.getDefaultCache.listFiles.map(_.getName) + .exists(r.findFirstIn(_).isDefined), "resolution files should be cleaned") + } + } } From 1d598b771de3b588a2f377ae7ccf8193156641f2 Mon Sep 17 00:00:00 2001 From: cody koeninger Date: Fri, 11 May 2018 13:40:36 -0500 Subject: [PATCH 444/566] [SPARK-24067][BACKPORT-2.3][STREAMING][KAFKA] Allow non-consecutive offsets ## What changes were proposed in this pull request? Backport of the bugfix in SPARK-17147 Add a configuration spark.streaming.kafka.allowNonConsecutiveOffsets to allow streaming jobs to proceed on compacted topics (or other situations involving gaps between offsets in the log). ## How was this patch tested? Added new unit test justinrmiller has been testing this branch in production for a few weeks Author: cody koeninger Closes #21300 from koeninger/branch-2.3_kafkafix. 
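As a usage sketch (not part of the diff below; the app name is a placeholder), a streaming job reading a compacted topic would opt in through the new flag on its `SparkConf`. The key is the one read by `KafkaRDD` in this patch; when it is left at its default of false, a gap in the offsets still fails the "Got wrong record" check:

```scala
// Sketch only: enabling gap-tolerant offset handling for a compacted topic.
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setAppName("compacted-topic-stream") // placeholder name
  // Defaults to false; when false, KafkaRDD requires consecutive offsets.
  .set("spark.streaming.kafka.allowNonConsecutiveOffsets", "true")
```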
--- .../kafka010/CachedKafkaConsumer.scala | 55 +++- .../spark/streaming/kafka010/KafkaRDD.scala | 236 +++++++++++++----- .../streaming/kafka010/KafkaRDDSuite.scala | 106 ++++++++ .../streaming/kafka010/KafkaTestUtils.scala | 25 +- .../kafka010/mocks/MockScheduler.scala | 96 +++++++ .../streaming/kafka010/mocks/MockTime.scala | 51 ++++ 6 files changed, 487 insertions(+), 82 deletions(-) create mode 100644 external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockScheduler.scala create mode 100644 external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockTime.scala diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/CachedKafkaConsumer.scala b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/CachedKafkaConsumer.scala index fa3ea6131a507..aeb8c1dc342b3 100644 --- a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/CachedKafkaConsumer.scala +++ b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/CachedKafkaConsumer.scala @@ -22,10 +22,8 @@ import java.{ util => ju } import org.apache.kafka.clients.consumer.{ ConsumerConfig, ConsumerRecord, KafkaConsumer } import org.apache.kafka.common.{ KafkaException, TopicPartition } -import org.apache.spark.SparkConf import org.apache.spark.internal.Logging - /** * Consumer of single topicpartition, intended for cached reuse. * Underlying consumer is not threadsafe, so neither is this, @@ -38,7 +36,7 @@ class CachedKafkaConsumer[K, V] private( val partition: Int, val kafkaParams: ju.Map[String, Object]) extends Logging { - assert(groupId == kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG), + require(groupId == kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG), "groupId used for cache key must match the groupId in kafkaParams") val topicPartition = new TopicPartition(topic, partition) @@ -53,7 +51,7 @@ class CachedKafkaConsumer[K, V] private( // TODO if the buffer was kept around as a random-access structure, // could possibly optimize re-calculating of an RDD in the same batch - protected var buffer = ju.Collections.emptyList[ConsumerRecord[K, V]]().iterator + protected var buffer = ju.Collections.emptyListIterator[ConsumerRecord[K, V]]() protected var nextOffset = -2L def close(): Unit = consumer.close() @@ -71,7 +69,7 @@ class CachedKafkaConsumer[K, V] private( } if (!buffer.hasNext()) { poll(timeout) } - assert(buffer.hasNext(), + require(buffer.hasNext(), s"Failed to get records for $groupId $topic $partition $offset after polling for $timeout") var record = buffer.next() @@ -79,17 +77,56 @@ class CachedKafkaConsumer[K, V] private( logInfo(s"Buffer miss for $groupId $topic $partition $offset") seek(offset) poll(timeout) - assert(buffer.hasNext(), + require(buffer.hasNext(), s"Failed to get records for $groupId $topic $partition $offset after polling for $timeout") record = buffer.next() - assert(record.offset == offset, - s"Got wrong record for $groupId $topic $partition even after seeking to offset $offset") + require(record.offset == offset, + s"Got wrong record for $groupId $topic $partition even after seeking to offset $offset " + + s"got offset ${record.offset} instead. 
If this is a compacted topic, consider enabling " + + "spark.streaming.kafka.allowNonConsecutiveOffsets" + ) } nextOffset = offset + 1 record } + /** + * Start a batch on a compacted topic + */ + def compactedStart(offset: Long, timeout: Long): Unit = { + logDebug(s"compacted start $groupId $topic $partition starting $offset") + // This seek may not be necessary, but it's hard to tell due to gaps in compacted topics + if (offset != nextOffset) { + logInfo(s"Initial fetch for compacted $groupId $topic $partition $offset") + seek(offset) + poll(timeout) + } + } + + /** + * Get the next record in the batch from a compacted topic. + * Assumes compactedStart has been called first, and ignores gaps. + */ + def compactedNext(timeout: Long): ConsumerRecord[K, V] = { + if (!buffer.hasNext()) { + poll(timeout) + } + require(buffer.hasNext(), + s"Failed to get records for compacted $groupId $topic $partition after polling for $timeout") + val record = buffer.next() + nextOffset = record.offset + 1 + record + } + + /** + * Rewind to previous record in the batch from a compacted topic. + * @throws NoSuchElementException if no previous element + */ + def compactedPrevious(): ConsumerRecord[K, V] = { + buffer.previous() + } + private def seek(offset: Long): Unit = { logDebug(s"Seeking to $topicPartition $offset") consumer.seek(topicPartition, offset) @@ -99,7 +136,7 @@ class CachedKafkaConsumer[K, V] private( val p = consumer.poll(timeout) val r = p.records(topicPartition) logDebug(s"Polled ${p.partitions()} ${r.size}") - buffer = r.iterator + buffer = r.listIterator } } diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala index d9fc9cc206647..07239eda64d2e 100644 --- a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala +++ b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala @@ -55,12 +55,12 @@ private[spark] class KafkaRDD[K, V]( useConsumerCache: Boolean ) extends RDD[ConsumerRecord[K, V]](sc, Nil) with Logging with HasOffsetRanges { - assert("none" == + require("none" == kafkaParams.get(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG).asInstanceOf[String], ConsumerConfig.AUTO_OFFSET_RESET_CONFIG + " must be set to none for executor kafka params, else messages may not match offsetRange") - assert(false == + require(false == kafkaParams.get(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG).asInstanceOf[Boolean], ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG + " must be set to false for executor kafka params, else offsets may commit before processing") @@ -74,6 +74,8 @@ private[spark] class KafkaRDD[K, V]( conf.getInt("spark.streaming.kafka.consumer.cache.maxCapacity", 64) private val cacheLoadFactor = conf.getDouble("spark.streaming.kafka.consumer.cache.loadFactor", 0.75).toFloat + private val compacted = + conf.getBoolean("spark.streaming.kafka.allowNonConsecutiveOffsets", false) override def persist(newLevel: StorageLevel): this.type = { logError("Kafka ConsumerRecord is not serializable. 
" + @@ -87,48 +89,63 @@ private[spark] class KafkaRDD[K, V]( }.toArray } - override def count(): Long = offsetRanges.map(_.count).sum + override def count(): Long = + if (compacted) { + super.count() + } else { + offsetRanges.map(_.count).sum + } override def countApprox( timeout: Long, confidence: Double = 0.95 - ): PartialResult[BoundedDouble] = { - val c = count - new PartialResult(new BoundedDouble(c, 1.0, c, c), true) - } - - override def isEmpty(): Boolean = count == 0L - - override def take(num: Int): Array[ConsumerRecord[K, V]] = { - val nonEmptyPartitions = this.partitions - .map(_.asInstanceOf[KafkaRDDPartition]) - .filter(_.count > 0) + ): PartialResult[BoundedDouble] = + if (compacted) { + super.countApprox(timeout, confidence) + } else { + val c = count + new PartialResult(new BoundedDouble(c, 1.0, c, c), true) + } - if (num < 1 || nonEmptyPartitions.isEmpty) { - return new Array[ConsumerRecord[K, V]](0) + override def isEmpty(): Boolean = + if (compacted) { + super.isEmpty() + } else { + count == 0L } - // Determine in advance how many messages need to be taken from each partition - val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) => - val remain = num - result.values.sum - if (remain > 0) { - val taken = Math.min(remain, part.count) - result + (part.index -> taken.toInt) + override def take(num: Int): Array[ConsumerRecord[K, V]] = + if (compacted) { + super.take(num) + } else if (num < 1) { + Array.empty[ConsumerRecord[K, V]] + } else { + val nonEmptyPartitions = this.partitions + .map(_.asInstanceOf[KafkaRDDPartition]) + .filter(_.count > 0) + + if (nonEmptyPartitions.isEmpty) { + Array.empty[ConsumerRecord[K, V]] } else { - result + // Determine in advance how many messages need to be taken from each partition + val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) => + val remain = num - result.values.sum + if (remain > 0) { + val taken = Math.min(remain, part.count) + result + (part.index -> taken.toInt) + } else { + result + } + } + + context.runJob( + this, + (tc: TaskContext, it: Iterator[ConsumerRecord[K, V]]) => + it.take(parts(tc.partitionId)).toArray, parts.keys.toArray + ).flatten } } - val buf = new ArrayBuffer[ConsumerRecord[K, V]] - val res = context.runJob( - this, - (tc: TaskContext, it: Iterator[ConsumerRecord[K, V]]) => - it.take(parts(tc.partitionId)).toArray, parts.keys.toArray - ) - res.foreach(buf ++= _) - buf.toArray - } - private def executors(): Array[ExecutorCacheTaskLocation] = { val bm = sparkContext.env.blockManager bm.master.getPeers(bm.blockManagerId).toArray @@ -172,57 +189,138 @@ private[spark] class KafkaRDD[K, V]( override def compute(thePart: Partition, context: TaskContext): Iterator[ConsumerRecord[K, V]] = { val part = thePart.asInstanceOf[KafkaRDDPartition] - assert(part.fromOffset <= part.untilOffset, errBeginAfterEnd(part)) + require(part.fromOffset <= part.untilOffset, errBeginAfterEnd(part)) if (part.fromOffset == part.untilOffset) { logInfo(s"Beginning offset ${part.fromOffset} is the same as ending offset " + s"skipping ${part.topic} ${part.partition}") Iterator.empty } else { - new KafkaRDDIterator(part, context) + logInfo(s"Computing topic ${part.topic}, partition ${part.partition} " + + s"offsets ${part.fromOffset} -> ${part.untilOffset}") + if (compacted) { + new CompactedKafkaRDDIterator[K, V]( + part, + context, + kafkaParams, + useConsumerCache, + pollTimeout, + cacheInitialCapacity, + cacheMaxCapacity, + cacheLoadFactor + ) + } else { + new KafkaRDDIterator[K, V]( + part, + 
context, + kafkaParams, + useConsumerCache, + pollTimeout, + cacheInitialCapacity, + cacheMaxCapacity, + cacheLoadFactor + ) + } } } +} - /** - * An iterator that fetches messages directly from Kafka for the offsets in partition. - * Uses a cached consumer where possible to take advantage of prefetching - */ - private class KafkaRDDIterator( - part: KafkaRDDPartition, - context: TaskContext) extends Iterator[ConsumerRecord[K, V]] { - - logInfo(s"Computing topic ${part.topic}, partition ${part.partition} " + - s"offsets ${part.fromOffset} -> ${part.untilOffset}") - - val groupId = kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG).asInstanceOf[String] +/** + * An iterator that fetches messages directly from Kafka for the offsets in partition. + * Uses a cached consumer where possible to take advantage of prefetching + */ +private class KafkaRDDIterator[K, V]( + part: KafkaRDDPartition, + context: TaskContext, + kafkaParams: ju.Map[String, Object], + useConsumerCache: Boolean, + pollTimeout: Long, + cacheInitialCapacity: Int, + cacheMaxCapacity: Int, + cacheLoadFactor: Float +) extends Iterator[ConsumerRecord[K, V]] { + + val groupId = kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG).asInstanceOf[String] + + context.addTaskCompletionListener(_ => closeIfNeeded()) + + val consumer = if (useConsumerCache) { + CachedKafkaConsumer.init(cacheInitialCapacity, cacheMaxCapacity, cacheLoadFactor) + if (context.attemptNumber >= 1) { + // just in case the prior attempt failures were cache related + CachedKafkaConsumer.remove(groupId, part.topic, part.partition) + } + CachedKafkaConsumer.get[K, V](groupId, part.topic, part.partition, kafkaParams) + } else { + CachedKafkaConsumer.getUncached[K, V](groupId, part.topic, part.partition, kafkaParams) + } - context.addTaskCompletionListener{ context => closeIfNeeded() } + var requestOffset = part.fromOffset - val consumer = if (useConsumerCache) { - CachedKafkaConsumer.init(cacheInitialCapacity, cacheMaxCapacity, cacheLoadFactor) - if (context.attemptNumber >= 1) { - // just in case the prior attempt failures were cache related - CachedKafkaConsumer.remove(groupId, part.topic, part.partition) - } - CachedKafkaConsumer.get[K, V](groupId, part.topic, part.partition, kafkaParams) - } else { - CachedKafkaConsumer.getUncached[K, V](groupId, part.topic, part.partition, kafkaParams) + def closeIfNeeded(): Unit = { + if (!useConsumerCache && consumer != null) { + consumer.close() } + } - var requestOffset = part.fromOffset + override def hasNext(): Boolean = requestOffset < part.untilOffset - def closeIfNeeded(): Unit = { - if (!useConsumerCache && consumer != null) { - consumer.close - } + override def next(): ConsumerRecord[K, V] = { + if (!hasNext) { + throw new ju.NoSuchElementException("Can't call getNext() once untilOffset has been reached") } + val r = consumer.get(requestOffset, pollTimeout) + requestOffset += 1 + r + } +} - override def hasNext(): Boolean = requestOffset < part.untilOffset - - override def next(): ConsumerRecord[K, V] = { - assert(hasNext(), "Can't call getNext() once untilOffset has been reached") - val r = consumer.get(requestOffset, pollTimeout) - requestOffset += 1 - r +/** + * An iterator that fetches messages directly from Kafka for the offsets in partition. + * Uses a cached consumer where possible to take advantage of prefetching. + * Intended for compacted topics, or other cases when non-consecutive offsets are ok. 
+ */ +private class CompactedKafkaRDDIterator[K, V]( + part: KafkaRDDPartition, + context: TaskContext, + kafkaParams: ju.Map[String, Object], + useConsumerCache: Boolean, + pollTimeout: Long, + cacheInitialCapacity: Int, + cacheMaxCapacity: Int, + cacheLoadFactor: Float + ) extends KafkaRDDIterator[K, V]( + part, + context, + kafkaParams, + useConsumerCache, + pollTimeout, + cacheInitialCapacity, + cacheMaxCapacity, + cacheLoadFactor + ) { + + consumer.compactedStart(part.fromOffset, pollTimeout) + + private var nextRecord = consumer.compactedNext(pollTimeout) + + private var okNext: Boolean = true + + override def hasNext(): Boolean = okNext + + override def next(): ConsumerRecord[K, V] = { + if (!hasNext) { + throw new ju.NoSuchElementException("Can't call getNext() once untilOffset has been reached") + } + val r = nextRecord + if (r.offset + 1 >= part.untilOffset) { + okNext = false + } else { + nextRecord = consumer.compactedNext(pollTimeout) + if (nextRecord.offset >= part.untilOffset) { + okNext = false + consumer.compactedPrevious() + } } + r } } diff --git a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaRDDSuite.scala b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaRDDSuite.scala index be373af0599cc..271adea1df731 100644 --- a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaRDDSuite.scala +++ b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaRDDSuite.scala @@ -18,16 +18,22 @@ package org.apache.spark.streaming.kafka010 import java.{ util => ju } +import java.io.File import scala.collection.JavaConverters._ import scala.util.Random +import kafka.common.TopicAndPartition +import kafka.log._ +import kafka.message._ +import kafka.utils.Pool import org.apache.kafka.common.TopicPartition import org.apache.kafka.common.serialization.StringDeserializer import org.scalatest.BeforeAndAfterAll import org.apache.spark._ import org.apache.spark.scheduler.ExecutorCacheTaskLocation +import org.apache.spark.streaming.kafka010.mocks.MockTime class KafkaRDDSuite extends SparkFunSuite with BeforeAndAfterAll { @@ -64,6 +70,41 @@ class KafkaRDDSuite extends SparkFunSuite with BeforeAndAfterAll { private val preferredHosts = LocationStrategies.PreferConsistent + private def compactLogs(topic: String, partition: Int, messages: Array[(String, String)]) { + val mockTime = new MockTime() + // LogCleaner in 0.10 version of Kafka is still expecting the old TopicAndPartition api + val logs = new Pool[TopicAndPartition, Log]() + val logDir = kafkaTestUtils.brokerLogDir + val dir = new File(logDir, topic + "-" + partition) + dir.mkdirs() + val logProps = new ju.Properties() + logProps.put(LogConfig.CleanupPolicyProp, LogConfig.Compact) + logProps.put(LogConfig.MinCleanableDirtyRatioProp, java.lang.Float.valueOf(0.1f)) + val log = new Log( + dir, + LogConfig(logProps), + 0L, + mockTime.scheduler, + mockTime + ) + messages.foreach { case (k, v) => + val msg = new ByteBufferMessageSet( + NoCompressionCodec, + new Message(v.getBytes, k.getBytes, Message.NoTimestamp, Message.CurrentMagicValue)) + log.append(msg) + } + log.roll() + logs.put(TopicAndPartition(topic, partition), log) + + val cleaner = new LogCleaner(CleanerConfig(), logDirs = Array(dir), logs = logs) + cleaner.startup() + cleaner.awaitCleaned(topic, partition, log.activeSegment.baseOffset, 1000) + + cleaner.shutdown() + mockTime.scheduler.shutdown() + } + + test("basic usage") { val topic = 
s"topicbasic-${Random.nextInt}-${System.currentTimeMillis}" kafkaTestUtils.createTopic(topic) @@ -102,6 +143,71 @@ class KafkaRDDSuite extends SparkFunSuite with BeforeAndAfterAll { } } + test("compacted topic") { + val compactConf = sparkConf.clone() + compactConf.set("spark.streaming.kafka.allowNonConsecutiveOffsets", "true") + sc.stop() + sc = new SparkContext(compactConf) + val topic = s"topiccompacted-${Random.nextInt}-${System.currentTimeMillis}" + + val messages = Array( + ("a", "1"), + ("a", "2"), + ("b", "1"), + ("c", "1"), + ("c", "2"), + ("b", "2"), + ("b", "3") + ) + val compactedMessages = Array( + ("a", "2"), + ("b", "3"), + ("c", "2") + ) + + compactLogs(topic, 0, messages) + + val props = new ju.Properties() + props.put("cleanup.policy", "compact") + props.put("flush.messages", "1") + props.put("segment.ms", "1") + props.put("segment.bytes", "256") + kafkaTestUtils.createTopic(topic, 1, props) + + + val kafkaParams = getKafkaParams() + + val offsetRanges = Array(OffsetRange(topic, 0, 0, messages.size)) + + val rdd = KafkaUtils.createRDD[String, String]( + sc, kafkaParams, offsetRanges, preferredHosts + ).map(m => m.key -> m.value) + + val received = rdd.collect.toSet + assert(received === compactedMessages.toSet) + + // size-related method optimizations return sane results + assert(rdd.count === compactedMessages.size) + assert(rdd.countApprox(0).getFinalValue.mean === compactedMessages.size) + assert(!rdd.isEmpty) + assert(rdd.take(1).size === 1) + assert(rdd.take(1).head === compactedMessages.head) + assert(rdd.take(messages.size + 10).size === compactedMessages.size) + + val emptyRdd = KafkaUtils.createRDD[String, String]( + sc, kafkaParams, Array(OffsetRange(topic, 0, 0, 0)), preferredHosts) + + assert(emptyRdd.isEmpty) + + // invalid offset ranges throw exceptions + val badRanges = Array(OffsetRange(topic, 0, 0, messages.size + 1)) + intercept[SparkException] { + val result = KafkaUtils.createRDD[String, String](sc, kafkaParams, badRanges, preferredHosts) + .map(_.value) + .collect() + } + } + test("iterator boundary conditions") { // the idea is to find e.g. 
off-by-one errors between what kafka has available and the rdd val topic = s"topicboundary-${Random.nextInt}-${System.currentTimeMillis}" diff --git a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaTestUtils.scala b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaTestUtils.scala index 6c7024ea4b5a5..70b579d96d692 100644 --- a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaTestUtils.scala +++ b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaTestUtils.scala @@ -162,17 +162,22 @@ private[kafka010] class KafkaTestUtils extends Logging { } /** Create a Kafka topic and wait until it is propagated to the whole cluster */ - def createTopic(topic: String, partitions: Int): Unit = { - AdminUtils.createTopic(zkUtils, topic, partitions, 1) + def createTopic(topic: String, partitions: Int, config: Properties): Unit = { + AdminUtils.createTopic(zkUtils, topic, partitions, 1, config) // wait until metadata is propagated (0 until partitions).foreach { p => waitUntilMetadataIsPropagated(topic, p) } } + /** Create a Kafka topic and wait until it is propagated to the whole cluster */ + def createTopic(topic: String, partitions: Int): Unit = { + createTopic(topic, partitions, new Properties()) + } + /** Create a Kafka topic and wait until it is propagated to the whole cluster */ def createTopic(topic: String): Unit = { - createTopic(topic, 1) + createTopic(topic, 1, new Properties()) } /** Java-friendly function for sending messages to the Kafka broker */ @@ -196,12 +201,24 @@ private[kafka010] class KafkaTestUtils extends Logging { producer = null } + /** Send the array of (key, value) messages to the Kafka broker */ + def sendMessages(topic: String, messages: Array[(String, String)]): Unit = { + producer = new KafkaProducer[String, String](producerConfiguration) + messages.foreach { message => + producer.send(new ProducerRecord[String, String](topic, message._1, message._2)) + } + producer.close() + producer = null + } + + val brokerLogDir = Utils.createTempDir().getAbsolutePath + private def brokerConfiguration: Properties = { val props = new Properties() props.put("broker.id", "0") props.put("host.name", "localhost") props.put("port", brokerPort.toString) - props.put("log.dir", Utils.createTempDir().getAbsolutePath) + props.put("log.dir", brokerLogDir) props.put("zookeeper.connect", zkAddress) props.put("log.flush.interval.messages", "1") props.put("replica.socket.timeout.ms", "1500") diff --git a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockScheduler.scala b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockScheduler.scala new file mode 100644 index 0000000000000..928e1a6ef54b9 --- /dev/null +++ b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockScheduler.scala @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming.kafka010.mocks + +import java.util.concurrent.TimeUnit + +import scala.collection.mutable.PriorityQueue + +import kafka.utils.{Scheduler, Time} + +/** + * A mock scheduler that executes tasks synchronously using a mock time instance. + * Tasks are executed synchronously when the time is advanced. + * This class is meant to be used in conjunction with MockTime. + * + * Example usage + * + * val time = new MockTime + * time.scheduler.schedule("a task", println("hello world: " + time.milliseconds), delay = 1000) + * time.sleep(1001) // this should cause our scheduled task to fire + * + * + * Incrementing the time to the exact next execution time of a task will result in that task + * executing (it as if execution itself takes no time). + */ +private[kafka010] class MockScheduler(val time: Time) extends Scheduler { + + /* a priority queue of tasks ordered by next execution time */ + var tasks = new PriorityQueue[MockTask]() + + def isStarted: Boolean = true + + def startup(): Unit = {} + + def shutdown(): Unit = synchronized { + tasks.foreach(_.fun()) + tasks.clear() + } + + /** + * Check for any tasks that need to execute. Since this is a mock scheduler this check only occurs + * when this method is called and the execution happens synchronously in the calling thread. + * If you are using the scheduler associated with a MockTime instance this call + * will be triggered automatically. + */ + def tick(): Unit = synchronized { + val now = time.milliseconds + while(!tasks.isEmpty && tasks.head.nextExecution <= now) { + /* pop and execute the task with the lowest next execution time */ + val curr = tasks.dequeue + curr.fun() + /* if the task is periodic, reschedule it and re-enqueue */ + if(curr.periodic) { + curr.nextExecution += curr.period + this.tasks += curr + } + } + } + + def schedule( + name: String, + fun: () => Unit, + delay: Long = 0, + period: Long = -1, + unit: TimeUnit = TimeUnit.MILLISECONDS): Unit = synchronized { + tasks += MockTask(name, fun, time.milliseconds + delay, period = period) + tick() + } + +} + +case class MockTask( + val name: String, + val fun: () => Unit, + var nextExecution: Long, + val period: Long) extends Ordered[MockTask] { + def periodic: Boolean = period >= 0 + def compare(t: MockTask): Int = { + java.lang.Long.compare(t.nextExecution, nextExecution) + } +} diff --git a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockTime.scala b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockTime.scala new file mode 100644 index 0000000000000..a68f94db1f689 --- /dev/null +++ b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockTime.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming.kafka010.mocks + +import java.util.concurrent._ + +import kafka.utils.Time + +/** + * A class used for unit testing things which depend on the Time interface. + * + * This class never manually advances the clock, it only does so when you call + * sleep(ms) + * + * It also comes with an associated scheduler instance for managing background tasks in + * a deterministic way. + */ +private[kafka010] class MockTime(@volatile private var currentMs: Long) extends Time { + + val scheduler = new MockScheduler(this) + + def this() = this(System.currentTimeMillis) + + def milliseconds: Long = currentMs + + def nanoseconds: Long = + TimeUnit.NANOSECONDS.convert(currentMs, TimeUnit.MILLISECONDS) + + def sleep(ms: Long) { + this.currentMs += ms + scheduler.tick() + } + + override def toString(): String = s"MockTime($milliseconds)" + +} From 7de4bef9ec37440aa36e6b0e9d8656de07d03b68 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Fri, 11 May 2018 17:00:51 -0700 Subject: [PATCH 445/566] [SPARKR] Require Java 8 for SparkR This change updates the SystemRequirements and also includes a runtime check if the JVM is being launched by R. The runtime check is done by querying `java -version` ## How was this patch tested? Tested on a Mac and Windows machine Author: Shivaram Venkataraman Closes #21278 from shivaram/sparkr-skip-solaris. (cherry picked from commit f27a035daf705766d3445e5c6a99867c11c552b0) Signed-off-by: Shivaram Venkataraman --- R/pkg/DESCRIPTION | 1 + R/pkg/R/client.R | 35 +++++++++++++++++++++++++++++++++++ R/pkg/R/sparkR.R | 1 + R/pkg/R/utils.R | 4 ++-- 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 29a8a0062873f..632bcb342caeb 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -13,6 +13,7 @@ Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), License: Apache License (== 2.0) URL: http://www.apache.org/ http://spark.apache.org/ BugReports: http://spark.apache.org/contributing.html +SystemRequirements: Java (== 8) Depends: R (>= 3.0), methods diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R index 7244cc9f9e38e..e9295e05872bd 100644 --- a/R/pkg/R/client.R +++ b/R/pkg/R/client.R @@ -60,6 +60,40 @@ generateSparkSubmitArgs <- function(args, sparkHome, jars, sparkSubmitOpts, pack combinedArgs } +checkJavaVersion <- function() { + javaBin <- "java" + javaHome <- Sys.getenv("JAVA_HOME") + javaReqs <- utils::packageDescription(utils::packageName(), fields=c("SystemRequirements")) + sparkJavaVersion <- as.numeric(tail(strsplit(javaReqs, "[(=)]")[[1]], n = 1L)) + if (javaHome != "") { + javaBin <- file.path(javaHome, "bin", javaBin) + } + + # If java is missing from PATH, we get an error in Unix and a warning in Windows + javaVersionOut <- tryCatch( + launchScript(javaBin, "-version", wait = TRUE, stdout = TRUE, stderr = TRUE), + error = function(e) { + stop("Java version check failed. 
Please make sure Java is installed", + " and set JAVA_HOME to point to the installation directory.", e) + }, + warning = function(w) { + stop("Java version check failed. Please make sure Java is installed", + " and set JAVA_HOME to point to the installation directory.", w) + }) + javaVersionFilter <- Filter( + function(x) { + grepl("java version", x) + }, javaVersionOut) + + javaVersionStr <- strsplit(javaVersionFilter[[1]], "[\"]")[[1L]][2] + # javaVersionStr is of the form 1.8.0_92. + # Extract 8 from it to compare to sparkJavaVersion + javaVersionNum <- as.integer(strsplit(javaVersionStr, "[.]")[[1L]][2]) + if (javaVersionNum != sparkJavaVersion) { + stop(paste("Java version", sparkJavaVersion, "is required for this package; found version:", javaVersionStr)) + } +} + launchBackend <- function(args, sparkHome, jars, sparkSubmitOpts, packages) { sparkSubmitBinName <- determineSparkSubmitBin() if (sparkHome != "") { @@ -67,6 +101,7 @@ launchBackend <- function(args, sparkHome, jars, sparkSubmitOpts, packages) { } else { sparkSubmitBin <- sparkSubmitBinName } + combinedArgs <- generateSparkSubmitArgs(args, sparkHome, jars, sparkSubmitOpts, packages) cat("Launching java with spark-submit command", sparkSubmitBin, combinedArgs, "\n") invisible(launchScript(sparkSubmitBin, combinedArgs)) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 7430d849cba89..2cd8b0c1e7947 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -170,6 +170,7 @@ sparkR.sparkContext <- function( submitOps <- getClientModeSparkSubmitOpts( Sys.getenv("SPARKR_SUBMIT_ARGS", "sparkr-shell"), sparkEnvirMap) + checkJavaVersion() launchBackend( args = path, sparkHome = sparkHome, diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 164cd6d01a347..493a50d0ba5cc 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -747,7 +747,7 @@ varargsToJProperties <- function(...) { props } -launchScript <- function(script, combinedArgs, wait = FALSE) { +launchScript <- function(script, combinedArgs, wait = FALSE, stdout = "", stderr = "") { if (.Platform$OS.type == "windows") { scriptWithArgs <- paste(script, combinedArgs, sep = " ") # on Windows, intern = F seems to mean output to the console. (documentation on this is missing) @@ -757,7 +757,7 @@ launchScript <- function(script, combinedArgs, wait = FALSE) { # stdout = F means discard output # stdout = "" means to its console (default) # Note that the console of this child process might not be the same as the running R process. - system2(script, combinedArgs, stdout = "", wait = wait) + system2(script, combinedArgs, stdout = stdout, wait = wait, stderr = stderr) } } From 867d948a666605cc53fc200df753abf93cb814fb Mon Sep 17 00:00:00 2001 From: Kelley Robinson Date: Sun, 13 May 2018 13:19:03 -0700 Subject: [PATCH 446/566] [SPARK-24262][PYTHON] Fix typo in UDF type match error message ## What changes were proposed in this pull request? Updates `functon` to `function`. This was called out in holdenk's PyCon 2018 conference talk. Didn't see any existing PR's for this. holdenk happy to fix the Pandas.Series bug too but will need a bit more guidance. Author: Kelley Robinson Closes #21304 from robinske/master. 
(cherry picked from commit 0d210ec8b610e4b0570ce730f3987dc86787c663) Signed-off-by: Holden Karau --- python/pyspark/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index dec06420dac39..44e9106a23523 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -81,7 +81,7 @@ def wrap_scalar_pandas_udf(f, return_type): def verify_result_length(*a): result = f(*a) if not hasattr(result, "__len__"): - raise TypeError("Return type of the user-defined functon should be " + raise TypeError("Return type of the user-defined function should be " "Pandas.Series, but is {}".format(type(result))) if len(result) != len(a[0]): raise RuntimeError("Result vector from pandas_udf was not the required length: " From 88003f02c11bec9df42d225841b82dc748828940 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Mon, 14 May 2018 10:49:12 -0700 Subject: [PATCH 447/566] [SPARK-24263][R] SparkR java check breaks with openjdk ## What changes were proposed in this pull request? Change text to grep for. ## How was this patch tested? manual test Author: Felix Cheung Closes #21314 from felixcheung/openjdkver. (cherry picked from commit 1430fa80e37762e31cc5adc74cd609c215d84b6e) Signed-off-by: Marcelo Vanzin --- R/pkg/R/client.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R index e9295e05872bd..14a17c600b17f 100644 --- a/R/pkg/R/client.R +++ b/R/pkg/R/client.R @@ -82,7 +82,7 @@ checkJavaVersion <- function() { }) javaVersionFilter <- Filter( function(x) { - grepl("java version", x) + grepl(" version", x) }, javaVersionOut) javaVersionStr <- strsplit(javaVersionFilter[[1]], "[\"]")[[1L]][2] From 2f60df09dc1bb65da254e00abe8463122e8c77cf Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Mon, 14 May 2018 11:37:57 -0700 Subject: [PATCH 448/566] [SPARK-24246][SQL] Improve AnalysisException by setting the cause when it's available ## What changes were proposed in this pull request? If there is an exception, it's better to set it as the cause of AnalysisException since the exception may contain useful debug information. ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #21297 from zsxwing/SPARK-24246. (cherry picked from commit c26f673252c2cbbccf8c395ba6d4ab80c098d60e) Signed-off-by: gatorsmile --- .../org/apache/spark/sql/catalyst/analysis/Analyzer.scala | 6 +++--- .../spark/sql/catalyst/analysis/ResolveInlineTables.scala | 2 +- .../org/apache/spark/sql/catalyst/analysis/package.scala | 5 +++++ .../org/apache/spark/sql/execution/datasources/rules.scala | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 9cc928c8ac108..8597d83d83000 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -661,13 +661,13 @@ class Analyzer( try { catalog.lookupRelation(tableIdentWithDb) } catch { - case _: NoSuchTableException => - u.failAnalysis(s"Table or view not found: ${tableIdentWithDb.unquotedString}") + case e: NoSuchTableException => + u.failAnalysis(s"Table or view not found: ${tableIdentWithDb.unquotedString}", e) // If the database is defined and that database is not found, throw an AnalysisException. 
// Note that if the database is not defined, it is possible we are looking up a temp view. case e: NoSuchDatabaseException => u.failAnalysis(s"Table or view not found: ${tableIdentWithDb.unquotedString}, the " + - s"database ${e.db} doesn't exist.") + s"database ${e.db} doesn't exist.", e) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveInlineTables.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveInlineTables.scala index f2df3e132629f..71ed75454cd4d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveInlineTables.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveInlineTables.scala @@ -103,7 +103,7 @@ case class ResolveInlineTables(conf: SQLConf) extends Rule[LogicalPlan] with Cas castedExpr.eval() } catch { case NonFatal(ex) => - table.failAnalysis(s"failed to evaluate expression ${e.sql}: ${ex.getMessage}") + table.failAnalysis(s"failed to evaluate expression ${e.sql}: ${ex.getMessage}", ex) } }) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala index 7731336d247db..354a3fa0602a9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala @@ -41,6 +41,11 @@ package object analysis { def failAnalysis(msg: String): Nothing = { throw new AnalysisException(msg, t.origin.line, t.origin.startPosition) } + + /** Fails the analysis at the point where a specific tree node was parsed. */ + def failAnalysis(msg: String, cause: Throwable): Nothing = { + throw new AnalysisException(msg, t.origin.line, t.origin.startPosition, cause = Some(cause)) + } } /** Catches any AnalysisExceptions thrown by `f` and attaches `t`'s position if any. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 0dea767840ed3..cab00251622b8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -61,7 +61,7 @@ class ResolveSQLOnFile(sparkSession: SparkSession) extends Rule[LogicalPlan] { case _: ClassNotFoundException => u case e: Exception => // the provider is valid, but failed to create a logical plan - u.failAnalysis(e.getMessage) + u.failAnalysis(e.getMessage, e) } } } From a8ee5706ad96be3d6501471d05f7c3d61d3ca38e Mon Sep 17 00:00:00 2001 From: Henry Robinson Date: Mon, 14 May 2018 14:05:32 -0700 Subject: [PATCH 449/566] [SPARK-23852][SQL] Upgrade to Parquet 1.8.3 ## What changes were proposed in this pull request? Upgrade Parquet dependency to 1.8.3 to avoid PARQUET-1217 ## How was this patch tested? Ran the included new test case. Author: Henry Robinson Closes #21302 from henryr/branch-2.3. 
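For context, a minimal sketch of the read-side scenario the new test case covers (not part of the change itself; the file path is a placeholder for the bundled test resource). With filter pushdown enabled, Parquet 1.8.2 can evaluate a pushed-down predicate against row-group statistics that carry only null counts and wrongly prune every row (PARQUET-1217); after the upgrade the same query returns the correct count.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("parquet-1217-check")
  .master("local[*]")  // placeholder master for a local run
  .getOrCreate()

// Pushdown is on by default, but make the dependency on it explicit.
spark.conf.set("spark.sql.parquet.filterPushdown", "true")

// The bundled resource holds a single column "col" with values -1, 0, 1, 2 and null,
// and row-group statistics that include null counts but no min/max values.
val df = spark.read.parquet("/path/to/test-data/parquet-1217.parquet")

// 2 with Parquet 1.8.3; 0 when the predicate is pushed down against 1.8.2.
assert(df.where("col > 0").count() == 2)
```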
--- dev/deps/spark-deps-hadoop-2.6 | 10 +++++----- dev/deps/spark-deps-hadoop-2.7 | 10 +++++----- pom.xml | 2 +- .../test/resources/test-data/parquet-1217.parquet | Bin 0 -> 321 bytes .../datasources/parquet/ParquetFilterSuite.scala | 10 ++++++++++ 5 files changed, 21 insertions(+), 11 deletions(-) create mode 100644 sql/core/src/test/resources/test-data/parquet-1217.parquet diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 577bf43a1340a..f4559a831de21 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -161,13 +161,13 @@ orc-mapreduce-1.4.3-nohive.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar -parquet-column-1.8.2.jar -parquet-common-1.8.2.jar -parquet-encoding-1.8.2.jar +parquet-column-1.8.3.jar +parquet-common-1.8.3.jar +parquet-encoding-1.8.3.jar parquet-format-2.3.1.jar -parquet-hadoop-1.8.2.jar +parquet-hadoop-1.8.3.jar parquet-hadoop-bundle-1.6.0.jar -parquet-jackson-1.8.2.jar +parquet-jackson-1.8.3.jar protobuf-java-2.5.0.jar py4j-0.10.7.jar pyrolite-4.13.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 304982e92e093..c2df998e1fa64 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -162,13 +162,13 @@ orc-mapreduce-1.4.3-nohive.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar -parquet-column-1.8.2.jar -parquet-common-1.8.2.jar -parquet-encoding-1.8.2.jar +parquet-column-1.8.3.jar +parquet-common-1.8.3.jar +parquet-encoding-1.8.3.jar parquet-format-2.3.1.jar -parquet-hadoop-1.8.2.jar +parquet-hadoop-1.8.3.jar parquet-hadoop-bundle-1.6.0.jar -parquet-jackson-1.8.2.jar +parquet-jackson-1.8.3.jar protobuf-java-2.5.0.jar py4j-0.10.7.jar pyrolite-4.13.jar diff --git a/pom.xml b/pom.xml index 9c2d931cab91e..533c6b4574a55 100644 --- a/pom.xml +++ b/pom.xml @@ -129,7 +129,7 @@ 1.2.1 10.12.1.1 - 1.8.2 + 1.8.3 1.4.3 nohive 1.6.0 diff --git a/sql/core/src/test/resources/test-data/parquet-1217.parquet b/sql/core/src/test/resources/test-data/parquet-1217.parquet new file mode 100644 index 0000000000000000000000000000000000000000..eb2dc4f79907019ffe4edaa9adb951924aad1c5d GIT binary patch literal 321 zcmbu5(MrQG7=^Pml#v^~@DB~-BFI`Q)R6|)b+DV=$S%Yc=L=*_hK0@zhq6oG%h&Ne zT(Vd2z~P6(;p68ti 0").count() === 2) + } } class NumRowGroupsAcc extends AccumulatorV2[Integer, Integer] { From 6dfb515571b68a471509035287a46e431e48b73b Mon Sep 17 00:00:00 2001 From: Henry Robinson Date: Mon, 14 May 2018 14:35:08 -0700 Subject: [PATCH 450/566] [SPARK-23852][SQL] Add withSQLConf(...) to test case ## What changes were proposed in this pull request? Add a `withSQLConf(...)` wrapper to force Parquet filter pushdown for a test that relies on it. ## How was this patch tested? Test passes Author: Henry Robinson Closes #21323 from henryr/spark-23582. 
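For context: `withSQLConf` comes from the SQL test utilities. The sketch below is only a simplified approximation of the save/set/restore pattern it provides, assuming a running `SparkSession`; it is not the actual SQLTestUtils code. It shows why wrapping the assertions guarantees pushdown is on for the test body even if the suite default or an earlier test turned it off.

```scala
import org.apache.spark.sql.SparkSession

// Simplified approximation of the helper's behavior (the real helper also guards
// against static, non-settable confs): set the given keys, run the body, then
// restore whatever values were in effect before.
def withSQLConf[T](spark: SparkSession)(pairs: (String, String)*)(body: => T): T = {
  val conf = spark.conf
  val previous = pairs.map { case (key, _) => key -> conf.getOption(key) }
  pairs.foreach { case (key, value) => conf.set(key, value) }
  try body finally {
    previous.foreach {
      case (key, Some(old)) => conf.set(key, old)
      case (key, None)      => conf.unset(key)
    }
  }
}
```

Usage then matches the diff below: the Parquet filter-pushdown key is forced to "true" only for the duration of the test body and restored afterwards.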
(cherry picked from commit 061e0084ce19c1384ba271a97a0aa1f87abe879d) Signed-off-by: Marcelo Vanzin --- .../datasources/parquet/ParquetFilterSuite.scala | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index 79891af5658bf..f8d04b593f3d0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -604,13 +604,15 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex } test("SPARK-23852: Broken Parquet push-down for partially-written stats") { - // parquet-1217.parquet contains a single column with values -1, 0, 1, 2 and null. - // The row-group statistics include null counts, but not min and max values, which - // triggers PARQUET-1217. - val df = readResourceParquetFile("test-data/parquet-1217.parquet") + withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true") { + // parquet-1217.parquet contains a single column with values -1, 0, 1, 2 and null. + // The row-group statistics include null counts, but not min and max values, which + // triggers PARQUET-1217. + val df = readResourceParquetFile("test-data/parquet-1217.parquet") - // Will return 0 rows if PARQUET-1217 is not fixed. - assert(df.where("col > 0").count() === 2) + // Will return 0 rows if PARQUET-1217 is not fixed. + assert(df.where("col > 0").count() === 2) + } } } From cc93bc9592fa52c1bf81b583e2bc4df5f4914146 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 14 May 2018 17:57:16 -0700 Subject: [PATCH 451/566] Preparing Spark release v2.3.1-rc1 --- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 2 +- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 40 files changed, 40 insertions(+), 40 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index 5c5a8e9b8232e..d744c8b4e2e0f 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 2a625da682cdc..3a41e1627993f 100644 --- a/common/kvstore/pom.xml +++ 
b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index adb1890439094..f02108f03b74b 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 4cdcfa2a4da99..44304870bced2 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 494137f8e9ec1..167dc7aa503a7 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 32bebf5eeca99..ffb770ea6769e 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 8dc4ddeea3ec0..ddbfb49fe0021 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 5613f5e81384c..ce7d67828e0b0 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index c7aa9a4aac409..506b2740817af 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index 213579037f51a..daef97ce691d0 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,7 +14,7 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.1-SNAPSHOT +SPARK_VERSION: 2.3.1 SPARK_VERSION_SHORT: 2.3.1 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" diff --git a/examples/pom.xml b/examples/pom.xml index e584feed1b337..7974a3847f01b 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 6c4031b392f65..33f72dfe1bacc 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 6935ade33546a..b5ac7aee37049 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 60aa5fb1a1031..9118ad274cf44 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index d8ed3364c4bd4..73b54bc15418a 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 15d550d3bb445..815cab9c4ef8d 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 7ee7156efe22c..475cf7ff39d17 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 1f28798fd245a..553b73390a220 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index ad3cd0059e183..f911d0191b0f7 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 53d903fc58aee..4cfc4577fe09d 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 4206a1ac4ad99..b385bacf1027e 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 7e838812bdc2e..416b40c2a798a 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git 
a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index ee7eb98e3dfc8..8c99aa3a1a97f 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 2ea2e4a7adbdb..74535db06ad0a 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 35ddfac1d6624..7e93d8925a505 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 29c1e53f480a3..5e2d8503b8a1e 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 61eb9dbcf6f7a..dd4825c19a7ad 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index edba0a72ee6a4..5160d0b6e8b85 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/pom.xml b/pom.xml index 533c6b4574a55..3c445211a3d74 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index b09fa9822184c..49c29a0042617 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.1.dev0" +__version__ = "2.3.1" diff --git a/repl/pom.xml b/repl/pom.xml index 3b445bfaf773d..84802101496bd 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index ae1c69f305589..4457605fc1515 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index ea31b9c121433..7e47916dcacaa 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index d42bfff851572..78bfce4e429f9 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 865d3b3f6ad4e..5a1657d109812 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index f39bffe417666..b3fa6535dfc37 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 6e04f528a1c2f..f0b7d5e23a569 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index da50c9bd60fa6..6b9302c4eb92a 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 6e8e034fc4de4..a84af92d1c39b 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 9523085cf5513..1c10777ce8286 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1-SNAPSHOT + 2.3.1 ../pom.xml From eb7b37350108fca9d52d60a842c5920783a356da Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 14 May 2018 17:57:20 -0700 Subject: [PATCH 452/566] Preparing development version 2.3.2-SNAPSHOT --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- 
external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 632bcb342caeb..8df26351614ec 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.1 +Version: 2.3.2 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index d744c8b4e2e0f..02bf39bcb96f3 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 3a41e1627993f..646fdfb23ef19 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index f02108f03b74b..76c7dcf52203f 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 44304870bced2..f2661fedd6f75 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 167dc7aa503a7..229d4667169d2 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index ffb770ea6769e..febec1897c59c 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index ddbfb49fe0021..4dec96f86d9c3 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index ce7d67828e0b0..7187313c5c2f7 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 506b2740817af..a9cc91f87566b 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index daef97ce691d0..8579166c2e635 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.1 -SPARK_VERSION_SHORT: 2.3.1 +SPARK_VERSION: 2.3.2-SNAPSHOT +SPARK_VERSION_SHORT: 2.3.2 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 7974a3847f01b..b873bc9b20322 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 33f72dfe1bacc..b7dc48e4a7001 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index b5ac7aee37049..161fd0faac82a 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 9118ad274cf44..26abc31f9620d 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 73b54bc15418a..bcc13ebb359b3 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 815cab9c4ef8d..020e0ba7d568b 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 475cf7ff39d17..e12115ee6e20c 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 553b73390a220..07d205b01dfde 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index f911d0191b0f7..11d971a0de046 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 4cfc4577fe09d..70eb580d474af 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index b385bacf1027e..0728baede26f0 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 416b40c2a798a..c6925851ebc23 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 8c99aa3a1a97f..8b95f1be216c7 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 74535db06ad0a..b5253fd8954f3 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 7e93d8925a505..dacd42126393b 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 5e2d8503b8a1e..2fcb367cf301c 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index dd4825c19a7ad..8ecd126ebb9b2 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 5160d0b6e8b85..f52b83475c515 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 3c445211a3d74..f4335f18c6350 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 49c29a0042617..ed1cbdd58f252 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.1" +__version__ = "2.3.2.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index 84802101496bd..bc10cce2d1aea 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 4457605fc1515..3e0aa0872b99b 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 7e47916dcacaa..cf153476ce9f9 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 78bfce4e429f9..71ddbb5d4db08 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 5a1657d109812..04a3380b02115 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index b3fa6535dfc37..c9a05cc9e43f6 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index f0b7d5e23a569..8a0c6fba66cda 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 6b9302c4eb92a..2eeff747a68c6 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index a84af92d1c39b..d16f48dec205f 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 1c10777ce8286..f5927c5e41939 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml From a886dc23aeae8a79f7bc4d1aeb47d545e5550604 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Mon, 14 May 2018 19:20:25 -0700 Subject: [PATCH 453/566] [SPARK-23780][R] Failed to use googleVis library with new SparkR ## What changes were proposed in this pull request? change generic to get it to work with googleVis also fix lintr ## How was this patch tested? manual test, unit tests Author: Felix Cheung Closes #21315 from felixcheung/googvis. 
(cherry picked from commit 9059f1ee6ae13c8636c9b7fdbb708a349256fb8e) Signed-off-by: Felix Cheung --- R/pkg/R/client.R | 5 +++-- R/pkg/R/generics.R | 2 +- R/pkg/R/sparkR.R | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R index 14a17c600b17f..4c87f64e7f0e1 100644 --- a/R/pkg/R/client.R +++ b/R/pkg/R/client.R @@ -63,7 +63,7 @@ generateSparkSubmitArgs <- function(args, sparkHome, jars, sparkSubmitOpts, pack checkJavaVersion <- function() { javaBin <- "java" javaHome <- Sys.getenv("JAVA_HOME") - javaReqs <- utils::packageDescription(utils::packageName(), fields=c("SystemRequirements")) + javaReqs <- utils::packageDescription(utils::packageName(), fields = c("SystemRequirements")) sparkJavaVersion <- as.numeric(tail(strsplit(javaReqs, "[(=)]")[[1]], n = 1L)) if (javaHome != "") { javaBin <- file.path(javaHome, "bin", javaBin) @@ -90,7 +90,8 @@ checkJavaVersion <- function() { # Extract 8 from it to compare to sparkJavaVersion javaVersionNum <- as.integer(strsplit(javaVersionStr, "[.]")[[1L]][2]) if (javaVersionNum != sparkJavaVersion) { - stop(paste("Java version", sparkJavaVersion, "is required for this package; found version:", javaVersionStr)) + stop(paste("Java version", sparkJavaVersion, "is required for this package; found version:", + javaVersionStr)) } } diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index e0dde3339fabc..cffc9ab5f2bea 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -762,7 +762,7 @@ setGeneric("summarize", function(x, ...) { standardGeneric("summarize") }) #' @export setGeneric("summary", function(object, ...) { standardGeneric("summary") }) -setGeneric("toJSON", function(x) { standardGeneric("toJSON") }) +setGeneric("toJSON", function(x, ...) { standardGeneric("toJSON") }) setGeneric("toRDD", function(x) { standardGeneric("toRDD") }) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 2cd8b0c1e7947..266fa46525e0a 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -197,7 +197,7 @@ sparkR.sparkContext <- function( # Don't use readString() so that we can provide a useful # error message if the R and Java versions are mismatched. - authSecretLen = readInt(f) + authSecretLen <- readInt(f) if (length(authSecretLen) == 0 || authSecretLen == 0) { stop("Unexpected EOF in JVM connection data. Mismatched versions?") } From e16ab6fe05503872a323a31dc6755aaa5d12c68c Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 15 May 2018 22:06:58 +0800 Subject: [PATCH 454/566] [SPARK-24259][SQL] ArrayWriter for Arrow produces wrong output ## What changes were proposed in this pull request? Right now `ArrayWriter` used to output Arrow data for array type, doesn't do `clear` or `reset` after each batch. It produces wrong output. ## How was this patch tested? Added test. Author: Liang-Chi Hsieh Closes #21312 from viirya/SPARK-24259. 
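As a rough usage sketch of the affected path (assuming a PySpark 2.3 shell where `spark` is the active session and pandas plus pyarrow are installed; the data and identity UDF below are made up, not taken from the patch), a grouped-map pandas UDF over a column built with `array()` goes through `ArrayWriter` for every Arrow record batch:

```python
# Minimal hedged sketch, not part of the original patch.
from pyspark.sql.functions import array, col, pandas_udf, PandasUDFType

df = spark.range(10).withColumn("arr", array(col("id")))

# Identity grouped-map UDF: each group is converted to Arrow, shipped to Python and back.
identity = pandas_udf(lambda pdf: pdf, df.schema, PandasUDFType.GROUPED_MAP)

# Before this fix the "arr" values could come back corrupted, because the underlying
# Arrow ListVector was not cleared/reset between record batches.
df.groupby("id").apply(identity).orderBy("id").show()
```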
(cherry picked from commit d610d2a3f57ca551f72cb4e5dfed78f27be62eec) Signed-off-by: Wenchen Fan --- python/pyspark/sql/tests.py | 20 +++++++++++++++++++ .../sql/execution/arrow/ArrowWriter.scala | 8 ++++++++ 2 files changed, 28 insertions(+) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index f8da7d89c1fd0..7ff09afbb8e09 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -4476,6 +4476,26 @@ def test_supported_types(self): expected = df.toPandas().groupby('id').apply(foo_udf.func).reset_index(drop=True) self.assertPandasEqual(expected, result) + def test_array_type_correct(self): + from pyspark.sql.functions import pandas_udf, PandasUDFType, array, col + + df = self.data.withColumn("arr", array(col("id"))).repartition(1, "id") + + output_schema = StructType( + [StructField('id', LongType()), + StructField('v', IntegerType()), + StructField('arr', ArrayType(LongType()))]) + + udf = pandas_udf( + lambda pdf: pdf, + output_schema, + PandasUDFType.GROUPED_MAP + ) + + result = df.groupby('id').apply(udf).sort('id').toPandas() + expected = df.toPandas().groupby('id').apply(udf.func).reset_index(drop=True) + self.assertPandasEqual(expected, result) + def test_register_grouped_map_udf(self): from pyspark.sql.functions import pandas_udf, PandasUDFType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala index 22b63513548fe..66888fce7f9f5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala @@ -133,6 +133,14 @@ private[arrow] abstract class ArrowFieldWriter { valueVector match { case fixedWidthVector: BaseFixedWidthVector => fixedWidthVector.reset() case variableWidthVector: BaseVariableWidthVector => variableWidthVector.reset() + case listVector: ListVector => + // Manual "reset" the underlying buffer. + // TODO: When we upgrade to Arrow 0.10.0, we can simply remove this and call + // `listVector.reset()`. + val buffers = listVector.getBuffers(false) + buffers.foreach(buf => buf.setZero(0, buf.capacity())) + listVector.setValueCount(0) + listVector.setLastSet(0) case _ => } count = 0 From d4a892af53266098affa9cd444e8e3b639dfc851 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Wed, 16 May 2018 13:34:54 -0700 Subject: [PATCH 455/566] [SPARK-23601][BUILD][FOLLOW-UP] Keep md5 checksums for nexus artifacts. The repository.apache.org server still requires md5 checksums or it won't publish the staging repo. Author: Marcelo Vanzin Closes #21338 from vanzin/SPARK-23601. (cherry picked from commit 8e60a16b73490007fe1c480d77cc09d760f0a02b) Signed-off-by: Marcelo Vanzin --- dev/create-release/release-build.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index c00b00b845401..5faa3d3260a56 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -371,11 +371,18 @@ if [[ "$1" == "publish-release" ]]; then find . -type f |grep -v \.jar |grep -v \.pom | xargs rm echo "Creating hash and signature files" - # this must have .asc and .sha1 - it really doesn't like anything else there + # this must have .asc, .md5 and .sha1 - it really doesn't like anything else there for file in $(find . 
-type f) do echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --output $file.asc \ --detach-sig --armour $file; + if [ $(command -v md5) ]; then + # Available on OS X; -q to keep only hash + md5 -q $file > $file.md5 + else + # Available on Linux; cut to keep only hash + md5sum $file | cut -f1 -d' ' > $file.md5 + fi sha1sum $file | cut -f1 -d' ' > $file.sha1 done From 1708de27e84fa722510b6220471dd1746a4f581f Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Thu, 17 May 2018 22:19:03 +0800 Subject: [PATCH 456/566] [SPARK-24002][SQL][BACKPORT-2.3] Task not serializable caused by org.apache.parquet.io.api.Binary$ByteBufferBackedBinary.getBytes This PR is to backport https://github.com/apache/spark/pull/21086 to Apache Spark 2.3 ---- ``` Py4JJavaError: An error occurred while calling o153.sql. : org.apache.spark.SparkException: Job aborted. at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:223) at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:189) at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70) at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68) at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:79) at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:190) at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:190) at org.apache.spark.sql.Dataset$$anonfun$59.apply(Dataset.scala:3021) at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:89) at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:127) at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3020) at org.apache.spark.sql.Dataset.(Dataset.scala:190) at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:74) at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:646) at sun.reflect.GeneratedMethodAccessor153.invoke(Unknown Source) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380) at py4j.Gateway.invoke(Gateway.java:293) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:226) at java.lang.Thread.run(Thread.java:748) Caused by: org.apache.spark.SparkException: Exception thrown in Future.get: at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:190) at org.apache.spark.sql.execution.InputAdapter.doExecuteBroadcast(WholeStageCodegenExec.scala:267) at org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec.doConsume(BroadcastNestedLoopJoinExec.scala:530) at org.apache.spark.sql.execution.CodegenSupport$class.consume(WholeStageCodegenExec.scala:155) at org.apache.spark.sql.execution.ProjectExec.consume(basicPhysicalOperators.scala:37) at org.apache.spark.sql.execution.ProjectExec.doConsume(basicPhysicalOperators.scala:69) at org.apache.spark.sql.execution.CodegenSupport$class.consume(WholeStageCodegenExec.scala:155) at org.apache.spark.sql.execution.FilterExec.consume(basicPhysicalOperators.scala:144) ... 
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:190) ... 23 more Caused by: java.util.concurrent.ExecutionException: org.apache.spark.SparkException: Task not serializable at java.util.concurrent.FutureTask.report(FutureTask.java:122) at java.util.concurrent.FutureTask.get(FutureTask.java:206) at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:179) ... 276 more Caused by: org.apache.spark.SparkException: Task not serializable at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:340) at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:330) at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:156) at org.apache.spark.SparkContext.clean(SparkContext.scala:2380) at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:850) at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:849) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) at org.apache.spark.rdd.RDD.withScope(RDD.scala:371) at org.apache.spark.rdd.RDD.mapPartitionsWithIndex(RDD.scala:849) at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:417) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:123) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118) at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$3.apply(SparkPlan.scala:152) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:149) at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:118) at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.prepareShuffleDependency(ShuffleExchangeExec.scala:89) at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:125) at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:116) at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52) at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:116) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:123) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118) at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$3.apply(SparkPlan.scala:152) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:149) at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:118) at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:271) at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:181) at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:414) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:123) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118) at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$3.apply(SparkPlan.scala:152) at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:149) at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:118) at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:61) at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:70) at org.apache.spark.sql.execution.SparkPlan.executeCollectResult(SparkPlan.scala:264) at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec$$anon$1$$anonfun$call$1.apply(BroadcastExchangeExec.scala:93) at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec$$anon$1$$anonfun$call$1.apply(BroadcastExchangeExec.scala:81) at org.apache.spark.sql.execution.SQLExecution$.withExecutionId(SQLExecution.scala:150) at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec$$anon$1.call(BroadcastExchangeExec.scala:80) at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec$$anon$1.call(BroadcastExchangeExec.scala:76) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ... 1 more Caused by: java.nio.BufferUnderflowException at java.nio.HeapByteBuffer.get(HeapByteBuffer.java:151) at java.nio.ByteBuffer.get(ByteBuffer.java:715) at org.apache.parquet.io.api.Binary$ByteBufferBackedBinary.getBytes(Binary.java:405) at org.apache.parquet.io.api.Binary$ByteBufferBackedBinary.getBytesUnsafe(Binary.java:414) at org.apache.parquet.io.api.Binary$ByteBufferBackedBinary.writeObject(Binary.java:484) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at java.io.ObjectStreamClass.invokeWriteObject(ObjectStreamClass.java:1128) at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1496) ``` The Parquet filters are serializable but not thread safe. SparkPlan.prepare() could be called in different threads (BroadcastExchange will call it in a thread pool). Thus, we could serialize the same Parquet filter at the same time. This is not easily reproduced. The fix is to avoid serializing these Parquet filters in the driver. This PR is to avoid serializing these Parquet filters by moving the parquet filter generation from the driver to executors. ## How was this patch tested? Having two queries one is a 1000-line SQL query and a 3000-line SQL query. Need to run at least one hour with a heavy write workload to reproduce once. Author: gatorsmile Closes #21351 from gatorsmile/backportSPARK-24002. 
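As a hedged sketch of the plan shape in the stack trace above (assuming a PySpark shell with write access to /tmp; the path, data and sizes are made up, and this alone will not reproduce the race, which also needs the same filter objects being serialized from several driver threads at once):

```python
# Hedged sketch, not part of the original patch.
spark.range(1000).selectExpr("id", "cast(id AS string) AS name") \
    .write.mode("overwrite").parquet("/tmp/spark_24002_demo")

# A Parquet scan with a pushed-down string filter placed under a broadcast exchange:
# the reader closure capturing the pushed filter is then serialized from the broadcast
# thread pool, which is roughly where the Binary.writeObject call in the trace comes from.
filtered = spark.read.parquet("/tmp/spark_24002_demo").filter("name > '10'")
spark.range(1000).join(filtered.hint("broadcast"), "id").count()
```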
--- .../parquet/ParquetFileFormat.scala | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index a6129dafd2f77..b0ba21e47df45 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -321,19 +321,6 @@ class ParquetFileFormat SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, sparkSession.sessionState.conf.isParquetINT96AsTimestamp) - // Try to push down filters when filter push-down is enabled. - val pushed = - if (sparkSession.sessionState.conf.parquetFilterPushDown) { - filters - // Collects all converted Parquet filter predicates. Notice that not all predicates can be - // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap` - // is used here. - .flatMap(ParquetFilters.createFilter(requiredSchema, _)) - .reduceOption(FilterApi.and) - } else { - None - } - val broadcastedHadoopConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) @@ -350,12 +337,26 @@ class ParquetFileFormat sparkSession.sessionState.conf.parquetRecordFilterEnabled val timestampConversion: Boolean = sparkSession.sessionState.conf.isParquetINT96TimestampConversion + val enableParquetFilterPushDown: Boolean = + sparkSession.sessionState.conf.parquetFilterPushDown // Whole stage codegen (PhysicalRDD) is able to deal with batches directly val returningBatch = supportBatch(sparkSession, resultSchema) (file: PartitionedFile) => { assert(file.partitionValues.numFields == partitionSchema.size) + // Try to push down filters when filter push-down is enabled. + val pushed = if (enableParquetFilterPushDown) { + filters + // Collects all converted Parquet filter predicates. Notice that not all predicates can be + // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap` + // is used here. + .flatMap(ParquetFilters.createFilter(requiredSchema, _)) + .reduceOption(FilterApi.and) + } else { + None + } + val fileSplit = new FileSplit(new Path(new URI(file.filePath)), file.start, file.length, Array.empty) From 28973e152f7de73f82583ab373ffdecc55dcabc2 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Thu, 17 May 2018 12:07:58 +0800 Subject: [PATCH 457/566] [SPARK-21945][YARN][PYTHON] Make --py-files work with PySpark shell in Yarn client mode When we run _PySpark shell with Yarn client mode_, specified `--py-files` are not recognised in _driver side_. Here are the steps I took to check: ```bash $ cat /home/spark/tmp.py def testtest(): return 1 ``` ```bash $ ./bin/pyspark --master yarn --deploy-mode client --py-files /home/spark/tmp.py ``` ```python >>> def test(): ... import tmp ... return tmp.testtest() ... >>> spark.range(1).rdd.map(lambda _: test()).collect() # executor side [1] >>> test() # driver side Traceback (most recent call last): File "", line 1, in File "", line 2, in test ImportError: No module named tmp ``` Unlike Yarn cluster and client mode with Spark submit, when Yarn client mode with PySpark shell specifically, 1. It first runs Python shell via: https://github.com/apache/spark/blob/3cb82047f2f51af553df09b9323796af507d36f8/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java#L158 as pointed out by tgravescs in the JIRA. 
2. this triggers shell.py and submit another application to launch a py4j gateway: https://github.com/apache/spark/blob/209b9361ac8a4410ff797cff1115e1888e2f7e66/python/pyspark/java_gateway.py#L45-L60 3. it runs a Py4J gateway: https://github.com/apache/spark/blob/3cb82047f2f51af553df09b9323796af507d36f8/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala#L425 4. it copies (or downloads) --py-files into local temp directory: https://github.com/apache/spark/blob/3cb82047f2f51af553df09b9323796af507d36f8/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala#L365-L376 and then these files are set up to `spark.submit.pyFiles` 5. Py4J JVM is launched and then the Python paths are set via: https://github.com/apache/spark/blob/7013eea11cb32b1e0038dc751c485da5c94a484b/python/pyspark/context.py#L209-L216 However, these are not actually set because those files were copied into a tmp directory in 4. whereas this code path looks for `SparkFiles.getRootDirectory` where the files are stored only when `SparkContext.addFile()` is called. In other cluster mode, `spark.files` are set via: https://github.com/apache/spark/blob/3cb82047f2f51af553df09b9323796af507d36f8/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala#L554-L555 and those files are explicitly added via: https://github.com/apache/spark/blob/ecb8b383af1cf1b67f3111c148229e00c9c17c40/core/src/main/scala/org/apache/spark/SparkContext.scala#L395 So we are fine in other modes. In case of Yarn client and cluster with _submit_, these are manually being handled. In particular https://github.com/apache/spark/pull/6360 added most of the logics. In this case, the Python path looks manually set via, for example, `deploy.PythonRunner`. We don't use `spark.files` here. I tried to make an isolated approach as possible as I can: simply copy py file or zip files into `SparkFiles.getRootDirectory()` in driver side if not existing. Another possible way is to set `spark.files` but it does unnecessary stuff together and sounds a bit invasive. **Before** ```python >>> def test(): ... import tmp ... return tmp.testtest() ... >>> spark.range(1).rdd.map(lambda _: test()).collect() [1] >>> test() Traceback (most recent call last): File "", line 1, in File "", line 2, in test ImportError: No module named tmp ``` **After** ```python >>> def test(): ... import tmp ... return tmp.testtest() ... >>> spark.range(1).rdd.map(lambda _: test()).collect() [1] >>> test() 1 ``` I manually tested in standalone and yarn cluster with PySpark shell. .zip and .py files were also tested with the similar steps above. It's difficult to add a test. Author: hyukjinkwon Closes #21267 from HyukjinKwon/SPARK-21945. 
(cherry picked from commit 9a641e7f721d01d283afb09dccefaf32972d3c04) Signed-off-by: Marcelo Vanzin --- python/pyspark/context.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/python/pyspark/context.py b/python/pyspark/context.py index d1f89e8d16dd6..880559719d3a5 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -211,9 +211,21 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, for path in self._conf.get("spark.submit.pyFiles", "").split(","): if path != "": (dirname, filename) = os.path.split(path) - if filename[-4:].lower() in self.PACKAGE_EXTENSIONS: - self._python_includes.append(filename) - sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename)) + try: + filepath = os.path.join(SparkFiles.getRootDirectory(), filename) + if not os.path.exists(filepath): + # In case of YARN with shell mode, 'spark.submit.pyFiles' files are + # not added via SparkContext.addFile. Here we check if the file exists, + # try to copy and then add it to the path. See SPARK-21945. + shutil.copyfile(path, filepath) + if filename[-4:].lower() in self.PACKAGE_EXTENSIONS: + self._python_includes.append(filename) + sys.path.insert(1, filepath) + except Exception: + warnings.warn( + "Failed to add file [%s] speficied in 'spark.submit.pyFiles' to " + "Python path:\n %s" % (path, "\n ".join(sys.path)), + RuntimeWarning) # Create a temporary directory inside spark.local.dir: local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf()) From 895c95e5bde86099c0ed01edce00d5d87b192cbb Mon Sep 17 00:00:00 2001 From: Artem Rudoy Date: Thu, 17 May 2018 18:49:46 +0800 Subject: [PATCH 458/566] [SPARK-22371][CORE] Return None instead of throwing an exception when an accumulator is garbage collected. ## What changes were proposed in this pull request? There's a period of time when an accumulator has been garbage collected, but hasn't been removed from AccumulatorContext.originals by ContextCleaner. When an update is received for such accumulator it will throw an exception and kill the whole job. This can happen when a stage completes, but there're still running tasks from other attempts, speculation etc. Since AccumulatorContext.get() returns an option we can just return None in such case. ## How was this patch tested? Unit test. Author: Artem Rudoy Closes #21114 from artemrd/SPARK-22371. (cherry picked from commit 6c35865d949a8b46f654cd53c7e5f3288def18d0) Signed-off-by: Wenchen Fan --- .../org/apache/spark/util/AccumulatorV2.scala | 14 +++++++++----- .../scala/org/apache/spark/AccumulatorSuite.scala | 6 ++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala index 2bc84953a56eb..3b469a69437b9 100644 --- a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala +++ b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala @@ -24,6 +24,7 @@ import java.util.concurrent.ConcurrentHashMap import java.util.concurrent.atomic.AtomicLong import org.apache.spark.{InternalAccumulator, SparkContext, TaskContext} +import org.apache.spark.internal.Logging import org.apache.spark.scheduler.AccumulableInfo private[spark] case class AccumulatorMetadata( @@ -211,7 +212,7 @@ abstract class AccumulatorV2[IN, OUT] extends Serializable { /** * An internal class used to track accumulators by Spark itself. 
*/ -private[spark] object AccumulatorContext { +private[spark] object AccumulatorContext extends Logging { /** * This global map holds the original accumulator objects that are created on the driver. @@ -258,13 +259,16 @@ private[spark] object AccumulatorContext { * Returns the [[AccumulatorV2]] registered with the given ID, if any. */ def get(id: Long): Option[AccumulatorV2[_, _]] = { - Option(originals.get(id)).map { ref => - // Since we are storing weak references, we must check whether the underlying data is valid. + val ref = originals.get(id) + if (ref eq null) { + None + } else { + // Since we are storing weak references, warn when the underlying data is not valid. val acc = ref.get if (acc eq null) { - throw new IllegalStateException(s"Attempted to access garbage collected accumulator $id") + logWarning(s"Attempted to access garbage collected accumulator $id") } - acc + Option(acc) } } diff --git a/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala b/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala index 3990ee1ec326d..5d0ffd92647bc 100644 --- a/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala @@ -209,10 +209,8 @@ class AccumulatorSuite extends SparkFunSuite with Matchers with LocalSparkContex System.gc() assert(ref.get.isEmpty) - // Getting a garbage collected accum should throw error - intercept[IllegalStateException] { - AccumulatorContext.get(accId) - } + // Getting a garbage collected accum should return None. + assert(AccumulatorContext.get(accId).isEmpty) // Getting a normal accumulator. Note: this has to be separate because referencing an // accumulator above in an `assert` would keep it from being garbage collected. From d88f3e474063010231a5a5d95d192c7796af01e4 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 18 May 2018 11:14:22 -0700 Subject: [PATCH 459/566] [SPARK-23850][SQL] Add separate config for SQL options redaction. The old code was relying on a core configuration and extended its default value to include things that redact desired things in the app's environment. Instead, add a SQL-specific option for which options to redact, and apply both the core and SQL-specific rules when redacting the options in the save command. This is a little sub-optimal since it adds another config, but it retains the current default behavior. While there I also fixed a typo and a couple of minor config API usage issues in the related redaction option that SQL already had. Tested with existing unit tests, plus checking the env page on a shell UI. Author: Marcelo Vanzin Closes #21158 from vanzin/SPARK-23850. 
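As a small usage sketch (assuming an active `spark` session; only the config key below comes from this change, the pattern value is made up):

```python
# Hedged sketch, not part of the patch. The SQL-specific pattern is applied on top of
# the core spark.redaction.regex rules when the options map of a data source command
# (for example a JDBC "url" containing credentials) is rendered in explain output,
# so sensitive option values are replaced with a placeholder instead of being shown.
spark.conf.set("spark.sql.redaction.options.regex", "(?i)url|password")
```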
(cherry picked from commit ed7ba7db8fa344ff182b72d23ae458e711f63432) Signed-off-by: Marcelo Vanzin --- .../spark/internal/config/package.scala | 2 +- .../apache/spark/sql/internal/SQLConf.scala | 24 +++++++++++++++++-- .../sql/execution/DataSourceScanExec.scala | 2 +- .../spark/sql/execution/QueryExecution.scala | 2 +- .../SaveIntoDataSourceCommand.scala | 5 ++-- .../SaveIntoDataSourceCommandSuite.scala | 3 --- 6 files changed, 27 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index b180a40239f9c..11cb5d39e5647 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -332,7 +332,7 @@ package object config { "a property key or value, the value is redacted from the environment UI and various logs " + "like YARN and event logs.") .regexConf - .createWithDefault("(?i)secret|password|url|user|username".r) + .createWithDefault("(?i)secret|password".r) private[spark] val STRING_REDACTION_PATTERN = ConfigBuilder("spark.redaction.string.regex") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index eebb4c754819d..74e0f609e2e52 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1103,8 +1103,17 @@ object SQLConf { .booleanConf .createWithDefault(true) + val SQL_OPTIONS_REDACTION_PATTERN = + buildConf("spark.sql.redaction.options.regex") + .doc("Regex to decide which keys in a Spark SQL command's options map contain sensitive " + + "information. The values of options whose names that match this regex will be redacted " + + "in the explain output. This redaction is applied on top of the global redaction " + + s"configuration defined by ${SECRET_REDACTION_PATTERN.key}.") + .regexConf + .createWithDefault("(?i)url".r) + val SQL_STRING_REDACTION_PATTERN = - ConfigBuilder("spark.sql.redaction.string.regex") + buildConf("spark.sql.redaction.string.regex") .doc("Regex to decide which parts of strings produced by Spark contain sensitive " + "information. When this regex matches a string part, that string part is replaced by a " + "dummy value. This is currently used to redact the output of SQL explain commands. " + @@ -1331,7 +1340,7 @@ class SQLConf extends Serializable with Logging { def fileCompressionFactor: Double = getConf(FILE_COMRESSION_FACTOR) - def stringRedationPattern: Option[Regex] = SQL_STRING_REDACTION_PATTERN.readFrom(reader) + def stringRedactionPattern: Option[Regex] = getConf(SQL_STRING_REDACTION_PATTERN) def sortBeforeRepartition: Boolean = getConf(SORT_BEFORE_REPARTITION) @@ -1630,6 +1639,17 @@ class SQLConf extends Serializable with Logging { }.toSeq } + /** + * Redacts the given option map according to the description of SQL_OPTIONS_REDACTION_PATTERN. + */ + def redactOptions(options: Map[String, String]): Map[String, String] = { + val regexes = Seq( + getConf(SQL_OPTIONS_REDACTION_PATTERN), + SECRET_REDACTION_PATTERN.readFrom(reader)) + + regexes.foldLeft(options.toSeq) { case (opts, r) => Utils.redact(Some(r), opts) }.toMap + } + /** * Return whether a given key is set in this [[SQLConf]]. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index aa66ee7e948ea..1a98fe0f3fa33 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -69,7 +69,7 @@ trait DataSourceScanExec extends LeafExecNode with CodegenSupport { * Shorthand for calling redactString() without specifying redacting rules */ private def redact(text: String): String = { - Utils.redact(sqlContext.sessionState.conf.stringRedationPattern, text) + Utils.redact(sqlContext.sessionState.conf.stringRedactionPattern, text) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index 15379a0663f7d..3112b306c365e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -225,7 +225,7 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { * Redact the sensitive information in the given string. */ private def withRedaction(message: String): String = { - Utils.redact(sparkSession.sessionState.conf.stringRedationPattern, message) + Utils.redact(sparkSession.sessionState.conf.stringRedactionPattern, message) } /** A special namespace for commands that can be used to debug query execution. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommand.scala index 568e953a5db66..00b1b5dedb593 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommand.scala @@ -17,13 +17,12 @@ package org.apache.spark.sql.execution.datasources -import org.apache.spark.SparkEnv import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.command.RunnableCommand +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.CreatableRelationProvider -import org.apache.spark.util.Utils /** * Saves the results of `query` in to a data source. 
@@ -50,7 +49,7 @@ case class SaveIntoDataSourceCommand( } override def simpleString: String = { - val redacted = Utils.redact(SparkEnv.get.conf, options.toSeq).toMap + val redacted = SQLConf.get.redactOptions(options) s"SaveIntoDataSourceCommand ${dataSource}, ${redacted}, ${mode}" } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala index 4b3ca8e60cab6..a1da3ec43eae3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala @@ -23,9 +23,6 @@ import org.apache.spark.sql.test.SharedSQLContext class SaveIntoDataSourceCommandSuite extends SharedSQLContext { - override protected def sparkConf: SparkConf = super.sparkConf - .set("spark.redaction.regex", "(?i)password|url") - test("simpleString is redacted") { val URL = "connection.url" val PASS = "123" From 70b86654857a93deecc1e97f0af9faa7ef91326d Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Mon, 21 May 2018 16:26:39 -0700 Subject: [PATCH 460/566] [SPARK-24309][CORE] AsyncEventQueue should stop on interrupt. EventListeners can interrupt the event queue thread. In particular, when the EventLoggingListener writes to hdfs, hdfs can interrupt the thread. When there is an interrupt, the queue should be removed and stop accepting any more events. Before this change, the queue would continue to take more events (till it was full), and then would not stop when the application was complete because the PoisonPill couldn't be added. Added a unit test which failed before this change. Author: Imran Rashid Closes #21356 from squito/SPARK-24309. (cherry picked from commit 32447079e9d0fa9f7e180b94ecac19091b6af1ab) Signed-off-by: Marcelo Vanzin --- .../spark/scheduler/AsyncEventQueue.scala | 41 ++++++++------ .../spark/scheduler/LiveListenerBus.scala | 2 +- .../org/apache/spark/util/ListenerBus.scala | 18 +++++++ .../spark/scheduler/SparkListenerSuite.scala | 54 +++++++++++++++++++ 4 files changed, 98 insertions(+), 17 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala b/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala index c1fedd63f6a90..e2b6df4600590 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala @@ -34,7 +34,11 @@ import org.apache.spark.util.Utils * Delivery will only begin when the `start()` method is called. The `stop()` method should be * called when no more events need to be delivered. 
*/ -private class AsyncEventQueue(val name: String, conf: SparkConf, metrics: LiveListenerBusMetrics) +private class AsyncEventQueue( + val name: String, + conf: SparkConf, + metrics: LiveListenerBusMetrics, + bus: LiveListenerBus) extends SparkListenerBus with Logging { @@ -81,23 +85,18 @@ private class AsyncEventQueue(val name: String, conf: SparkConf, metrics: LiveLi } private def dispatch(): Unit = LiveListenerBus.withinListenerThread.withValue(true) { - try { - var next: SparkListenerEvent = eventQueue.take() - while (next != POISON_PILL) { - val ctx = processingTime.time() - try { - super.postToAll(next) - } finally { - ctx.stop() - } - eventCount.decrementAndGet() - next = eventQueue.take() + var next: SparkListenerEvent = eventQueue.take() + while (next != POISON_PILL) { + val ctx = processingTime.time() + try { + super.postToAll(next) + } finally { + ctx.stop() } eventCount.decrementAndGet() - } catch { - case ie: InterruptedException => - logInfo(s"Stopping listener queue $name.", ie) + next = eventQueue.take() } + eventCount.decrementAndGet() } override protected def getTimer(listener: SparkListenerInterface): Option[Timer] = { @@ -130,7 +129,11 @@ private class AsyncEventQueue(val name: String, conf: SparkConf, metrics: LiveLi eventCount.incrementAndGet() eventQueue.put(POISON_PILL) } - dispatchThread.join() + // this thread might be trying to stop itself as part of error handling -- we can't join + // in that case. + if (Thread.currentThread() != dispatchThread) { + dispatchThread.join() + } } def post(event: SparkListenerEvent): Unit = { @@ -187,6 +190,12 @@ private class AsyncEventQueue(val name: String, conf: SparkConf, metrics: LiveLi true } + override def removeListenerOnError(listener: SparkListenerInterface): Unit = { + // the listener failed in an unrecoverably way, we want to remove it from the entire + // LiveListenerBus (potentially stopping a queue if it is empty) + bus.removeListener(listener) + } + } private object AsyncEventQueue { diff --git a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala index ba6387a8f08ad..d135190d1e919 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala @@ -102,7 +102,7 @@ private[spark] class LiveListenerBus(conf: SparkConf) { queue.addListener(listener) case None => - val newQueue = new AsyncEventQueue(queue, conf, metrics) + val newQueue = new AsyncEventQueue(queue, conf, metrics, this) newQueue.addListener(listener) if (started.get()) { newQueue.start(sparkContext) diff --git a/core/src/main/scala/org/apache/spark/util/ListenerBus.scala b/core/src/main/scala/org/apache/spark/util/ListenerBus.scala index 76a56298aaebc..4a7798434680e 100644 --- a/core/src/main/scala/org/apache/spark/util/ListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/util/ListenerBus.scala @@ -60,6 +60,15 @@ private[spark] trait ListenerBus[L <: AnyRef, E] extends Logging { } } + /** + * This can be overriden by subclasses if there is any extra cleanup to do when removing a + * listener. In particular AsyncEventQueues can clean up queues in the LiveListenerBus. + */ + def removeListenerOnError(listener: L): Unit = { + removeListener(listener) + } + + /** * Post the event to all registered listeners. The `postToAll` caller should guarantee calling * `postToAll` in the same thread for all events. 
@@ -80,7 +89,16 @@ private[spark] trait ListenerBus[L <: AnyRef, E] extends Logging { } try { doPostEvent(listener, event) + if (Thread.interrupted()) { + // We want to throw the InterruptedException right away so we can associate the interrupt + // with this listener, as opposed to waiting for a queue.take() etc. to detect it. + throw new InterruptedException() + } } catch { + case ie: InterruptedException => + logError(s"Interrupted while posting to ${Utils.getFormattedClassName(listener)}. " + + s"Removing that listener.", ie) + removeListenerOnError(listener) case NonFatal(e) => logError(s"Listener ${Utils.getFormattedClassName(listener)} threw an exception", e) } finally { diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala index fa47a52bbbc47..6ffd1e84f7adb 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala @@ -489,6 +489,48 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match assert(bus.findListenersByClass[BasicJobCounter]().isEmpty) } + Seq(true, false).foreach { throwInterruptedException => + val suffix = if (throwInterruptedException) "throw interrupt" else "set Thread interrupted" + test(s"interrupt within listener is handled correctly: $suffix") { + val conf = new SparkConf(false) + .set(LISTENER_BUS_EVENT_QUEUE_CAPACITY, 5) + val bus = new LiveListenerBus(conf) + val counter1 = new BasicJobCounter() + val counter2 = new BasicJobCounter() + val interruptingListener1 = new InterruptingListener(throwInterruptedException) + val interruptingListener2 = new InterruptingListener(throwInterruptedException) + bus.addToSharedQueue(counter1) + bus.addToSharedQueue(interruptingListener1) + bus.addToStatusQueue(counter2) + bus.addToEventLogQueue(interruptingListener2) + assert(bus.activeQueues() === Set(SHARED_QUEUE, APP_STATUS_QUEUE, EVENT_LOG_QUEUE)) + assert(bus.findListenersByClass[BasicJobCounter]().size === 2) + assert(bus.findListenersByClass[InterruptingListener]().size === 2) + + bus.start(mockSparkContext, mockMetricsSystem) + + // after we post one event, both interrupting listeners should get removed, and the + // event log queue should be removed + bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) + bus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + assert(bus.activeQueues() === Set(SHARED_QUEUE, APP_STATUS_QUEUE)) + assert(bus.findListenersByClass[BasicJobCounter]().size === 2) + assert(bus.findListenersByClass[InterruptingListener]().size === 0) + assert(counter1.count === 1) + assert(counter2.count === 1) + + // posting more events should be fine, they'll just get processed from the OK queue. + (0 until 5).foreach { _ => bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) } + bus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + assert(counter1.count === 6) + assert(counter2.count === 6) + + // Make sure stopping works -- this requires putting a poison pill in all active queues, which + // would fail if our interrupted queue was still active, as its queue would be full. + bus.stop() + } + } + /** * Assert that the given list of numbers has an average that is greater than zero. */ @@ -547,6 +589,18 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = { throw new Exception } } + /** + * A simple listener that interrupts on job end. 
+ */ + private class InterruptingListener(val throwInterruptedException: Boolean) extends SparkListener { + override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = { + if (throwInterruptedException) { + throw new InterruptedException("got interrupted") + } else { + Thread.currentThread().interrupt() + } + } + } } // These classes can't be declared inside of the SparkListenerSuite class because we don't want From 93258d8057158ae562fe6c96583e86eaba8d6b64 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 22 May 2018 09:37:04 -0700 Subject: [PATCH 461/566] Preparing Spark release v2.3.1-rc2 --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 8df26351614ec..632bcb342caeb 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.2 +Version: 2.3.1 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. 
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index 02bf39bcb96f3..d744c8b4e2e0f 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 646fdfb23ef19..3a41e1627993f 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 76c7dcf52203f..f02108f03b74b 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index f2661fedd6f75..44304870bced2 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 229d4667169d2..167dc7aa503a7 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index febec1897c59c..ffb770ea6769e 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 4dec96f86d9c3..ddbfb49fe0021 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 7187313c5c2f7..ce7d67828e0b0 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index a9cc91f87566b..506b2740817af 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index 8579166c2e635..daef97ce691d0 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.2-SNAPSHOT -SPARK_VERSION_SHORT: 2.3.2 +SPARK_VERSION: 2.3.1 +SPARK_VERSION_SHORT: 2.3.1 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index b873bc9b20322..7974a3847f01b 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index b7dc48e4a7001..33f72dfe1bacc 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 161fd0faac82a..b5ac7aee37049 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 26abc31f9620d..9118ad274cf44 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index bcc13ebb359b3..73b54bc15418a 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 020e0ba7d568b..815cab9c4ef8d 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index e12115ee6e20c..475cf7ff39d17 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 07d205b01dfde..553b73390a220 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 11d971a0de046..f911d0191b0f7 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 70eb580d474af..4cfc4577fe09d 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 0728baede26f0..b385bacf1027e 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index c6925851ebc23..416b40c2a798a 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 8b95f1be216c7..8c99aa3a1a97f 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index b5253fd8954f3..74535db06ad0a 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index dacd42126393b..7e93d8925a505 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 2fcb367cf301c..5e2d8503b8a1e 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 8ecd126ebb9b2..dd4825c19a7ad 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index f52b83475c515..5160d0b6e8b85 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/pom.xml b/pom.xml index f4335f18c6350..3c445211a3d74 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index ed1cbdd58f252..49c29a0042617 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.2.dev0" +__version__ = "2.3.1" diff --git a/repl/pom.xml b/repl/pom.xml index bc10cce2d1aea..84802101496bd 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 3e0aa0872b99b..4457605fc1515 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index cf153476ce9f9..7e47916dcacaa 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 71ddbb5d4db08..78bfce4e429f9 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 04a3380b02115..5a1657d109812 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index c9a05cc9e43f6..b3fa6535dfc37 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 8a0c6fba66cda..f0b7d5e23a569 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 2eeff747a68c6..6b9302c4eb92a 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index d16f48dec205f..a84af92d1c39b 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index f5927c5e41939..1c10777ce8286 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml From efe183f7b97659221c99e3838a9ddb69841ec27a Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 22 May 2018 09:37:08 -0700 Subject: [PATCH 462/566] Preparing development version 2.3.2-SNAPSHOT --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- 
external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 632bcb342caeb..8df26351614ec 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.1 +Version: 2.3.2 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index d744c8b4e2e0f..02bf39bcb96f3 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 3a41e1627993f..646fdfb23ef19 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index f02108f03b74b..76c7dcf52203f 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 44304870bced2..f2661fedd6f75 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 167dc7aa503a7..229d4667169d2 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index ffb770ea6769e..febec1897c59c 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index ddbfb49fe0021..4dec96f86d9c3 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index ce7d67828e0b0..7187313c5c2f7 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 506b2740817af..a9cc91f87566b 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index daef97ce691d0..8579166c2e635 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.1 -SPARK_VERSION_SHORT: 2.3.1 +SPARK_VERSION: 2.3.2-SNAPSHOT +SPARK_VERSION_SHORT: 2.3.2 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 7974a3847f01b..b873bc9b20322 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 33f72dfe1bacc..b7dc48e4a7001 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index b5ac7aee37049..161fd0faac82a 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 9118ad274cf44..26abc31f9620d 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 73b54bc15418a..bcc13ebb359b3 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 815cab9c4ef8d..020e0ba7d568b 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 475cf7ff39d17..e12115ee6e20c 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 553b73390a220..07d205b01dfde 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index f911d0191b0f7..11d971a0de046 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 4cfc4577fe09d..70eb580d474af 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index b385bacf1027e..0728baede26f0 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 416b40c2a798a..c6925851ebc23 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 8c99aa3a1a97f..8b95f1be216c7 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 74535db06ad0a..b5253fd8954f3 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 7e93d8925a505..dacd42126393b 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 5e2d8503b8a1e..2fcb367cf301c 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index dd4825c19a7ad..8ecd126ebb9b2 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 5160d0b6e8b85..f52b83475c515 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 3c445211a3d74..f4335f18c6350 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 49c29a0042617..ed1cbdd58f252 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.1" +__version__ = "2.3.2.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index 84802101496bd..bc10cce2d1aea 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 4457605fc1515..3e0aa0872b99b 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 7e47916dcacaa..cf153476ce9f9 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 78bfce4e429f9..71ddbb5d4db08 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 5a1657d109812..04a3380b02115 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index b3fa6535dfc37..c9a05cc9e43f6 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index f0b7d5e23a569..8a0c6fba66cda 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 6b9302c4eb92a..2eeff747a68c6 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index a84af92d1c39b..d16f48dec205f 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 1c10777ce8286..f5927c5e41939 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml From ed0060ce786c78dd49099f55ab85412a01621184 Mon Sep 17 00:00:00 2001 From: Seth Fitzsimmons Date: Wed, 23 May 2018 09:14:03 +0800 Subject: [PATCH 463/566] Correct reference to Offset class This is a documentation-only correction; `org.apache.spark.sql.sources.v2.reader.Offset` is actually `org.apache.spark.sql.sources.v2.reader.streaming.Offset`. Author: Seth Fitzsimmons Closes #21387 from mojodna/patch-1. 
(cherry picked from commit 00c13cfad78607fde0787c9d494f0df8ab7051ba) Signed-off-by: hyukjinkwon --- .../org/apache/spark/sql/execution/streaming/Offset.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Offset.java b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Offset.java index 80aa5505db991..43ad4b3384ec3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Offset.java +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Offset.java @@ -19,8 +19,8 @@ /** * This is an internal, deprecated interface. New source implementations should use the - * org.apache.spark.sql.sources.v2.reader.Offset class, which is the one that will be supported - * in the long term. + * org.apache.spark.sql.sources.v2.reader.streaming.Offset class, which is the one that will be + * supported in the long term. * * This class will be removed in a future release. */ From ded6709dd96ab24f4f4c41ac5fecb2d6022a3796 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Wed, 23 May 2018 22:26:39 +0800 Subject: [PATCH 464/566] [SPARK-24313][SQL][BACKPORT-2.3] Fix collection operations' interpreted evaluation for complex types ## What changes were proposed in this pull request? The interpreted evaluation of several collection operations works only for simple datatypes. For complex data types, for instance, `array_contains` it returns always `false`. The list of the affected functions is `array_contains` and `GetMapValue`. The PR fixes the behavior for all the datatypes. ## How was this patch tested? added UT Author: Marco Gaido Closes #21407 from mgaido91/SPARK-24313_2.3. --- .../expressions/collectionOperations.scala | 9 ++++--- .../expressions/complexTypeExtractors.scala | 15 +++++++++-- .../CollectionExpressionsSuite.scala | 27 +++++++++++++++++++ .../optimizer/complexTypesSuite.scala | 13 +++++++++ .../org/apache/spark/sql/DataFrameSuite.scala | 5 ++++ 5 files changed, 64 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 4270b987d6de0..d5e94d7348a39 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -21,7 +21,7 @@ import java.util.Comparator import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodegenFallback, ExprCode} -import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData} +import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, TypeUtils} import org.apache.spark.sql.types._ /** @@ -227,6 +227,9 @@ case class ArrayContains(left: Expression, right: Expression) override def dataType: DataType = BooleanType + @transient private lazy val ordering: Ordering[Any] = + TypeUtils.getInterpretedOrdering(right.dataType) + override def inputTypes: Seq[AbstractDataType] = right.dataType match { case NullType => Seq.empty case _ => left.dataType match { @@ -243,7 +246,7 @@ case class ArrayContains(left: Expression, right: Expression) TypeCheckResult.TypeCheckFailure( "Arguments must be an array followed by a value of same type as the array members") } else 
{ - TypeCheckResult.TypeCheckSuccess + TypeUtils.checkForOrderingExpr(right.dataType, s"function $prettyName") } } @@ -256,7 +259,7 @@ case class ArrayContains(left: Expression, right: Expression) arr.asInstanceOf[ArrayData].foreach(right.dataType, (i, v) => if (v == null) { hasNull = true - } else if (v == value) { + } else if (ordering.equiv(v, value)) { return true } ) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala index 7e53ca3908905..eed25b5d96931 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala @@ -21,7 +21,7 @@ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} -import org.apache.spark.sql.catalyst.util.{quoteIdentifier, ArrayData, GenericArrayData, MapData} +import org.apache.spark.sql.catalyst.util.{quoteIdentifier, ArrayData, GenericArrayData, MapData, TypeUtils} import org.apache.spark.sql.types._ //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -301,7 +301,7 @@ case class GetMapValue(child: Expression, key: Expression) var i = 0 var found = false while (i < length && !found) { - if (keys.get(i, keyType) == ordinal) { + if (ordering.equiv(keys.get(i, keyType), ordinal)) { found = true } else { i += 1 @@ -352,4 +352,15 @@ case class GetMapValue(child: Expression, key: Expression) """ }) } + + @transient private lazy val ordering: Ordering[Any] = + TypeUtils.getInterpretedOrdering(keyType) + + override def checkInputDataTypes(): TypeCheckResult = { + super.checkInputDataTypes() match { + case f: TypeCheckResult.TypeCheckFailure => f + case TypeCheckResult.TypeCheckSuccess => + TypeUtils.checkForOrderingExpr(keyType, s"function $prettyName") + } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala index 020687e4b3a27..83e96ee331ed1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala @@ -104,5 +104,32 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(ArrayContains(a3, Literal("")), null) checkEvaluation(ArrayContains(a3, Literal.create(null, StringType)), null) + + // binary + val b0 = Literal.create(Seq[Array[Byte]](Array[Byte](5, 6), Array[Byte](1, 2)), + ArrayType(BinaryType)) + val b1 = Literal.create(Seq[Array[Byte]](Array[Byte](2, 1), Array[Byte](4, 3)), + ArrayType(BinaryType)) + val b2 = Literal.create(Seq[Array[Byte]](Array[Byte](2, 1), null), + ArrayType(BinaryType)) + val b3 = Literal.create(Seq[Array[Byte]](null, Array[Byte](1, 2)), + ArrayType(BinaryType)) + val be = Literal.create(Array[Byte](1, 2), BinaryType) + val nullBinary = Literal.create(null, BinaryType) + + checkEvaluation(ArrayContains(b0, be), true) + checkEvaluation(ArrayContains(b1, be), false) + checkEvaluation(ArrayContains(b0, nullBinary), null) + 
checkEvaluation(ArrayContains(b2, be), null) + checkEvaluation(ArrayContains(b3, be), true) + + // complex data types + val aa0 = Literal.create(Seq[Seq[Int]](Seq[Int](1, 2), Seq[Int](3, 4)), + ArrayType(ArrayType(IntegerType))) + val aa1 = Literal.create(Seq[Seq[Int]](Seq[Int](5, 6), Seq[Int](2, 1)), + ArrayType(ArrayType(IntegerType))) + val aae = Literal.create(Seq[Int](1, 2), ArrayType(IntegerType)) + checkEvaluation(ArrayContains(aa0, aae), true) + checkEvaluation(ArrayContains(aa1, aae), false) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala index de544ac314789..c4c21369746a3 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala @@ -331,4 +331,17 @@ class ComplexTypesSuite extends PlanTest with ExpressionEvalHelper { .analyze comparePlans(Optimizer execute rel, expected) } + + test("SPARK-24313: support binary type as map keys in GetMapValue") { + val mb0 = Literal.create( + Map(Array[Byte](1, 2) -> "1", Array[Byte](3, 4) -> null, Array[Byte](2, 1) -> "2"), + MapType(BinaryType, StringType)) + val mb1 = Literal.create(Map[Array[Byte], String](), MapType(BinaryType, StringType)) + + checkEvaluation(GetMapValue(mb0, Literal(Array[Byte](1, 2, 3))), null) + + checkEvaluation(GetMapValue(mb1, Literal(Array[Byte](1, 2))), null) + checkEvaluation(GetMapValue(mb0, Literal(Array[Byte](2, 1), BinaryType)), "2") + checkEvaluation(GetMapValue(mb0, Literal(Array[Byte](3, 4))), null) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index f7b3393f65cb1..fd624ad8ef68d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -2270,4 +2270,9 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { val df = spark.range(1).select($"id", new Column(Uuid())) checkAnswer(df, df.collect()) } + + test("SPARK-24313: access map with binary keys") { + val mapWithBinaryKey = map(lit(Array[Byte](1.toByte)), lit(1)) + checkAnswer(spark.range(1).select(mapWithBinaryKey.getItem(Array[Byte](1.toByte))), Row(1)) + } } From 3d2ae0ba866a3cb5b0681c2b8de67d5ead339712 Mon Sep 17 00:00:00 2001 From: sychen Date: Thu, 24 May 2018 11:02:09 +0800 Subject: [PATCH 465/566] [SPARK-24257][SQL] LongToUnsafeRowMap calculate the new size may be wrong LongToUnsafeRowMap has a mistake when growing its page array: it blindly grows to `oldSize * 2`, while the new record may be larger than `oldSize * 2`. Then we may have a malformed UnsafeRow when querying this map, whose actual data is smaller than its declared size, and the data is corrupted. Author: sychen Closes #21311 from cxzl25/fix_LongToUnsafeRowMap_page_size. 
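To make the sizing issue concrete, here is a minimal, self-contained sketch (illustrative names only, not the actual LongToUnsafeRowMap internals) of the word-count computation the fix relies on: the new page length must be at least what the incoming record needs, not merely double the old length.

```scala
// Sketch only: assumes the page is a Long array addressed from a fixed base offset
// (16 here, purely illustrative) and that each value is preceded by an 8-byte pointer.
object PageSizingSketch {
  private val LongArrayOffset = 16L

  // 8-byte words required after appending `recordBytes`, plus 8 bytes for the next-value pointer.
  def requiredWords(cursor: Long, recordBytes: Int): Long =
    (cursor - LongArrayOffset + 8 + recordBytes + 7) / 8

  // New page length in words: never below what is required, grow at least 2x, cap at 1 << 30.
  def newLength(currentWords: Int, cursor: Long, recordBytes: Int): Long = {
    val needed = requiredWords(cursor, recordBytes)
    require(needed <= (1L << 30), "would exceed the 8G page limit")
    math.max(needed, math.min(currentWords.toLong * 2, 1L << 30))
  }

  def main(args: Array[String]): Unit = {
    // A 5MB record against a 1MB (1 << 17 words) page: doubling alone yields 2MB,
    // which is still too small; taking the max with the required size is not.
    println(newLength(currentWords = 1 << 17, cursor = LongArrayOffset, recordBytes = 5 << 20))
  }
}
```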
(cherry picked from commit 888340151f737bb68d0e419b1e949f11469881f9) Signed-off-by: Wenchen Fan --- .../sql/execution/joins/HashedRelation.scala | 38 +++++++++++-------- .../execution/joins/HashedRelationSuite.scala | 26 ++++++++++++- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index 1465346eb802d..20ce01f4ce8cc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -557,7 +557,7 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap def append(key: Long, row: UnsafeRow): Unit = { val sizeInBytes = row.getSizeInBytes if (sizeInBytes >= (1 << SIZE_BITS)) { - sys.error("Does not support row that is larger than 256M") + throw new UnsupportedOperationException("Does not support row that is larger than 256M") } if (key < minKey) { @@ -567,19 +567,7 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap maxKey = key } - // There is 8 bytes for the pointer to next value - if (cursor + 8 + row.getSizeInBytes > page.length * 8L + Platform.LONG_ARRAY_OFFSET) { - val used = page.length - if (used >= (1 << 30)) { - sys.error("Can not build a HashedRelation that is larger than 8G") - } - ensureAcquireMemory(used * 8L * 2) - val newPage = new Array[Long](used * 2) - Platform.copyMemory(page, Platform.LONG_ARRAY_OFFSET, newPage, Platform.LONG_ARRAY_OFFSET, - cursor - Platform.LONG_ARRAY_OFFSET) - page = newPage - freeMemory(used * 8L) - } + grow(row.getSizeInBytes) // copy the bytes of UnsafeRow val offset = cursor @@ -615,7 +603,8 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap growArray() } else if (numKeys > array.length / 2 * 0.75) { // The fill ratio should be less than 0.75 - sys.error("Cannot build HashedRelation with more than 1/3 billions unique keys") + throw new UnsupportedOperationException( + "Cannot build HashedRelation with more than 1/3 billions unique keys") } } } else { @@ -626,6 +615,25 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap } } + private def grow(inputRowSize: Int): Unit = { + // There is 8 bytes for the pointer to next value + val neededNumWords = (cursor - Platform.LONG_ARRAY_OFFSET + 8 + inputRowSize + 7) / 8 + if (neededNumWords > page.length) { + if (neededNumWords > (1 << 30)) { + throw new UnsupportedOperationException( + "Can not build a HashedRelation that is larger than 8G") + } + val newNumWords = math.max(neededNumWords, math.min(page.length * 2, 1 << 30)) + ensureAcquireMemory(newNumWords * 8L) + val newPage = new Array[Long](newNumWords.toInt) + Platform.copyMemory(page, Platform.LONG_ARRAY_OFFSET, newPage, Platform.LONG_ARRAY_OFFSET, + cursor - Platform.LONG_ARRAY_OFFSET) + val used = page.length + page = newPage + freeMemory(used * 8L) + } + } + private def growArray(): Unit = { var old_array = array val n = array.length diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala index 51f8c3325fdff..037cc2e3ccad7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.serializer.KryoSerializer import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.test.SharedSQLContext -import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType} +import org.apache.spark.sql.types._ import org.apache.spark.unsafe.map.BytesToBytesMap import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.collection.CompactBuffer @@ -254,6 +254,30 @@ class HashedRelationSuite extends SparkFunSuite with SharedSQLContext { map.free() } + test("SPARK-24257: insert big values into LongToUnsafeRowMap") { + val taskMemoryManager = new TaskMemoryManager( + new StaticMemoryManager( + new SparkConf().set(MEMORY_OFFHEAP_ENABLED.key, "false"), + Long.MaxValue, + Long.MaxValue, + 1), + 0) + val unsafeProj = UnsafeProjection.create(Array[DataType](StringType)) + val map = new LongToUnsafeRowMap(taskMemoryManager, 1) + + val key = 0L + // the page array is initialized with length 1 << 17 (1M bytes), + // so here we need a value larger than 1 << 18 (2M bytes), to trigger the bug + val bigStr = UTF8String.fromString("x" * (1 << 19)) + + map.append(key, unsafeProj(InternalRow(bigStr))) + map.optimize() + + val resultRow = new UnsafeRow(1) + assert(map.getValue(key, resultRow).getUTF8String(0) === bigStr) + map.free() + } + test("Spark-14521") { val ser = new KryoSerializer( (new SparkConf).set("spark.kryo.referenceTracking", "false")).newInstance() From 75e2cd1313551a816480a6c830fb5fa73d21c426 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Thu, 24 May 2018 11:34:13 +0800 Subject: [PATCH 466/566] [SPARK-24322][BUILD] Upgrade Apache ORC to 1.4.4 ORC 1.4.4 includes [nine fixes](https://issues.apache.org/jira/issues/?filter=12342568&jql=project%20%3D%20ORC%20AND%20resolution%20%3D%20Fixed%20AND%20fixVersion%20%3D%201.4.4). One of the issues is about `Timestamp` bug (ORC-306) which occurs when `native` ORC vectorized reader reads ORC column vector's sub-vector `times` and `nanos`. ORC-306 fixes this according to the [original definition](https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java#L45-L46) and this PR includes the updated interpretation on ORC column vectors. Note that `hive` ORC reader and ORC MR reader is not affected. ```scala scala> spark.version res0: String = 2.3.0 scala> spark.sql("set spark.sql.orc.impl=native") scala> Seq(java.sql.Timestamp.valueOf("1900-05-05 12:34:56.000789")).toDF().write.orc("/tmp/orc") scala> spark.read.orc("/tmp/orc").show(false) +--------------------------+ |value | +--------------------------+ |1900-05-05 12:34:55.000789| +--------------------------+ ``` This PR aims to update Apache Spark to use it. **FULL LIST** ID | TITLE -- | -- ORC-281 | Fix compiler warnings from clang 5.0 ORC-301 | `extractFileTail` should open a file in `try` statement ORC-304 | Fix TestRecordReaderImpl to not fail with new storage-api ORC-306 | Fix incorrect workaround for bug in java.sql.Timestamp ORC-324 | Add support for ARM and PPC arch ORC-330 | Remove unnecessary Hive artifacts from root pom ORC-332 | Add syntax version to orc_proto.proto ORC-336 | Remove avro and parquet dependency management entries ORC-360 | Implement error checking on subtype fields in Java Pass the Jenkins. Author: Dongjoon Hyun Closes #21372 from dongjoon-hyun/SPARK_ORC144. 
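A small worked example of the timestamp arithmetic behind ORC-306, under the assumption (per the TimestampColumnVector definition referenced above) that `time` stores epoch milliseconds including the millisecond part and `nanos` stores the full nanosecond-of-second; the names below are illustrative, not Spark's reader code.

```scala
// Sketch: why the old micros conversion double-counted the millisecond part.
object OrcTimestampMicros {
  def oldMicros(timeMillis: Long, nanos: Int): Long = timeMillis * 1000 + nanos / 1000
  def newMicros(timeMillis: Long, nanos: Int): Long = timeMillis * 1000 + nanos / 1000 % 1000

  def main(args: Array[String]): Unit = {
    // A value of .123456789 seconds: time already ends in .123, nanos = 123456789.
    val subSecondMillis = 123L
    val nanos = 123456789
    println(oldMicros(subSecondMillis, nanos)) // 246456 micros: the millis are counted twice
    println(newMicros(subSecondMillis, nanos)) // 123456 micros: only the micro remainder is added
  }
}
```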
(cherry picked from commit 486ecc680e9a0e7b6b3c3a45fb883a61072096fc) Signed-off-by: Wenchen Fan --- dev/deps/spark-deps-hadoop-2.6 | 4 ++-- dev/deps/spark-deps-hadoop-2.7 | 4 ++-- pom.xml | 2 +- .../sql/execution/datasources/orc/OrcColumnVector.java | 2 +- .../datasources/orc/OrcColumnarBatchReader.java | 2 +- .../sql/execution/datasources/orc/OrcSourceSuite.scala | 9 +++++++++ 6 files changed, 16 insertions(+), 7 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index f4559a831de21..4f0794d6f1a11 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -156,8 +156,8 @@ objenesis-2.1.jar okhttp-3.8.1.jar okio-1.13.0.jar opencsv-2.3.jar -orc-core-1.4.3-nohive.jar -orc-mapreduce-1.4.3-nohive.jar +orc-core-1.4.4-nohive.jar +orc-mapreduce-1.4.4-nohive.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index c2df998e1fa64..df2be777ff5ac 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -157,8 +157,8 @@ objenesis-2.1.jar okhttp-3.8.1.jar okio-1.13.0.jar opencsv-2.3.jar -orc-core-1.4.3-nohive.jar -orc-mapreduce-1.4.3-nohive.jar +orc-core-1.4.4-nohive.jar +orc-mapreduce-1.4.4-nohive.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar diff --git a/pom.xml b/pom.xml index f4335f18c6350..0e9354891f1a3 100644 --- a/pom.xml +++ b/pom.xml @@ -130,7 +130,7 @@ 1.2.1 10.12.1.1 1.8.3 - 1.4.3 + 1.4.4 nohive 1.6.0 9.3.20.v20170531 diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java index 12f4d658b1868..9bfad1e83ee7b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java @@ -136,7 +136,7 @@ public int getInt(int rowId) { public long getLong(int rowId) { int index = getRowIndex(rowId); if (isTimestamp) { - return timestampData.time[index] * 1000 + timestampData.nanos[index] / 1000; + return timestampData.time[index] * 1000 + timestampData.nanos[index] / 1000 % 1000; } else { return longData.vector[index]; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java index 5e7cad470e1d1..94de59ec4875c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java @@ -495,7 +495,7 @@ private void putValues( * Returns the number of micros since epoch from an element of TimestampColumnVector. 
*/ private static long fromTimestampColumnVector(TimestampColumnVector vector, int index) { - return vector.time[index] * 1000L + vector.nanos[index] / 1000L; + return vector.time[index] * 1000 + (vector.nanos[index] / 1000 % 1000); } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala index 523f7cf77e103..9a20e271c11e8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution.datasources.orc import java.io.File +import java.sql.Timestamp import java.util.Locale import org.apache.orc.OrcConf.COMPRESS @@ -169,6 +170,14 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll { } } } + + test("SPARK-24322 Fix incorrect workaround for bug in java.sql.Timestamp") { + withTempPath { path => + val ts = Timestamp.valueOf("1900-05-05 12:34:56.000789") + Seq(ts).toDF.write.orc(path.getCanonicalPath) + checkAnswer(spark.read.orc(path.getCanonicalPath), Row(ts)) + } + } } class OrcSourceSuite extends OrcSuite with SharedSQLContext { From 068c4ae3437981824b65d56efb7889232d5a3fb7 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Thu, 24 May 2018 13:21:02 +0800 Subject: [PATCH 467/566] [SPARK-24364][SS] Prevent InMemoryFileIndex from failing if file path doesn't exist ## What changes were proposed in this pull request? This PR proposes to follow up https://github.com/apache/spark/pull/15153 and complete SPARK-17599. `FileSystem` operation (`fs.getFileBlockLocations`) can still fail if the file path does not exist. For example see the exception message below: ``` Error occurred while processing: File does not exist: /rel/00171151/input/PJ/part-00136-b6403bac-a240-44f8-a792-fc2e174682b7-c000.csv ... java.io.FileNotFoundException: File does not exist: /rel/00171151/input/PJ/part-00136-b6403bac-a240-44f8-a792-fc2e174682b7-c000.csv ... 
org.apache.hadoop.hdfs.DistributedFileSystem.getFileBlockLocations(DistributedFileSystem.java:249) at org.apache.hadoop.hdfs.DistributedFileSystem.getFileBlockLocations(DistributedFileSystem.java:229) at org.apache.spark.sql.execution.datasources.InMemoryFileIndex$$anonfun$org$apache$spark$sql$execution$datasources$InMemoryFileIndex$$listLeafFiles$3.apply(InMemoryFileIndex.scala:314) at org.apache.spark.sql.execution.datasources.InMemoryFileIndex$$anonfun$org$apache$spark$sql$execution$datasources$InMemoryFileIndex$$listLeafFiles$3.apply(InMemoryFileIndex.scala:297) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186) at scala.collection.TraversableLike$class.map(TraversableLike.scala:234) at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186) at org.apache.spark.sql.execution.datasources.InMemoryFileIndex$.org$apache$spark$sql$execution$datasources$InMemoryFileIndex$$listLeafFiles(InMemoryFileIndex.scala:297) at org.apache.spark.sql.execution.datasources.InMemoryFileIndex$$anonfun$org$apache$spark$sql$execution$datasources$InMemoryFileIndex$$bulkListLeafFiles$1.apply(InMemoryFileIndex.scala:174) at org.apache.spark.sql.execution.datasources.InMemoryFileIndex$$anonfun$org$apache$spark$sql$execution$datasources$InMemoryFileIndex$$bulkListLeafFiles$1.apply(InMemoryFileIndex.scala:173) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) at scala.collection.TraversableLike$class.map(TraversableLike.scala:234) at scala.collection.AbstractTraversable.map(Traversable.scala:104) at org.apache.spark.sql.execution.datasources.InMemoryFileIndex$.org$apache$spark$sql$execution$datasources$InMemoryFileIndex$$bulkListLeafFiles(InMemoryFileIndex.scala:173) at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.listLeafFiles(InMemoryFileIndex.scala:126) at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.refresh0(InMemoryFileIndex.scala:91) at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.(InMemoryFileIndex.scala:67) at org.apache.spark.sql.execution.datasources.DataSource.tempFileIndex$lzycompute$1(DataSource.scala:161) at org.apache.spark.sql.execution.datasources.DataSource.org$apache$spark$sql$execution$datasources$DataSource$$tempFileIndex$1(DataSource.scala:152) at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:166) at org.apache.spark.sql.execution.datasources.DataSource.sourceSchema(DataSource.scala:261) at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo$lzycompute(DataSource.scala:94) at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo(DataSource.scala:94) at org.apache.spark.sql.execution.streaming.StreamingRelation$.apply(StreamingRelation.scala:33) at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:196) at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:206) at com.hwx.StreamTest$.main(StreamTest.scala:97) at com.hwx.StreamTest.main(StreamTest.scala) at 
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52) at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:906) at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:197) at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:227) at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:136) at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) Caused by: org.apache.hadoop.ipc.RemoteException(java.io.FileNotFoundException): File does not exist: /rel/00171151/input/PJ/part-00136-b6403bac-a240-44f8-a792-fc2e174682b7-c000.csv ... ``` So, it fixes it to make a warning instead. ## How was this patch tested? It's hard to write a test. Manually tested multiple times. Author: hyukjinkwon Closes #21408 from HyukjinKwon/missing-files. (cherry picked from commit 8a545822d0cc3a866ef91a94e58ea5c8b1014007) Signed-off-by: hyukjinkwon --- .../datasources/InMemoryFileIndex.scala | 32 ++++++++++++++----- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala index 318ada0ceefc5..4925831743465 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala @@ -294,9 +294,12 @@ object InMemoryFileIndex extends Logging { if (filter != null) allFiles.filter(f => filter.accept(f.getPath)) else allFiles } - allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map { + val missingFiles = mutable.ArrayBuffer.empty[String] + val filteredLeafStatuses = allLeafStatuses.filterNot( + status => shouldFilterOut(status.getPath.getName)) + val resolvedLeafStatuses = filteredLeafStatuses.flatMap { case f: LocatedFileStatus => - f + Some(f) // NOTE: // @@ -311,14 +314,27 @@ object InMemoryFileIndex extends Logging { // The other constructor of LocatedFileStatus will call FileStatus.getPermission(), // which is very slow on some file system (RawLocalFileSystem, which is launch a // subprocess and parse the stdout). - val locations = fs.getFileBlockLocations(f, 0, f.getLen) - val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize, - f.getModificationTime, 0, null, null, null, null, f.getPath, locations) - if (f.isSymlink) { - lfs.setSymlink(f.getSymlink) + try { + val locations = fs.getFileBlockLocations(f, 0, f.getLen) + val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize, + f.getModificationTime, 0, null, null, null, null, f.getPath, locations) + if (f.isSymlink) { + lfs.setSymlink(f.getSymlink) + } + Some(lfs) + } catch { + case _: FileNotFoundException => + missingFiles += f.getPath.toString + None } - lfs } + + if (missingFiles.nonEmpty) { + logWarning( + s"the following files were missing during file scan:\n ${missingFiles.mkString("\n ")}") + } + + resolvedLeafStatuses } /** Checks if we should filter out this path name. 
*/ From f48d62400a757470c44191b9e5581c10236fe976 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Thu, 24 May 2018 20:55:26 +0800 Subject: [PATCH 468/566] [SPARK-24230][SQL] Fix SpecificParquetRecordReaderBase with dictionary filters. ## What changes were proposed in this pull request? I missed this commit when preparing #21070. When Parquet is able to filter blocks with dictionary filtering, the expected total value count to be too high in Spark, leading to an error when there were fewer than expected row groups to process. Spark should get the row groups from Parquet to pick up new filter schemes in Parquet like dictionary filtering. ## How was this patch tested? Using in production at Netflix. Added test case for dictionary-filtered blocks. Author: Ryan Blue Closes #21295 from rdblue/SPARK-24230-fix-parquet-block-tracking. (cherry picked from commit 3469f5c989e686866051382a3a28b2265619cab9) Signed-off-by: Wenchen Fan --- .../parquet/SpecificParquetRecordReaderBase.java | 6 ++++-- .../datasources/parquet/ParquetQuerySuite.scala | 12 ++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java index e65cd252c3ddf..95fe130a54326 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java @@ -147,7 +147,8 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptCont this.sparkSchema = StructType$.MODULE$.fromString(sparkRequestedSchemaString); this.reader = new ParquetFileReader( configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns()); - for (BlockMetaData block : blocks) { + // use the blocks from the reader in case some do not match filters and will not be read + for (BlockMetaData block : reader.getRowGroups()) { this.totalRowCount += block.getRowCount(); } @@ -225,7 +226,8 @@ protected void initialize(String path, List columns) throws IOException this.sparkSchema = new ParquetToSparkSchemaConverter(config).convert(requestedSchema); this.reader = new ParquetFileReader( config, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns()); - for (BlockMetaData block : blocks) { + // use the blocks from the reader in case some do not match filters and will not be read + for (BlockMetaData block : reader.getRowGroups()) { this.totalRowCount += block.getRowCount(); } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index 55b0f729be8ce..5b680bf3e2138 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -879,6 +879,18 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext } } } + + test("SPARK-24230: filter row group using dictionary") { + withSQLConf(("parquet.filter.dictionary.enabled", "true")) { + // create a table with values from 0, 2, ..., 18 that will be dictionary-encoded + withParquetTable((0 until 100).map(i => ((i * 2) % 20, s"data-$i")), "t") { + // search 
for a key that is not present so the dictionary filter eliminates all row groups + // Fails without SPARK-24230: + // java.io.IOException: expecting more rows but reached last block. Read 0 out of 50 + checkAnswer(sql("SELECT _2 FROM t WHERE t._1 = 5"), Seq.empty) + } + } + } } object TestingUDT { From d0f30e3f36f50abfc18654e379fd03c1360c4fd6 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Thu, 24 May 2018 23:38:50 +0800 Subject: [PATCH 469/566] [SPARK-24378][SQL] Fix date_trunc function incorrect examples ## What changes were proposed in this pull request? Fix `date_trunc` function incorrect examples. ## How was this patch tested? N/A Author: Yuming Wang Closes #21423 from wangyum/SPARK-24378. --- .../expressions/datetimeExpressions.scala | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 424871f2047e9..7859cd83e7cf8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -1434,14 +1434,14 @@ case class TruncDate(date: Expression, format: Expression) """, examples = """ Examples: - > SELECT _FUNC_('2015-03-05T09:32:05.359', 'YEAR'); - 2015-01-01T00:00:00 - > SELECT _FUNC_('2015-03-05T09:32:05.359', 'MM'); - 2015-03-01T00:00:00 - > SELECT _FUNC_('2015-03-05T09:32:05.359', 'DD'); - 2015-03-05T00:00:00 - > SELECT _FUNC_('2015-03-05T09:32:05.359', 'HOUR'); - 2015-03-05T09:00:00 + > SELECT _FUNC_('YEAR', '2015-03-05T09:32:05.359'); + 2015-01-01 00:00:00 + > SELECT _FUNC_('MM', '2015-03-05T09:32:05.359'); + 2015-03-01 00:00:00 + > SELECT _FUNC_('DD', '2015-03-05T09:32:05.359'); + 2015-03-05 00:00:00 + > SELECT _FUNC_('HOUR', '2015-03-05T09:32:05.359'); + 2015-03-05 09:00:00 """, since = "2.3.0") // scalastyle:on line.size.limit From 54aeae7ba7c2bdf2badd98c3679e8978c2c38d47 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 25 May 2018 12:49:06 -0700 Subject: [PATCH 470/566] [MINOR] Add port SSL config in toString and scaladoc ## What changes were proposed in this pull request? SPARK-17874 introduced a new configuration to set the port where SSL services bind to. We missed to update the scaladoc and the `toString` method, though. The PR adds it in the missing places ## How was this patch tested? checked the `toString` output in the logs Author: Marco Gaido Closes #21429 from mgaido91/minor_ssl. (cherry picked from commit fd315f5884c03c6dd21abca178897584dee83f1a) Signed-off-by: Marcelo Vanzin --- core/src/main/scala/org/apache/spark/SSLOptions.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SSLOptions.scala b/core/src/main/scala/org/apache/spark/SSLOptions.scala index 477b01968c6ef..04c38f12acc78 100644 --- a/core/src/main/scala/org/apache/spark/SSLOptions.scala +++ b/core/src/main/scala/org/apache/spark/SSLOptions.scala @@ -128,7 +128,7 @@ private[spark] case class SSLOptions( } /** Returns a string representation of this SSLOptions with all the passwords masked. 
*/ - override def toString: String = s"SSLOptions{enabled=$enabled, " + + override def toString: String = s"SSLOptions{enabled=$enabled, port=$port, " + s"keyStore=$keyStore, keyStorePassword=${keyStorePassword.map(_ => "xxx")}, " + s"trustStore=$trustStore, trustStorePassword=${trustStorePassword.map(_ => "xxx")}, " + s"protocol=$protocol, enabledAlgorithms=$enabledAlgorithms}" @@ -142,6 +142,7 @@ private[spark] object SSLOptions extends Logging { * * The following settings are allowed: * $ - `[ns].enabled` - `true` or `false`, to enable or disable SSL respectively + * $ - `[ns].port` - the port where to bind the SSL server * $ - `[ns].keyStore` - a path to the key-store file; can be relative to the current directory * $ - `[ns].keyStorePassword` - a password to the key-store file * $ - `[ns].keyPassword` - a password to the private key From a06fc45baa5682410dd27edd19b9813b1ee62107 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Sat, 26 May 2018 20:26:00 +0800 Subject: [PATCH 471/566] [SPARK-19112][CORE][FOLLOW-UP] Add missing shortCompressionCodecNames to configuration. ## What changes were proposed in this pull request? Spark provides four codecs: `lz4`, `lzf`, `snappy`, and `zstd`. This pr add missing shortCompressionCodecNames to configuration. ## How was this patch tested? manually tested Author: Yuming Wang Closes #21431 from wangyum/SPARK-19112. (cherry picked from commit ed1a65448f228776afe2e5c6b1ac4228d2ed2854) Signed-off-by: hyukjinkwon --- docs/configuration.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index e7f2419cc2fa4..ec4f5d41c2d9d 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -912,8 +912,8 @@ Apart from these, the following properties are also available, and may be useful
    spark.io.compression.codec
    lz4 The codec used to compress internal data such as RDD partitions, event log, broadcast variables - and shuffle outputs. By default, Spark provides three codecs: lz4, lzf, - and snappy. You can also use fully qualified class names to specify the codec, + and shuffle outputs. By default, Spark provides four codecs: lz4, lzf, + snappy, and zstd. You can also use fully qualified class names to specify the codec, e.g. org.apache.spark.io.LZ4CompressionCodec, org.apache.spark.io.LZFCompressionCodec, From 9b0f6f530dcc86c0865744f0d51c9577b12aa216 Mon Sep 17 00:00:00 2001 From: Li Jin Date: Mon, 28 May 2018 10:50:17 +0800 Subject: [PATCH 472/566] [SPARK-24334] Fix race condition in ArrowPythonRunner causes unclean shutdown of Arrow memory allocator ## What changes were proposed in this pull request? There is a race condition of closing Arrow VectorSchemaRoot and Allocator in the writer thread of ArrowPythonRunner. The race results in memory leak exception when closing the allocator. This patch removes the closing routine from the TaskCompletionListener and make the writer thread responsible for cleaning up the Arrow memory. This issue be reproduced by this test: ``` def test_memory_leak(self): from pyspark.sql.functions import pandas_udf, col, PandasUDFType, array, lit, explode # Have all data in a single executor thread so it can trigger the race condition easier with self.sql_conf({'spark.sql.shuffle.partitions': 1}): df = self.spark.range(0, 1000) df = df.withColumn('id', array([lit(i) for i in range(0, 300)])) \ .withColumn('id', explode(col('id'))) \ .withColumn('v', array([lit(i) for i in range(0, 1000)])) pandas_udf(df.schema, PandasUDFType.GROUPED_MAP) def foo(pdf): xxx return pdf result = df.groupby('id').apply(foo) with QuietTest(self.sc): with self.assertRaises(py4j.protocol.Py4JJavaError) as context: result.count() self.assertTrue('Memory leaked' not in str(context.exception)) ``` Note: Because of the race condition, the test case cannot reproduce the issue reliably so it's not added to test cases. ## How was this patch tested? Because of the race condition, the bug cannot be unit test easily. So far it has only happens on large amount of data. This is currently tested manually. Author: Li Jin Closes #21397 from icexelloss/SPARK-24334-arrow-memory-leak. 
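To make the ownership argument concrete, the sketch below shows the cleanup pattern this fix adopts: the thread that creates a resource is also the only thread that closes it, inside its own `finally` block. `DummyAllocator` is a hypothetical stand-in for Arrow's allocator and root, not Spark or Arrow code.

```scala
// Minimal sketch: close resources on the thread that owns them, rather than in a
// completion listener that may run concurrently with in-flight writes.
object OwnerClosesResource {
  final class DummyAllocator extends AutoCloseable {
    @volatile private var closed = false
    def write(batch: Int): Unit = {
      if (closed) throw new IllegalStateException("write after close (the race being removed)")
    }
    override def close(): Unit = closed = true
  }

  def main(args: Array[String]): Unit = {
    val writer = new Thread(new Runnable {
      override def run(): Unit = {
        val allocator = new DummyAllocator   // owned exclusively by the writer thread
        try {
          (1 to 5).foreach(allocator.write)  // all writes happen on the owning thread
        } finally {
          allocator.close()                  // closed by its owner, so no concurrent close
        }
      }
    })
    writer.start()
    writer.join()
  }
}
```

This mirrors why the diff below moves `root.close()` and `allocator.close()` out of the `TaskCompletionListener` and into the writer's `finally` block.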
(cherry picked from commit 672209f2909a95e891f3c779bfb2f0e534239851) Signed-off-by: hyukjinkwon --- .../execution/python/ArrowPythonRunner.scala | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala index 5fcdcddca7d51..01e19bddbfb66 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala @@ -70,19 +70,13 @@ class ArrowPythonRunner( val arrowSchema = ArrowUtils.toArrowSchema(schema, timeZoneId) val allocator = ArrowUtils.rootAllocator.newChildAllocator( s"stdout writer for $pythonExec", 0, Long.MaxValue) - val root = VectorSchemaRoot.create(arrowSchema, allocator) - val arrowWriter = ArrowWriter.create(root) - - context.addTaskCompletionListener { _ => - root.close() - allocator.close() - } - - val writer = new ArrowStreamWriter(root, null, dataOut) - writer.start() Utils.tryWithSafeFinally { + val arrowWriter = ArrowWriter.create(root) + val writer = new ArrowStreamWriter(root, null, dataOut) + writer.start() + while (inputIterator.hasNext) { val nextBatch = inputIterator.next() @@ -94,8 +88,21 @@ class ArrowPythonRunner( writer.writeBatch() arrowWriter.reset() } - } { + // end writes footer to the output stream and doesn't clean any resources. + // It could throw exception if the output stream is closed, so it should be + // in the try block. writer.end() + } { + // If we close root and allocator in TaskCompletionListener, there could be a race + // condition where the writer thread keeps writing to the VectorSchemaRoot while + // it's being closed by the TaskCompletion listener. + // Closing root and allocator here is cleaner because root and allocator is owned + // by the writer thread and is only visible to the writer thread. + // + // If the writer thread is interrupted by TaskCompletionListener, it should either + // (1) in the try block, in which case it will get an InterruptedException when + // performing io, and goes into the finally block or (2) in the finally block, + // in which case it will ignore the interruption and close the resources. root.close() allocator.close() } From 8bb6c2285c6017f28d8c94f4030df518f6d3048d Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 28 May 2018 12:56:05 +0800 Subject: [PATCH 473/566] [SPARK-24392][PYTHON] Label pandas_udf as Experimental The pandas_udf functionality was introduced in 2.3.0, but is not completely stable and still evolving. This adds a label to indicate it is still an experimental API. NA Author: Bryan Cutler Closes #21435 from BryanCutler/arrow-pandas_udf-experimental-SPARK-24392. 
(cherry picked from commit fa2ae9d2019f839647d17932d8fea769e7622777) Signed-off-by: hyukjinkwon --- docs/sql-programming-guide.md | 4 ++++ python/pyspark/sql/dataframe.py | 2 ++ python/pyspark/sql/functions.py | 2 ++ python/pyspark/sql/group.py | 2 ++ python/pyspark/sql/session.py | 2 ++ 5 files changed, 12 insertions(+) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 651e4408aecb5..14bc5e626771c 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1797,6 +1797,10 @@ working with timestamps in `pandas_udf`s to get the best performance, see # Migration Guide +## Upgrading From Spark SQL 2.3.0 to 2.3.1 and above + + - As of version 2.3.1 Arrow functionality, including `pandas_udf` and `toPandas()`/`createDataFrame()` with `spark.sql.execution.arrow.enabled` set to `True`, has been marked as experimental. These are still evolving and not currently recommended for use in production. + ## Upgrading From Spark SQL 2.2 to 2.3 - Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the referenced columns only include the internal corrupt record column (named `_corrupt_record` by default). For example, `spark.read.schema(schema).json(file).filter($"_corrupt_record".isNotNull).count()` and `spark.read.schema(schema).json(file).select("_corrupt_record").show()`. Instead, you can cache or save the parsed results and then send the same query. For example, `val df = spark.read.schema(schema).json(file).cache()` and then `df.filter($"_corrupt_record".isNotNull).count()`. diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 9bb0dca95788c..d416b3be08b43 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1924,6 +1924,8 @@ def toPandas(self): .. note:: This method should only be used if the resulting Pandas's DataFrame is expected to be small, as all the data is loaded into the driver's memory. + .. note:: Usage with spark.sql.execution.arrow.enabled=True is experimental. + >>> df.toPandas() # doctest: +SKIP age name 0 2 Alice diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 365be7be3aaa1..cf26523b3cb45 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2172,6 +2172,8 @@ def pandas_udf(f=None, returnType=None, functionType=None): :param functionType: an enum value in :class:`pyspark.sql.functions.PandasUDFType`. Default: SCALAR. + .. note:: Experimental + The function type of the UDF can be one of the following: 1. SCALAR diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py index 330faf2addcbe..bc6c094fa0ce2 100644 --- a/python/pyspark/sql/group.py +++ b/python/pyspark/sql/group.py @@ -212,6 +212,8 @@ def apply(self, udf): This function does not support partial aggregation, and requires shuffling all the data in the :class:`DataFrame`. + .. note:: Experimental + :param udf: a grouped map user-defined function returned by :func:`pyspark.sql.functions.pandas_udf`. diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 2ac2ec2fe6328..a459cb5f25ee5 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -578,6 +578,8 @@ def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=Tr .. versionchanged:: 2.1 Added verifySchema. + .. note:: Usage with spark.sql.execution.arrow.enabled=True is experimental. 
+ >>> l = [('Alice', 1)] >>> spark.createDataFrame(l).collect() [Row(_1=u'Alice', _2=1)] From a9700cb4a622b5dd2fd379292f293ad4a603e87d Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Mon, 28 May 2018 12:09:44 +0800 Subject: [PATCH 474/566] [SPARK-24373][SQL] Add AnalysisBarrier to RelationalGroupedDataset's and KeyValueGroupedDataset's child When we create a `RelationalGroupedDataset` or a `KeyValueGroupedDataset` we set its child to the `logicalPlan` of the `DataFrame` we need to aggregate. Since the `logicalPlan` is already analyzed, we should not analyze it again. But this happens when the new plan of the aggregate is analyzed. The current behavior in most of the cases is likely to produce no harm, but in other cases re-analyzing an analyzed plan can change it, since the analysis is not idempotent. This can cause issues like the one described in the JIRA (missing to find a cached plan). The PR adds an `AnalysisBarrier` to the `logicalPlan` which is used as child of `RelationalGroupedDataset` or a `KeyValueGroupedDataset`. added UT Author: Marco Gaido Closes #21432 from mgaido91/SPARK-24373. (cherry picked from commit de01a8d50c9c3e196591db057d544f5d7b24d95f) Signed-off-by: Wenchen Fan --- .../scala/org/apache/spark/sql/Dataset.scala | 2 +- .../spark/sql/KeyValueGroupedDataset.scala | 2 +- .../spark/sql/RelationalGroupedDataset.scala | 12 +-- .../spark/sql/GroupedDatasetSuite.scala | 96 +++++++++++++++++++ 4 files changed, 104 insertions(+), 8 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/GroupedDatasetSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 0058e6a25d911..21c92506692e1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -196,7 +196,7 @@ class Dataset[T] private[sql]( } // Wraps analyzed logical plans with an analysis barrier so we won't traverse/resolve it again. 
- @transient private val planWithBarrier = AnalysisBarrier(logicalPlan) + @transient private[sql] val planWithBarrier = AnalysisBarrier(logicalPlan) /** * Currently [[ExpressionEncoder]] is the only implementation of [[Encoder]], here we turn the diff --git a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala index 6bab21dca0cbd..36f6038aa9485 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala @@ -49,7 +49,7 @@ class KeyValueGroupedDataset[K, V] private[sql]( private implicit val kExprEnc = encoderFor(kEncoder) private implicit val vExprEnc = encoderFor(vEncoder) - private def logicalPlan = queryExecution.analyzed + private def logicalPlan = AnalysisBarrier(queryExecution.analyzed) private def sparkSession = queryExecution.sparkSession /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index 4e0785a19de34..f79c0da85a73e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -64,17 +64,17 @@ class RelationalGroupedDataset protected[sql]( groupType match { case RelationalGroupedDataset.GroupByType => Dataset.ofRows( - df.sparkSession, Aggregate(groupingExprs, aliasedAgg, df.logicalPlan)) + df.sparkSession, Aggregate(groupingExprs, aliasedAgg, df.planWithBarrier)) case RelationalGroupedDataset.RollupType => Dataset.ofRows( - df.sparkSession, Aggregate(Seq(Rollup(groupingExprs)), aliasedAgg, df.logicalPlan)) + df.sparkSession, Aggregate(Seq(Rollup(groupingExprs)), aliasedAgg, df.planWithBarrier)) case RelationalGroupedDataset.CubeType => Dataset.ofRows( - df.sparkSession, Aggregate(Seq(Cube(groupingExprs)), aliasedAgg, df.logicalPlan)) + df.sparkSession, Aggregate(Seq(Cube(groupingExprs)), aliasedAgg, df.planWithBarrier)) case RelationalGroupedDataset.PivotType(pivotCol, values) => val aliasedGrps = groupingExprs.map(alias) Dataset.ofRows( - df.sparkSession, Pivot(aliasedGrps, pivotCol, values, aggExprs, df.logicalPlan)) + df.sparkSession, Pivot(aliasedGrps, pivotCol, values, aggExprs, df.planWithBarrier)) } } @@ -434,7 +434,7 @@ class RelationalGroupedDataset protected[sql]( df.exprEnc.schema, groupingAttributes, df.logicalPlan.output, - df.logicalPlan)) + df.planWithBarrier)) } /** @@ -460,7 +460,7 @@ class RelationalGroupedDataset protected[sql]( case other => Alias(other, other.toString)() } val groupingAttributes = groupingNamedExpressions.map(_.toAttribute) - val child = df.logicalPlan + val child = df.planWithBarrier val project = Project(groupingNamedExpressions ++ child.output, child) val output = expr.dataType.asInstanceOf[StructType].toAttributes val plan = FlatMapGroupsInPandas(groupingAttributes, expr, output, project) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/GroupedDatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/GroupedDatasetSuite.scala new file mode 100644 index 0000000000000..218a1b7248f12 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/GroupedDatasetSuite.scala @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.api.python.PythonEvalType +import org.apache.spark.sql.catalyst.plans.logical.AnalysisBarrier +import org.apache.spark.sql.execution.python.PythonUDF +import org.apache.spark.sql.functions.udf +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.types.{LongType, StructField, StructType} + +class GroupedDatasetSuite extends QueryTest with SharedSQLContext { + import testImplicits._ + + private val scalaUDF = udf((x: Long) => { x + 1 }) + private lazy val datasetWithUDF = spark.range(1).toDF("s").select($"s", scalaUDF($"s")) + + private def assertContainsAnalysisBarrier(ds: Dataset[_], atLevel: Int = 1): Unit = { + assert(atLevel >= 0) + var children = Seq(ds.queryExecution.logical) + (1 to atLevel).foreach { _ => + children = children.flatMap(_.children) + } + val barriers = children.collect { + case ab: AnalysisBarrier => ab + } + assert(barriers.nonEmpty, s"Plan does not contain AnalysisBarrier at level $atLevel:\n" + + ds.queryExecution.logical) + } + + test("SPARK-24373: avoid running Analyzer rules twice on RelationalGroupedDataset") { + val groupByDataset = datasetWithUDF.groupBy() + val rollupDataset = datasetWithUDF.rollup("s") + val cubeDataset = datasetWithUDF.cube("s") + val pivotDataset = datasetWithUDF.groupBy().pivot("s", Seq(1, 2)) + datasetWithUDF.cache() + Seq(groupByDataset, rollupDataset, cubeDataset, pivotDataset).foreach { rgDS => + val df = rgDS.count() + assertContainsAnalysisBarrier(df) + assertCached(df) + } + + val flatMapGroupsInRDF = datasetWithUDF.groupBy().flatMapGroupsInR( + Array.emptyByteArray, + Array.emptyByteArray, + Array.empty, + StructType(Seq(StructField("s", LongType)))) + val flatMapGroupsInPandasDF = datasetWithUDF.groupBy().flatMapGroupsInPandas(PythonUDF( + "pyUDF", + null, + StructType(Seq(StructField("s", LongType))), + Seq.empty, + PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, + true)) + Seq(flatMapGroupsInRDF, flatMapGroupsInPandasDF).foreach { df => + assertContainsAnalysisBarrier(df, 2) + assertCached(df) + } + datasetWithUDF.unpersist(true) + } + + test("SPARK-24373: avoid running Analyzer rules twice on KeyValueGroupedDataset") { + val kvDasaset = datasetWithUDF.groupByKey(_.getLong(0)) + datasetWithUDF.cache() + val mapValuesKVDataset = kvDasaset.mapValues(_.getLong(0)).reduceGroups(_ + _) + val keysKVDataset = kvDasaset.keys + val flatMapGroupsKVDataset = kvDasaset.flatMapGroups((k, _) => Seq(k)) + val aggKVDataset = kvDasaset.count() + val otherKVDataset = spark.range(1).groupByKey(_ + 1) + val cogroupKVDataset = kvDasaset.cogroup(otherKVDataset)((k, _, _) => Seq(k)) + Seq((mapValuesKVDataset, 1), + (keysKVDataset, 2), + (flatMapGroupsKVDataset, 2), + (aggKVDataset, 1), + (cogroupKVDataset, 2)).foreach { case (df, analysisBarrierDepth) => + 
assertContainsAnalysisBarrier(df, analysisBarrierDepth) + assertCached(df) + } + datasetWithUDF.unpersist(true) + } +} From fec43fe1bcd7553eff077805a13bb301445c62a3 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 29 May 2018 10:35:30 +0800 Subject: [PATCH 475/566] [SPARK-19613][SS][TEST] Random.nextString is not safe for directory namePrefix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? `Random.nextString` is good for generating random string data, but it's not proper for directory name prefix in `Utils.createDirectory(tempDir, Random.nextString(10))`. This PR uses more safe directory namePrefix. ```scala scala> scala.util.Random.nextString(10) res0: String = 馨쭔ᎰႻ穚䃈兩㻞藑並 ``` ```scala StateStoreRDDSuite: - versioning and immutability - recovering from files - usage with iterators - only gets and only puts - preferred locations using StateStoreCoordinator *** FAILED *** java.io.IOException: Failed to create a temp directory (under /.../spark/sql/core/target/tmp/StateStoreRDDSuite8712796397908632676) after 10 attempts! at org.apache.spark.util.Utils$.createDirectory(Utils.scala:295) at org.apache.spark.sql.execution.streaming.state.StateStoreRDDSuite$$anonfun$13$$anonfun$apply$6.apply(StateStoreRDDSuite.scala:152) at org.apache.spark.sql.execution.streaming.state.StateStoreRDDSuite$$anonfun$13$$anonfun$apply$6.apply(StateStoreRDDSuite.scala:149) at org.apache.spark.sql.catalyst.util.package$.quietly(package.scala:42) at org.apache.spark.sql.execution.streaming.state.StateStoreRDDSuite$$anonfun$13.apply(StateStoreRDDSuite.scala:149) at org.apache.spark.sql.execution.streaming.state.StateStoreRDDSuite$$anonfun$13.apply(StateStoreRDDSuite.scala:149) ... - distributed test *** FAILED *** java.io.IOException: Failed to create a temp directory (under /.../spark/sql/core/target/tmp/StateStoreRDDSuite8712796397908632676) after 10 attempts! at org.apache.spark.util.Utils$.createDirectory(Utils.scala:295) ``` ## How was this patch tested? Pass the existing tests.StateStoreRDDSuite: Author: Dongjoon Hyun Closes #21446 from dongjoon-hyun/SPARK-19613. 
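As a standalone illustration (not Spark code), the sketch below contrasts the two prefixes; the only assumption is that the default temporary directory accepts ordinary numeric names.

```scala
import java.nio.file.Files
import scala.util.Random

// Random.nextString draws arbitrary Unicode code points, including surrogates and
// control characters that a file system may refuse in a directory name, while a
// numeric prefix such as Random.nextFloat.toString is always a legal name.
object SafeTempDirPrefix {
  def main(args: Array[String]): Unit = {
    val unsafePrefix = Random.nextString(10)
    val safePrefix = Random.nextFloat.toString             // e.g. "0.7263918"
    println(s"unsafe code points: ${unsafePrefix.map(_.toInt).mkString(" ")}")

    val dir = Files.createTempDirectory(safePrefix)        // a safe prefix for createTempDirectory
    println(s"created $dir")
    Files.delete(dir)
  }
}
```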
(cherry picked from commit b31b587cd091010337378cf448fd598c37757053) Signed-off-by: hyukjinkwon --- .../execution/streaming/state/StateStoreRDDSuite.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDDSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDDSuite.scala index 65b39f0fbd73d..579a364ebc3e5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDDSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDDSuite.scala @@ -55,7 +55,7 @@ class StateStoreRDDSuite extends SparkFunSuite with BeforeAndAfter with BeforeAn test("versioning and immutability") { withSparkSession(SparkSession.builder.config(sparkConf).getOrCreate()) { spark => - val path = Utils.createDirectory(tempDir, Random.nextString(10)).toString + val path = Utils.createDirectory(tempDir, Random.nextFloat.toString).toString val rdd1 = makeRDD(spark.sparkContext, Seq("a", "b", "a")).mapPartitionsWithStateStore( spark.sqlContext, operatorStateInfo(path, version = 0), keySchema, valueSchema, None)( increment) @@ -73,7 +73,7 @@ class StateStoreRDDSuite extends SparkFunSuite with BeforeAndAfter with BeforeAn } test("recovering from files") { - val path = Utils.createDirectory(tempDir, Random.nextString(10)).toString + val path = Utils.createDirectory(tempDir, Random.nextFloat.toString).toString def makeStoreRDD( spark: SparkSession, @@ -101,7 +101,7 @@ class StateStoreRDDSuite extends SparkFunSuite with BeforeAndAfter with BeforeAn test("usage with iterators - only gets and only puts") { withSparkSession(SparkSession.builder.config(sparkConf).getOrCreate()) { spark => implicit val sqlContext = spark.sqlContext - val path = Utils.createDirectory(tempDir, Random.nextString(10)).toString + val path = Utils.createDirectory(tempDir, Random.nextFloat.toString).toString val opId = 0 // Returns an iterator of the incremented value made into the store @@ -149,7 +149,7 @@ class StateStoreRDDSuite extends SparkFunSuite with BeforeAndAfter with BeforeAn quietly { val queryRunId = UUID.randomUUID val opId = 0 - val path = Utils.createDirectory(tempDir, Random.nextString(10)).toString + val path = Utils.createDirectory(tempDir, Random.nextFloat.toString).toString withSparkSession(SparkSession.builder.config(sparkConf).getOrCreate()) { spark => implicit val sqlContext = spark.sqlContext @@ -189,7 +189,7 @@ class StateStoreRDDSuite extends SparkFunSuite with BeforeAndAfter with BeforeAn .config(sparkConf.setMaster("local-cluster[2, 1, 1024]")) .getOrCreate()) { spark => implicit val sqlContext = spark.sqlContext - val path = Utils.createDirectory(tempDir, Random.nextString(10)).toString + val path = Utils.createDirectory(tempDir, Random.nextFloat.toString).toString val opId = 0 val rdd1 = makeRDD(spark.sparkContext, Seq("a", "b", "a")).mapPartitionsWithStateStore( sqlContext, operatorStateInfo(path, version = 0), keySchema, valueSchema, None)(increment) From 49a6c2b915f37682781efba708a103e709c54cf7 Mon Sep 17 00:00:00 2001 From: Gabor Somogyi Date: Tue, 29 May 2018 20:10:59 +0800 Subject: [PATCH 476/566] [SPARK-23991][DSTREAMS] Fix data loss when WAL write fails in allocateBlocksToBatch When blocks tried to get allocated to a batch and WAL write fails then the blocks will be removed from the received block queue. 
This fact simply produces data loss because the next allocation will not find the mentioned blocks in the queue. In this PR blocks will be removed from the received queue only if WAL write succeded. Additional unit test. Author: Gabor Somogyi Closes #21430 from gaborgsomogyi/SPARK-23991. Change-Id: I5ead84f0233f0c95e6d9f2854ac2ff6906f6b341 (cherry picked from commit aca65c63cb12073eb193fe08998994c60acb8b58) Signed-off-by: jerryshao --- .../scheduler/ReceivedBlockTracker.scala | 3 +- .../streaming/ReceivedBlockTrackerSuite.scala | 47 ++++++++++++++++++- 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala index dacff69d55dd2..cf4324578ea87 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala @@ -112,10 +112,11 @@ private[streaming] class ReceivedBlockTracker( def allocateBlocksToBatch(batchTime: Time): Unit = synchronized { if (lastAllocatedBatchTime == null || batchTime > lastAllocatedBatchTime) { val streamIdToBlocks = streamIds.map { streamId => - (streamId, getReceivedBlockQueue(streamId).dequeueAll(x => true)) + (streamId, getReceivedBlockQueue(streamId).clone()) }.toMap val allocatedBlocks = AllocatedBlocks(streamIdToBlocks) if (writeToLog(BatchAllocationEvent(batchTime, allocatedBlocks))) { + streamIds.foreach(getReceivedBlockQueue(_).clear()) timeToAllocatedBlocks.put(batchTime, allocatedBlocks) lastAllocatedBatchTime = batchTime } else { diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala index 4fa236bd39663..fd7e00b1de25f 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala @@ -26,10 +26,12 @@ import scala.language.{implicitConversions, postfixOps} import scala.util.Random import org.apache.hadoop.conf.Configuration +import org.mockito.Matchers.any +import org.mockito.Mockito.{doThrow, reset, spy} import org.scalatest.{BeforeAndAfter, Matchers} import org.scalatest.concurrent.Eventually._ -import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} +import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.internal.Logging import org.apache.spark.storage.StreamBlockId import org.apache.spark.streaming.receiver.BlockManagerBasedStoreResult @@ -115,6 +117,47 @@ class ReceivedBlockTrackerSuite tracker2.stop() } + test("block allocation to batch should not loose blocks from received queue") { + val tracker1 = spy(createTracker()) + tracker1.isWriteAheadLogEnabled should be (true) + tracker1.getUnallocatedBlocks(streamId) shouldEqual Seq.empty + + // Add blocks + val blockInfos = generateBlockInfos() + blockInfos.map(tracker1.addBlock) + tracker1.getUnallocatedBlocks(streamId) shouldEqual blockInfos + + // Try to allocate the blocks to a batch and verify that it's failing + // The blocks should stay in the received queue when WAL write failing + doThrow(new RuntimeException("Not able to write BatchAllocationEvent")) + .when(tracker1).writeToLog(any(classOf[BatchAllocationEvent])) + val errMsg = intercept[RuntimeException] { + tracker1.allocateBlocksToBatch(1) + } + 
assert(errMsg.getMessage === "Not able to write BatchAllocationEvent") + tracker1.getUnallocatedBlocks(streamId) shouldEqual blockInfos + tracker1.getBlocksOfBatch(1) shouldEqual Map.empty + tracker1.getBlocksOfBatchAndStream(1, streamId) shouldEqual Seq.empty + + // Allocate the blocks to a batch and verify that all of them have been allocated + reset(tracker1) + tracker1.allocateBlocksToBatch(2) + tracker1.getUnallocatedBlocks(streamId) shouldEqual Seq.empty + tracker1.hasUnallocatedReceivedBlocks should be (false) + tracker1.getBlocksOfBatch(2) shouldEqual Map(streamId -> blockInfos) + tracker1.getBlocksOfBatchAndStream(2, streamId) shouldEqual blockInfos + + tracker1.stop() + + // Recover from WAL to see the correctness + val tracker2 = createTracker(recoverFromWriteAheadLog = true) + tracker2.getUnallocatedBlocks(streamId) shouldEqual Seq.empty + tracker2.hasUnallocatedReceivedBlocks should be (false) + tracker2.getBlocksOfBatch(2) shouldEqual Map(streamId -> blockInfos) + tracker2.getBlocksOfBatchAndStream(2, streamId) shouldEqual blockInfos + tracker2.stop() + } + test("recovery and cleanup with write ahead logs") { val manualClock = new ManualClock // Set the time increment level to twice the rotation interval so that every increment creates @@ -312,7 +355,7 @@ class ReceivedBlockTrackerSuite recoverFromWriteAheadLog: Boolean = false, clock: Clock = new SystemClock): ReceivedBlockTracker = { val cpDirOption = if (setCheckpointDir) Some(checkpointDirectory.toString) else None - val tracker = new ReceivedBlockTracker( + var tracker = new ReceivedBlockTracker( conf, hadoopConf, Seq(streamId), clock, recoverFromWriteAheadLog, cpDirOption) allReceivedBlockTrackers += tracker tracker From 66289a3e067769cb8ed35953187f6363463791e1 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Thu, 31 May 2018 00:23:25 +0800 Subject: [PATCH 477/566] [SPARK-24369][SQL] Correct handling for multiple distinct aggregations having the same argument set ## What changes were proposed in this pull request? This pr fixed an issue when having multiple distinct aggregations having the same argument set, e.g., ``` scala>: paste val df = sql( s"""SELECT corr(DISTINCT x, y), corr(DISTINCT y, x), count(*) | FROM (VALUES (1, 1), (2, 2), (2, 2)) t(x, y) """.stripMargin) java.lang.RuntimeException You hit a query analyzer bug. Please report your query to Spark user mailing list. ``` The root cause is that `RewriteDistinctAggregates` can't detect multiple distinct aggregations if they have the same argument set. This pr modified code so that `RewriteDistinctAggregates` could count the number of aggregate expressions with `isDistinct=true`. ## How was this patch tested? Added tests in `DataFrameAggregateSuite`. Author: Takeshi Yamamuro Closes #21443 from maropu/SPARK-24369. 
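The sketch below restates the detection problem with a simplified `AggExpr` case class standing in for Catalyst's `AggregateExpression`; it only demonstrates why keying on the argument set undercounts, it is not the rewriter itself.

```scala
// Simplified stand-in for Catalyst aggregate expressions, for illustration only.
case class AggExpr(fn: String, children: Seq[String], isDistinct: Boolean)

object DistinctAggDetection {
  def main(args: Array[String]): Unit = {
    val aggs = Seq(
      AggExpr("corr", Seq("x", "y"), isDistinct = true),
      AggExpr("corr", Seq("y", "x"), isDistinct = true),
      AggExpr("count", Seq("*"), isDistinct = false))

    val distinctAggs = aggs.filter(_.isDistinct)

    // Old check: group by the set of children, so corr(DISTINCT x, y) and
    // corr(DISTINCT y, x) collapse into a single group and the rewrite is skipped.
    val groupsByArgumentSet = distinctAggs.groupBy(_.children.toSet).size

    // Fixed check: count the distinct aggregate expressions themselves.
    val distinctAggCount = distinctAggs.size

    println(s"groups by argument set: $groupsByArgumentSet")   // 1
    println(s"distinct aggregates:    $distinctAggCount")      // 2, so the rewrite must run
  }
}
```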
(cherry picked from commit 1e46f92f956a00d04d47340489b6125d44dbd47b) Signed-off-by: Wenchen Fan --- .../optimizer/RewriteDistinctAggregates.scala | 7 ++++--- .../apache/spark/sql/execution/SparkStrategies.scala | 2 +- .../src/test/resources/sql-tests/inputs/group-by.sql | 6 +++++- .../test/resources/sql-tests/results/group-by.sql.out | 11 ++++++++++- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala index 4448ace7105a4..bc898ab0dc723 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala @@ -115,7 +115,8 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] { } // Extract distinct aggregate expressions. - val distinctAggGroups = aggExpressions.filter(_.isDistinct).groupBy { e => + val distincgAggExpressions = aggExpressions.filter(_.isDistinct) + val distinctAggGroups = distincgAggExpressions.groupBy { e => val unfoldableChildren = e.aggregateFunction.children.filter(!_.foldable).toSet if (unfoldableChildren.nonEmpty) { // Only expand the unfoldable children @@ -132,7 +133,7 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] { } // Aggregation strategy can handle queries with a single distinct group. - if (distinctAggGroups.size > 1) { + if (distincgAggExpressions.size > 1) { // Create the attributes for the grouping id and the group by clause. val gid = AttributeReference("gid", IntegerType, nullable = false)() val groupByMap = a.groupingExpressions.collect { @@ -151,7 +152,7 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] { } // Setup unique distinct aggregate children. - val distinctAggChildren = distinctAggGroups.keySet.flatten.toSeq.distinct + val distinctAggChildren = distinctAggGroups.keySet.flatten.toSeq val distinctAggChildAttrMap = distinctAggChildren.map(expressionAttributePair) val distinctAggChildAttrs = distinctAggChildAttrMap.map(_._2) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 25436e1c8dd45..9fed2f9bc255d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -374,7 +374,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { aggregateExpressions.partition(_.isDistinct) if (functionsWithDistinct.map(_.aggregateFunction.children).distinct.length > 1) { // This is a sanity check. We should not reach here when we have multiple distinct - // column sets. Our MultipleDistinctRewriter should take care this case. + // column sets. Our `RewriteDistinctAggregates` should take care this case. sys.error("You hit a query analyzer bug. 
Please report your query to " + "Spark user mailing list.") } diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql index c5070b734d521..2c18d6aaabdba 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql @@ -68,4 +68,8 @@ SELECT 1 from ( FROM (select 1 as x) a WHERE false ) b -where b.z != b.z +where b.z != b.z; + +-- SPARK-24369 multiple distinct aggregations having the same argument set +SELECT corr(DISTINCT x, y), corr(DISTINCT y, x), count(*) + FROM (VALUES (1, 1), (2, 2), (2, 2)) t(x, y); diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out index c1abc6dff754b..581aa1754ce14 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 26 +-- Number of queries: 27 -- !query 0 @@ -241,3 +241,12 @@ where b.z != b.z struct<1:int> -- !query 25 output + + +-- !query 26 +SELECT corr(DISTINCT x, y), corr(DISTINCT y, x), count(*) + FROM (VALUES (1, 1), (2, 2), (2, 2)) t(x, y) +-- !query 26 schema +struct +-- !query 26 output +1.0 1.0 3 From e1c0ab16c71f102bfd9f5133647d168e49ae06bc Mon Sep 17 00:00:00 2001 From: e-dorigatti Date: Thu, 31 May 2018 00:46:09 +0800 Subject: [PATCH 478/566] [SPARK-23754][BRANCH-2.3][PYTHON] Re-raising StopIteration in client code ## What changes are proposed Make sure that `StopIteration`s raised in users' code do not silently interrupt processing by spark, but are raised as exceptions to the users. The users' functions are wrapped in `safe_iter` (in `shuffle.py`), which re-raises `StopIteration`s as `RuntimeError`s ## How were the changes tested Unit tests, making sure that the exceptions are indeed raised. I am not sure how to check whether a `Py4JJavaError` contains my exception, so I simply looked for the exception message in the java exception's `toString`. Can you propose a better way? This is my original work, licensed in the same way as spark --- Author: e-dorigatti Closes #21383 from e-dorigatti/fix_spark_23754. (cherry picked from commit 0ebb0c0d4dd3e192464dc5e0e6f01efa55b945ed) Author: e-dorigatti Closes #21463 from e-dorigatti/branch-2.3. 
--- python/pyspark/rdd.py | 18 ++++++++++--- python/pyspark/shuffle.py | 7 ++--- python/pyspark/sql/tests.py | 16 +++++++++++ python/pyspark/sql/udf.py | 4 ++- python/pyspark/tests.py | 53 +++++++++++++++++++++++++++++++++++++ python/pyspark/util.py | 17 ++++++++++++ 6 files changed, 108 insertions(+), 7 deletions(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index acd0fa28958ed..8dc30a42f74e0 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -53,6 +53,7 @@ from pyspark.shuffle import Aggregator, ExternalMerger, \ get_used_memory, ExternalSorter, ExternalGroupBy from pyspark.traceback_utils import SCCallSiteSync +from pyspark.util import fail_on_stopiteration __all__ = ["RDD"] @@ -338,7 +339,7 @@ def map(self, f, preservesPartitioning=False): [('a', 1), ('b', 1), ('c', 1)] """ def func(_, iterator): - return map(f, iterator) + return map(fail_on_stopiteration(f), iterator) return self.mapPartitionsWithIndex(func, preservesPartitioning) def flatMap(self, f, preservesPartitioning=False): @@ -353,7 +354,7 @@ def flatMap(self, f, preservesPartitioning=False): [(2, 2), (2, 2), (3, 3), (3, 3), (4, 4), (4, 4)] """ def func(s, iterator): - return chain.from_iterable(map(f, iterator)) + return chain.from_iterable(map(fail_on_stopiteration(f), iterator)) return self.mapPartitionsWithIndex(func, preservesPartitioning) def mapPartitions(self, f, preservesPartitioning=False): @@ -416,7 +417,7 @@ def filter(self, f): [2, 4] """ def func(iterator): - return filter(f, iterator) + return filter(fail_on_stopiteration(f), iterator) return self.mapPartitions(func, True) def distinct(self, numPartitions=None): @@ -797,6 +798,8 @@ def foreach(self, f): >>> def f(x): print(x) >>> sc.parallelize([1, 2, 3, 4, 5]).foreach(f) """ + f = fail_on_stopiteration(f) + def processPartition(iterator): for x in iterator: f(x) @@ -846,6 +849,8 @@ def reduce(self, f): ... 
ValueError: Can not reduce() empty RDD """ + f = fail_on_stopiteration(f) + def func(iterator): iterator = iter(iterator) try: @@ -917,6 +922,8 @@ def fold(self, zeroValue, op): >>> sc.parallelize([1, 2, 3, 4, 5]).fold(0, add) 15 """ + op = fail_on_stopiteration(op) + def func(iterator): acc = zeroValue for obj in iterator: @@ -949,6 +956,9 @@ def aggregate(self, zeroValue, seqOp, combOp): >>> sc.parallelize([]).aggregate((0, 0), seqOp, combOp) (0, 0) """ + seqOp = fail_on_stopiteration(seqOp) + combOp = fail_on_stopiteration(combOp) + def func(iterator): acc = zeroValue for obj in iterator: @@ -1642,6 +1652,8 @@ def reduceByKeyLocally(self, func): >>> sorted(rdd.reduceByKeyLocally(add).items()) [('a', 2), ('b', 1)] """ + func = fail_on_stopiteration(func) + def reducePartition(iterator): m = {} for k, v in iterator: diff --git a/python/pyspark/shuffle.py b/python/pyspark/shuffle.py index e974cda9fc3e1..68f9df7e8b2a9 100644 --- a/python/pyspark/shuffle.py +++ b/python/pyspark/shuffle.py @@ -27,6 +27,7 @@ import pyspark.heapq3 as heapq from pyspark.serializers import BatchedSerializer, PickleSerializer, FlattenedValuesSerializer, \ CompressedSerializer, AutoBatchedSerializer +from pyspark.util import fail_on_stopiteration try: @@ -93,9 +94,9 @@ class Aggregator(object): """ def __init__(self, createCombiner, mergeValue, mergeCombiners): - self.createCombiner = createCombiner - self.mergeValue = mergeValue - self.mergeCombiners = mergeCombiners + self.createCombiner = fail_on_stopiteration(createCombiner) + self.mergeValue = fail_on_stopiteration(mergeValue) + self.mergeCombiners = fail_on_stopiteration(mergeCombiners) class SimpleAggregator(Aggregator): diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 7ff09afbb8e09..818ba833a1433 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -853,6 +853,22 @@ def __call__(self, x): self.assertEqual(f, f_.func) self.assertEqual(return_type, f_.returnType) + def test_stopiteration_in_udf(self): + # test for SPARK-23754 + from pyspark.sql.functions import udf + from py4j.protocol import Py4JJavaError + + def foo(x): + raise StopIteration() + + with self.assertRaises(Py4JJavaError) as cm: + self.spark.range(0, 1000).withColumn('v', udf(foo)('id')).show() + + self.assertIn( + "Caught StopIteration thrown from user's code; failing the task", + cm.exception.java_exception.toString() + ) + def test_validate_column_types(self): from pyspark.sql.functions import udf, to_json from pyspark.sql.column import _to_java_column diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 671e5680b8e7b..7d813af15cb6b 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -24,6 +24,7 @@ from pyspark.sql.column import Column, _to_java_column, _to_seq from pyspark.sql.types import StringType, DataType, StructType, _parse_datatype_string, \ to_arrow_type, to_arrow_schema +from pyspark.util import fail_on_stopiteration __all__ = ["UDFRegistration"] @@ -154,7 +155,8 @@ def _create_judf(self): spark = SparkSession.builder.getOrCreate() sc = spark.sparkContext - wrapped_func = _wrap_function(sc, self.func, self.returnType) + func = fail_on_stopiteration(self.func) + wrapped_func = _wrap_function(sc, func, self.returnType) jdt = spark._jsparkSession.parseDataType(self.returnType.json()) judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction( self._name, wrapped_func, jdt, self.evalType, self.deterministic) diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py 
index 18bbeb53d769a..af394504b1d7a 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -161,6 +161,37 @@ def gen_gs(N, step=1): self.assertEqual(k, len(vs)) self.assertEqual(list(range(k)), list(vs)) + def test_stopiteration_is_raised(self): + + def stopit(*args, **kwargs): + raise StopIteration() + + def legit_create_combiner(x): + return [x] + + def legit_merge_value(x, y): + return x.append(y) or x + + def legit_merge_combiners(x, y): + return x.extend(y) or x + + data = [(x % 2, x) for x in range(100)] + + # wrong create combiner + m = ExternalMerger(Aggregator(stopit, legit_merge_value, legit_merge_combiners), 20) + with self.assertRaises((Py4JJavaError, RuntimeError)) as cm: + m.mergeValues(data) + + # wrong merge value + m = ExternalMerger(Aggregator(legit_create_combiner, stopit, legit_merge_combiners), 20) + with self.assertRaises((Py4JJavaError, RuntimeError)) as cm: + m.mergeValues(data) + + # wrong merge combiners + m = ExternalMerger(Aggregator(legit_create_combiner, legit_merge_value, stopit), 20) + with self.assertRaises((Py4JJavaError, RuntimeError)) as cm: + m.mergeCombiners(map(lambda x_y1: (x_y1[0], [x_y1[1]]), data)) + class SorterTests(unittest.TestCase): def test_in_memory_sort(self): @@ -1239,6 +1270,28 @@ def test_pipe_functions(self): self.assertRaises(Py4JJavaError, rdd.pipe('grep 4', checkCode=True).collect) self.assertEqual([], rdd.pipe('grep 4').collect()) + def test_stopiteration_in_client_code(self): + + def stopit(*x): + raise StopIteration() + + seq_rdd = self.sc.parallelize(range(10)) + keyed_rdd = self.sc.parallelize((x % 2, x) for x in range(10)) + + self.assertRaises(Py4JJavaError, seq_rdd.map(stopit).collect) + self.assertRaises(Py4JJavaError, seq_rdd.filter(stopit).collect) + self.assertRaises(Py4JJavaError, seq_rdd.cartesian(seq_rdd).flatMap(stopit).collect) + self.assertRaises(Py4JJavaError, seq_rdd.foreach, stopit) + self.assertRaises(Py4JJavaError, keyed_rdd.reduceByKeyLocally, stopit) + self.assertRaises(Py4JJavaError, seq_rdd.reduce, stopit) + self.assertRaises(Py4JJavaError, seq_rdd.fold, 0, stopit) + + # the exception raised is non-deterministic + self.assertRaises((Py4JJavaError, RuntimeError), + seq_rdd.aggregate, 0, stopit, lambda *x: 1) + self.assertRaises((Py4JJavaError, RuntimeError), + seq_rdd.aggregate, 0, lambda *x: 1, stopit) + class ProfilerTests(PySparkTestCase): diff --git a/python/pyspark/util.py b/python/pyspark/util.py index ad4a0bc68ef41..83d528f0e31f8 100644 --- a/python/pyspark/util.py +++ b/python/pyspark/util.py @@ -45,6 +45,23 @@ def _exception_message(excp): return str(excp) +def fail_on_stopiteration(f): + """ + Wraps the input function to fail on 'StopIteration' by raising a 'RuntimeError' + prevents silent loss of data when 'f' is used in a for loop + """ + def wrapper(*args, **kwargs): + try: + return f(*args, **kwargs) + except StopIteration as exc: + raise RuntimeError( + "Caught StopIteration thrown from user's code; failing the task", + exc + ) + + return wrapper + + if __name__ == "__main__": import doctest (failure_count, test_count) = doctest.testmod() From 3a024a4db5b531025fbd7761bccf2525f83f4234 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Wed, 30 May 2018 10:33:34 -0700 Subject: [PATCH 479/566] [SPARK-24384][PYTHON][SPARK SUBMIT] Add .py files correctly into PythonRunner in submit with client mode in spark-submit ## What changes were proposed in this pull request? 
In client side before context initialization specifically, .py file doesn't work in client side before context initialization when the application is a Python file. See below: ``` $ cat /home/spark/tmp.py def testtest(): return 1 ``` This works: ``` $ cat app.py import pyspark pyspark.sql.SparkSession.builder.getOrCreate() import tmp print("************************%s" % tmp.testtest()) $ ./bin/spark-submit --master yarn --deploy-mode client --py-files /home/spark/tmp.py app.py ... ************************1 ``` but this doesn't: ``` $ cat app.py import pyspark import tmp pyspark.sql.SparkSession.builder.getOrCreate() print("************************%s" % tmp.testtest()) $ ./bin/spark-submit --master yarn --deploy-mode client --py-files /home/spark/tmp.py app.py Traceback (most recent call last): File "/home/spark/spark/app.py", line 2, in import tmp ImportError: No module named tmp ``` ### How did it happen? In client mode specifically, the paths are being added into PythonRunner as are: https://github.com/apache/spark/blob/628c7b517969c4a7ccb26ea67ab3dd61266073ca/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala#L430 https://github.com/apache/spark/blob/628c7b517969c4a7ccb26ea67ab3dd61266073ca/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala#L49-L88 The problem here is, .py file shouldn't be added as are since `PYTHONPATH` expects a directory or an archive like zip or egg. ### How does this PR fix? We shouldn't simply just add its parent directory because other files in the parent directory could also be added into the `PYTHONPATH` in client mode before context initialization. Therefore, we copy .py files into a temp directory for .py files and add it to `PYTHONPATH`. ## How was this patch tested? Unit tests are added and manually tested in both standalond and yarn client modes with submit. Author: hyukjinkwon Closes #21426 from HyukjinKwon/SPARK-24384. (cherry picked from commit b142157dcc7f595eea93d66dda8b1d169a38d95c) Signed-off-by: Marcelo Vanzin --- .../apache/spark/deploy/PythonRunner.scala | 29 ++++++++++++++++++- .../spark/deploy/yarn/YarnClusterSuite.scala | 15 ++++------ 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala index 1b7e031ee0678..ccb30e205ca40 100644 --- a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala @@ -19,6 +19,7 @@ package org.apache.spark.deploy import java.io.File import java.net.{InetAddress, URI} +import java.nio.file.Files import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer @@ -48,7 +49,7 @@ object PythonRunner { // Format python file paths before adding them to the PYTHONPATH val formattedPythonFile = formatPath(pythonFile) - val formattedPyFiles = formatPaths(pyFiles) + val formattedPyFiles = resolvePyFiles(formatPaths(pyFiles)) // Launch a Py4J gateway server for the process to connect to; this will let it see our // Java system properties and such @@ -153,4 +154,30 @@ object PythonRunner { .map { p => formatPath(p, testWindows) } } + /** + * Resolves the ".py" files. ".py" file should not be added as is because PYTHONPATH does + * not expect a file. This method creates a temporary directory and puts the ".py" files + * if exist in the given paths. 
+ */ + private def resolvePyFiles(pyFiles: Array[String]): Array[String] = { + lazy val dest = Utils.createTempDir(namePrefix = "localPyFiles") + pyFiles.flatMap { pyFile => + // In case of client with submit, the python paths should be set before context + // initialization because the context initialization can be done later. + // We will copy the local ".py" files because ".py" file shouldn't be added + // alone but its parent directory in PYTHONPATH. See SPARK-24384. + if (pyFile.endsWith(".py")) { + val source = new File(pyFile) + if (source.exists() && source.isFile && source.canRead) { + Files.copy(source.toPath, new File(dest, source.getName).toPath) + Some(dest.getAbsolutePath) + } else { + // Don't have to add it if it doesn't exist or isn't readable. + None + } + } else { + Some(pyFile) + } + }.distinct + } } diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index 6a974d558403f..4210737310c6b 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -263,16 +263,11 @@ class YarnClusterSuite extends BaseYarnClusterSuite { "PYSPARK_ARCHIVES_PATH" -> pythonPath.map("local:" + _).mkString(File.pathSeparator), "PYTHONPATH" -> pythonPath.mkString(File.pathSeparator)) ++ extraEnv - val moduleDir = - if (clientMode) { - // In client-mode, .py files added with --py-files are not visible in the driver. - // This is something that the launcher library would have to handle. - tempDir - } else { - val subdir = new File(tempDir, "pyModules") - subdir.mkdir() - subdir - } + val moduleDir = { + val subdir = new File(tempDir, "pyModules") + subdir.mkdir() + subdir + } val pyModule = new File(moduleDir, "mod1.py") Files.write(TEST_PYMODULE, pyModule, StandardCharsets.UTF_8) From dc24da23341eafb8163ce5292166048750b7f6ed Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Thu, 31 May 2018 09:34:39 -0700 Subject: [PATCH 480/566] [WEBUI] Avoid possibility of script in query param keys As discussed separately, this avoids the possibility of XSS on certain request param keys. CC vanzin Author: Sean Owen Closes #21464 from srowen/XSS2. 
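A self-contained sketch of the gap being closed is below; `stripTags` is a hypothetical stand-in for `UIUtils.stripXSS`, which is internal to the UI package, and the map plays the role of a servlet request's parameter map.

```scala
// Illustration only: sanitizing just the values of a request-parameter map leaves
// the keys, which are equally attacker-controlled, untouched.
object ParamSanitizerSketch {
  // Hypothetical stand-in for UIUtils.stripXSS.
  private def stripTags(s: String): String = s.replaceAll("[<>\"'%;()&+]", "")

  def main(args: Array[String]): Unit = {
    val params: Map[String, Seq[String]] =
      Map("order<script>alert(1)</script>" -> Seq("desc"))

    val valuesOnly = params.mapValues(_.map(stripTags))   // key still carries the markup
    val both = params.map { case (k, v) => stripTags(k) -> v.map(stripTags) }

    println(valuesOnly.keys.head)   // order<script>alert(1)</script>
    println(both.keys.head)         // orderscriptalert1/script
  }
}
```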
(cherry picked from commit 698b9a0981f0ec322e15d6ac89cc38c8f49ed33d) Signed-off-by: Marcelo Vanzin --- .../src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala | 4 +++- core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala index c2668a7ff832f..a1bc93e8f6781 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala @@ -206,7 +206,9 @@ private[ui] class AllJobsPage(parent: JobsTab, store: AppStatusStore) extends We jobs: Seq[v1.JobData], killEnabled: Boolean): Seq[Node] = { // stripXSS is called to remove suspicious characters used in XSS attacks - val allParameters = request.getParameterMap.asScala.toMap.mapValues(_.map(UIUtils.stripXSS)) + val allParameters = request.getParameterMap.asScala.toMap.map { case (k, v) => + UIUtils.stripXSS(k) -> v.map(UIUtils.stripXSS).toSeq + } val parameterOtherTable = allParameters.filterNot(_._1.startsWith(jobTag)) .map(para => para._1 + "=" + para._2(0)) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala index 18a4926f2f6c0..f001a01de3952 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala @@ -43,7 +43,9 @@ private[ui] class StageTableBase( killEnabled: Boolean, isFailedStage: Boolean) { // stripXSS is called to remove suspicious characters used in XSS attacks - val allParameters = request.getParameterMap.asScala.toMap.mapValues(_.map(UIUtils.stripXSS)) + val allParameters = request.getParameterMap.asScala.toMap.map { case (k, v) => + UIUtils.stripXSS(k) -> v.map(UIUtils.stripXSS).toSeq + } val parameterOtherTable = allParameters.filterNot(_._1.startsWith(stageTag)) .map(para => para._1 + "=" + para._2(0)) From b37e76fa4cfa066574fa4684b18f5f62fa3cf5fc Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 31 May 2018 10:05:20 -0700 Subject: [PATCH 481/566] [SPARK-24414][UI] Calculate the correct number of tasks for a stage. This change takes into account all non-pending tasks when calculating the number of tasks to be shown. This also means that when the stage is pending, the task table (or, in fact, most of the data in the stage page) will not be rendered. I also fixed the label when the known number of tasks is larger than the recorded number of tasks (it was inverted). Author: Marcelo Vanzin Closes #21457 from vanzin/SPARK-24414. 
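The arithmetic of the change can be sketched with a simplified stand-in for the stage data exposed by the status store; only the counting is shown, not the page rendering.

```scala
// StageCounts is a hypothetical stand-in for the v1.StageData fields used by the page.
case class StageCounts(numTasks: Int, active: Int, complete: Int, failed: Int, killed: Int)

object TaskCountSketch {
  // Mirrors the new helper: only tasks the store has actually recorded are counted.
  def taskCount(s: StageCounts): Int = s.active + s.complete + s.failed + s.killed

  def main(args: Array[String]): Unit = {
    val pending = StageCounts(numTasks = 200, active = 0, complete = 0, failed = 0, killed = 0)
    val running = StageCounts(numTasks = 200, active = 10, complete = 120, failed = 1, killed = 0)

    println(taskCount(pending))   // 0, so the page skips the task table entirely
    println(taskCount(running))   // 131, used to size the table instead of numTasks (200)
  }
}
```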
(cherry picked from commit 7a82e93b349b4f414f2075dd5add8e4ed72fe357) Signed-off-by: Marcelo Vanzin --- .../scala/org/apache/spark/ui/jobs/StagePage.scala | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index 08a927a8b4885..7ab433655233e 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -117,8 +117,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val localitySummary = store.localitySummary(stageData.stageId, stageData.attemptId) - val totalTasks = stageData.numActiveTasks + stageData.numCompleteTasks + - stageData.numFailedTasks + stageData.numKilledTasks + val totalTasks = taskCount(stageData) if (totalTasks == 0) { val content =
    @@ -133,7 +132,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val totalTasksNumStr = if (totalTasks == storedTasks) { s"$totalTasks" } else { - s"$totalTasks, showing ${storedTasks}" + s"$storedTasks, showing ${totalTasks}" } val summary = @@ -678,7 +677,7 @@ private[ui] class TaskDataSource( private var _tasksToShow: Seq[TaskData] = null - override def dataSize: Int = stage.numTasks + override def dataSize: Int = taskCount(stage) override def sliceData(from: Int, to: Int): Seq[TaskData] = { if (_tasksToShow == null) { @@ -1044,4 +1043,9 @@ private[ui] object ApiHelper { (stage.map(_.name).getOrElse(""), stage.flatMap(_.description).getOrElse(job.name)) } + def taskCount(stageData: StageData): Int = { + stageData.numActiveTasks + stageData.numCompleteTasks + stageData.numFailedTasks + + stageData.numKilledTasks + } + } From e56266ad719488d3887fb7ea0985b3760b3ece12 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Fri, 1 Jun 2018 14:27:10 +0800 Subject: [PATCH 482/566] [SPARK-24444][DOCS][PYTHON][BRANCH-2.3] Improve Pandas UDF docs to explain column assignment ## What changes were proposed in this pull request? Added sections to pandas_udf docs, in the grouped map section, to indicate columns are assigned by position. Backported to branch-2.3. ## How was this patch tested? NA Author: Bryan Cutler Closes #21478 from BryanCutler/arrow-doc-pandas_udf-column_by_pos-2_3_1-SPARK-21427. --- docs/sql-programming-guide.md | 9 +++++++++ python/pyspark/sql/functions.py | 9 ++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 14bc5e626771c..461806a659965 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1737,6 +1737,15 @@ To use `groupBy().apply()`, the user needs to define the following: * A Python function that defines the computation for each group. * A `StructType` object or a string that defines the schema of the output `DataFrame`. +The output schema will be applied to the columns of the returned `pandas.DataFrame` in order by position, +not by name. This means that the columns in the `pandas.DataFrame` must be indexed so that their +position matches the corresponding field in the schema. + +Note that when creating a new `pandas.DataFrame` using a dictionary, the actual position of the column +can differ from the order that it was placed in the dictionary. It is recommended in this case to +explicitly define the column order using the `columns` keyword, e.g. +`pandas.DataFrame({'id': ids, 'a': data}, columns=['id', 'a'])`, or alternatively use an `OrderedDict`. + Note that all data for a group will be loaded into memory before the function is applied. This can lead to out of memory exceptons, especially if the group sizes are skewed. The configuration for [maxRecordsPerBatch](#setting-arrow-batch-size) is not applied on groups and it is up to the user diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index cf26523b3cb45..9c02982e4ae22 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2216,7 +2216,8 @@ def pandas_udf(f=None, returnType=None, functionType=None): A grouped map UDF defines transformation: A `pandas.DataFrame` -> A `pandas.DataFrame` The returnType should be a :class:`StructType` describing the schema of the returned `pandas.DataFrame`. - The length of the returned `pandas.DataFrame` can be arbitrary. 
+ The length of the returned `pandas.DataFrame` can be arbitrary and the columns must be + indexed so that their position matches the corresponding field in the schema. Grouped map UDFs are used with :meth:`pyspark.sql.GroupedData.apply`. @@ -2239,6 +2240,12 @@ def pandas_udf(f=None, returnType=None, functionType=None): | 2| 1.1094003924504583| +---+-------------------+ + .. note:: If returning a new `pandas.DataFrame` constructed with a dictionary, it is + recommended to explicitly index the columns by name to ensure the positions are correct, + or alternatively use an `OrderedDict`. + For example, `pd.DataFrame({'id': ids, 'a': data}, columns=['id', 'a'])` or + `pd.DataFrame(OrderedDict([('id', ids), ('a', data)]))`. + .. seealso:: :meth:`pyspark.sql.GroupedData.apply` .. note:: The user-defined functions are considered deterministic by default. Due to From 1cc5f68be73a980111ce0443413356f2b7634bd1 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 1 Jun 2018 10:56:26 -0700 Subject: [PATCH 483/566] Preparing Spark release v2.3.1-rc3 --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 8df26351614ec..632bcb342caeb 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.2 +Version: 2.3.1 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. 
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index 02bf39bcb96f3..d744c8b4e2e0f 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 646fdfb23ef19..3a41e1627993f 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 76c7dcf52203f..f02108f03b74b 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index f2661fedd6f75..44304870bced2 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 229d4667169d2..167dc7aa503a7 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index febec1897c59c..ffb770ea6769e 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 4dec96f86d9c3..ddbfb49fe0021 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 7187313c5c2f7..ce7d67828e0b0 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index a9cc91f87566b..506b2740817af 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index 8579166c2e635..daef97ce691d0 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.2-SNAPSHOT -SPARK_VERSION_SHORT: 2.3.2 +SPARK_VERSION: 2.3.1 +SPARK_VERSION_SHORT: 2.3.1 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index b873bc9b20322..7974a3847f01b 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index b7dc48e4a7001..33f72dfe1bacc 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 161fd0faac82a..b5ac7aee37049 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 26abc31f9620d..9118ad274cf44 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index bcc13ebb359b3..73b54bc15418a 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 020e0ba7d568b..815cab9c4ef8d 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index e12115ee6e20c..475cf7ff39d17 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 07d205b01dfde..553b73390a220 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 11d971a0de046..f911d0191b0f7 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 70eb580d474af..4cfc4577fe09d 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 0728baede26f0..b385bacf1027e 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index c6925851ebc23..416b40c2a798a 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 8b95f1be216c7..8c99aa3a1a97f 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index b5253fd8954f3..74535db06ad0a 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index dacd42126393b..7e93d8925a505 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 2fcb367cf301c..5e2d8503b8a1e 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 8ecd126ebb9b2..dd4825c19a7ad 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index f52b83475c515..5160d0b6e8b85 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/pom.xml b/pom.xml index 0e9354891f1a3..f4b943fbe529b 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index ed1cbdd58f252..49c29a0042617 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.2.dev0" +__version__ = "2.3.1" diff --git a/repl/pom.xml b/repl/pom.xml index bc10cce2d1aea..84802101496bd 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 3e0aa0872b99b..4457605fc1515 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index cf153476ce9f9..7e47916dcacaa 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 71ddbb5d4db08..78bfce4e429f9 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 04a3380b02115..5a1657d109812 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index c9a05cc9e43f6..b3fa6535dfc37 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 8a0c6fba66cda..f0b7d5e23a569 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 2eeff747a68c6..6b9302c4eb92a 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index d16f48dec205f..a84af92d1c39b 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index f5927c5e41939..1c10777ce8286 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml From 2e0c3469d0d0d7b351d50c937b9862bd80b946ba Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 1 Jun 2018 10:56:29 -0700 Subject: [PATCH 484/566] Preparing development version 2.3.2-SNAPSHOT --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- 
external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 632bcb342caeb..8df26351614ec 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.1 +Version: 2.3.2 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index d744c8b4e2e0f..02bf39bcb96f3 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 3a41e1627993f..646fdfb23ef19 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index f02108f03b74b..76c7dcf52203f 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 44304870bced2..f2661fedd6f75 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 167dc7aa503a7..229d4667169d2 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index ffb770ea6769e..febec1897c59c 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index ddbfb49fe0021..4dec96f86d9c3 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index ce7d67828e0b0..7187313c5c2f7 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 506b2740817af..a9cc91f87566b 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index daef97ce691d0..8579166c2e635 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.1 -SPARK_VERSION_SHORT: 2.3.1 +SPARK_VERSION: 2.3.2-SNAPSHOT +SPARK_VERSION_SHORT: 2.3.2 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 7974a3847f01b..b873bc9b20322 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 33f72dfe1bacc..b7dc48e4a7001 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index b5ac7aee37049..161fd0faac82a 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 9118ad274cf44..26abc31f9620d 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 73b54bc15418a..bcc13ebb359b3 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 815cab9c4ef8d..020e0ba7d568b 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 475cf7ff39d17..e12115ee6e20c 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 553b73390a220..07d205b01dfde 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index f911d0191b0f7..11d971a0de046 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 4cfc4577fe09d..70eb580d474af 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index b385bacf1027e..0728baede26f0 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 416b40c2a798a..c6925851ebc23 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 8c99aa3a1a97f..8b95f1be216c7 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 74535db06ad0a..b5253fd8954f3 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 7e93d8925a505..dacd42126393b 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 5e2d8503b8a1e..2fcb367cf301c 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index dd4825c19a7ad..8ecd126ebb9b2 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 5160d0b6e8b85..f52b83475c515 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index f4b943fbe529b..0e9354891f1a3 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 49c29a0042617..ed1cbdd58f252 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.1" +__version__ = "2.3.2.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index 84802101496bd..bc10cce2d1aea 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 4457605fc1515..3e0aa0872b99b 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 7e47916dcacaa..cf153476ce9f9 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 78bfce4e429f9..71ddbb5d4db08 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 5a1657d109812..04a3380b02115 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index b3fa6535dfc37..c9a05cc9e43f6 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index f0b7d5e23a569..8a0c6fba66cda 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 6b9302c4eb92a..2eeff747a68c6 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index a84af92d1c39b..d16f48dec205f 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 1c10777ce8286..f5927c5e41939 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml From e4e96f9291b51a295fb3f97023061500280162c4 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Fri, 1 Jun 2018 11:52:20 -0700 Subject: [PATCH 485/566] Revert "[SPARK-24369][SQL] Correct handling for multiple distinct aggregations having the same argument set" This reverts commit 66289a3e067769cb8ed35953187f6363463791e1. 
--- .../optimizer/RewriteDistinctAggregates.scala | 7 +++---- .../apache/spark/sql/execution/SparkStrategies.scala | 2 +- .../src/test/resources/sql-tests/inputs/group-by.sql | 6 +----- .../test/resources/sql-tests/results/group-by.sql.out | 11 +---------- 4 files changed, 6 insertions(+), 20 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala index bc898ab0dc723..4448ace7105a4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala @@ -115,8 +115,7 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] { } // Extract distinct aggregate expressions. - val distincgAggExpressions = aggExpressions.filter(_.isDistinct) - val distinctAggGroups = distincgAggExpressions.groupBy { e => + val distinctAggGroups = aggExpressions.filter(_.isDistinct).groupBy { e => val unfoldableChildren = e.aggregateFunction.children.filter(!_.foldable).toSet if (unfoldableChildren.nonEmpty) { // Only expand the unfoldable children @@ -133,7 +132,7 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] { } // Aggregation strategy can handle queries with a single distinct group. - if (distincgAggExpressions.size > 1) { + if (distinctAggGroups.size > 1) { // Create the attributes for the grouping id and the group by clause. val gid = AttributeReference("gid", IntegerType, nullable = false)() val groupByMap = a.groupingExpressions.collect { @@ -152,7 +151,7 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] { } // Setup unique distinct aggregate children. - val distinctAggChildren = distinctAggGroups.keySet.flatten.toSeq + val distinctAggChildren = distinctAggGroups.keySet.flatten.toSeq.distinct val distinctAggChildAttrMap = distinctAggChildren.map(expressionAttributePair) val distinctAggChildAttrs = distinctAggChildAttrMap.map(_._2) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 9fed2f9bc255d..25436e1c8dd45 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -374,7 +374,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { aggregateExpressions.partition(_.isDistinct) if (functionsWithDistinct.map(_.aggregateFunction.children).distinct.length > 1) { // This is a sanity check. We should not reach here when we have multiple distinct - // column sets. Our `RewriteDistinctAggregates` should take care this case. + // column sets. Our MultipleDistinctRewriter should take care this case. sys.error("You hit a query analyzer bug. 
Please report your query to " + "Spark user mailing list.") } diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql index 2c18d6aaabdba..c5070b734d521 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql @@ -68,8 +68,4 @@ SELECT 1 from ( FROM (select 1 as x) a WHERE false ) b -where b.z != b.z; - --- SPARK-24369 multiple distinct aggregations having the same argument set -SELECT corr(DISTINCT x, y), corr(DISTINCT y, x), count(*) - FROM (VALUES (1, 1), (2, 2), (2, 2)) t(x, y); +where b.z != b.z diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out index 581aa1754ce14..c1abc6dff754b 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 27 +-- Number of queries: 26 -- !query 0 @@ -241,12 +241,3 @@ where b.z != b.z struct<1:int> -- !query 25 output - - --- !query 26 -SELECT corr(DISTINCT x, y), corr(DISTINCT y, x), count(*) - FROM (VALUES (1, 1), (2, 2), (2, 2)) t(x, y) --- !query 26 schema -struct --- !query 26 output -1.0 1.0 3 From 30aaa5a3a1076ca52439a905274b1fcf498bc562 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 1 Jun 2018 13:34:19 -0700 Subject: [PATCH 486/566] Preparing Spark release v2.3.1-rc4 --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 8df26351614ec..632bcb342caeb 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.2 +Version: 2.3.1 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. 
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index 02bf39bcb96f3..d744c8b4e2e0f 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 646fdfb23ef19..3a41e1627993f 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 76c7dcf52203f..f02108f03b74b 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index f2661fedd6f75..44304870bced2 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 229d4667169d2..167dc7aa503a7 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index febec1897c59c..ffb770ea6769e 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 4dec96f86d9c3..ddbfb49fe0021 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 7187313c5c2f7..ce7d67828e0b0 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index a9cc91f87566b..506b2740817af 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index 8579166c2e635..daef97ce691d0 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.2-SNAPSHOT -SPARK_VERSION_SHORT: 2.3.2 +SPARK_VERSION: 2.3.1 +SPARK_VERSION_SHORT: 2.3.1 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index b873bc9b20322..7974a3847f01b 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index b7dc48e4a7001..33f72dfe1bacc 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 161fd0faac82a..b5ac7aee37049 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 26abc31f9620d..9118ad274cf44 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index bcc13ebb359b3..73b54bc15418a 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 020e0ba7d568b..815cab9c4ef8d 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index e12115ee6e20c..475cf7ff39d17 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 07d205b01dfde..553b73390a220 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 11d971a0de046..f911d0191b0f7 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 70eb580d474af..4cfc4577fe09d 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 0728baede26f0..b385bacf1027e 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index c6925851ebc23..416b40c2a798a 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 8b95f1be216c7..8c99aa3a1a97f 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index b5253fd8954f3..74535db06ad0a 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index dacd42126393b..7e93d8925a505 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 2fcb367cf301c..5e2d8503b8a1e 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 8ecd126ebb9b2..dd4825c19a7ad 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index f52b83475c515..5160d0b6e8b85 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/pom.xml b/pom.xml index 0e9354891f1a3..f4b943fbe529b 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index ed1cbdd58f252..49c29a0042617 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.2.dev0" +__version__ = "2.3.1" diff --git a/repl/pom.xml b/repl/pom.xml index bc10cce2d1aea..84802101496bd 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 3e0aa0872b99b..4457605fc1515 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index cf153476ce9f9..7e47916dcacaa 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 71ddbb5d4db08..78bfce4e429f9 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 04a3380b02115..5a1657d109812 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index c9a05cc9e43f6..b3fa6535dfc37 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 8a0c6fba66cda..f0b7d5e23a569 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 2eeff747a68c6..6b9302c4eb92a 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index d16f48dec205f..a84af92d1c39b 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index f5927c5e41939..1c10777ce8286 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.1 ../pom.xml From 21800b878605598988b54bcc4ef5b24a546ba9cc Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 1 Jun 2018 13:34:24 -0700 Subject: [PATCH 487/566] Preparing development version 2.3.2-SNAPSHOT --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- 
external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 632bcb342caeb..8df26351614ec 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.1 +Version: 2.3.2 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index d744c8b4e2e0f..02bf39bcb96f3 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 3a41e1627993f..646fdfb23ef19 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index f02108f03b74b..76c7dcf52203f 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 44304870bced2..f2661fedd6f75 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 167dc7aa503a7..229d4667169d2 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index ffb770ea6769e..febec1897c59c 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index ddbfb49fe0021..4dec96f86d9c3 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index ce7d67828e0b0..7187313c5c2f7 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 506b2740817af..a9cc91f87566b 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index daef97ce691d0..8579166c2e635 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.1 -SPARK_VERSION_SHORT: 2.3.1 +SPARK_VERSION: 2.3.2-SNAPSHOT +SPARK_VERSION_SHORT: 2.3.2 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 7974a3847f01b..b873bc9b20322 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 33f72dfe1bacc..b7dc48e4a7001 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index b5ac7aee37049..161fd0faac82a 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 9118ad274cf44..26abc31f9620d 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 73b54bc15418a..bcc13ebb359b3 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 815cab9c4ef8d..020e0ba7d568b 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 475cf7ff39d17..e12115ee6e20c 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 553b73390a220..07d205b01dfde 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index f911d0191b0f7..11d971a0de046 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 4cfc4577fe09d..70eb580d474af 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index b385bacf1027e..0728baede26f0 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 416b40c2a798a..c6925851ebc23 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 8c99aa3a1a97f..8b95f1be216c7 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 74535db06ad0a..b5253fd8954f3 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 7e93d8925a505..dacd42126393b 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 5e2d8503b8a1e..2fcb367cf301c 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index dd4825c19a7ad..8ecd126ebb9b2 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 5160d0b6e8b85..f52b83475c515 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index f4b943fbe529b..0e9354891f1a3 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 49c29a0042617..ed1cbdd58f252 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.1" +__version__ = "2.3.2.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index 84802101496bd..bc10cce2d1aea 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 4457605fc1515..3e0aa0872b99b 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 7e47916dcacaa..cf153476ce9f9 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 78bfce4e429f9..71ddbb5d4db08 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 5a1657d109812..04a3380b02115 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index b3fa6535dfc37..c9a05cc9e43f6 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index f0b7d5e23a569..8a0c6fba66cda 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 6b9302c4eb92a..2eeff747a68c6 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index a84af92d1c39b..d16f48dec205f 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 1c10777ce8286..f5927c5e41939 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.1 + 2.3.2-SNAPSHOT ../pom.xml From 18194544fb7219d9ddaa55da96d35c9ba15559c3 Mon Sep 17 00:00:00 2001 From: xueyu Date: Mon, 4 Jun 2018 08:10:49 +0700 Subject: [PATCH 488/566] [SPARK-24455][CORE] fix typo in TaskSchedulerImpl comment change runTasks to submitTasks in the TaskSchedulerImpl.scala 's comment Author: xueyu Author: Xue Yu <278006819@qq.com> Closes #21485 from xueyumusic/fixtypo1. 
(cherry picked from commit a2166ecddaec030f78acaa66ce660d979a35079c) Signed-off-by: hyukjinkwon --- .../org/apache/spark/scheduler/TaskSchedulerImpl.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 8e97b3da33820..598b62f85a1fa 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -42,7 +42,7 @@ import org.apache.spark.util.{AccumulatorV2, ThreadUtils, Utils} * up to launch speculative tasks, etc. * * Clients should first call initialize() and start(), then submit task sets through the - * runTasks method. + * submitTasks method. * * THREADING: [[SchedulerBackend]]s and task-submitting clients can call this class from multiple * threads, so it needs locks in public API methods to maintain its state. In addition, some @@ -62,7 +62,7 @@ private[spark] class TaskSchedulerImpl( this(sc, sc.conf.get(config.MAX_TASK_FAILURES)) } - // Lazily initializing blackListTrackOpt to avoid getting empty ExecutorAllocationClient, + // Lazily initializing blacklistTrackerOpt to avoid getting empty ExecutorAllocationClient, // because ExecutorAllocationClient is created after this TaskSchedulerImpl. private[scheduler] lazy val blacklistTrackerOpt = maybeCreateBlacklistTracker(sc) @@ -228,7 +228,7 @@ private[spark] class TaskSchedulerImpl( // 1. The task set manager has been created and some tasks have been scheduled. // In this case, send a kill signal to the executors to kill the task and then abort // the stage. - // 2. The task set manager has been created but no tasks has been scheduled. In this case, + // 2. The task set manager has been created but no tasks have been scheduled. In this case, // simply abort the stage. tsm.runningTasksSet.foreach { tid => taskIdToExecutorId.get(tid).foreach(execId => @@ -694,7 +694,7 @@ private[spark] class TaskSchedulerImpl( * * After stage failure and retry, there may be multiple TaskSetManagers for the stage. * If an earlier attempt of a stage completes a task, we should ensure that the later attempts - * do not also submit those same tasks. That also means that a task completion from an earlier + * do not also submit those same tasks. That also means that a task completion from an earlier * attempt can lead to the entire stage getting marked as successful. */ private[scheduler] def markPartitionCompletedInAllTaskSets(stageId: Int, partitionId: Int) = { From 36f1d5e17a79bc343d6ea88af2d1fed1f02c132f Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Sun, 3 Jun 2018 21:57:42 -0700 Subject: [PATCH 489/566] [SPARK-24369][SQL] Correct handling for multiple distinct aggregations having the same argument set ## What changes were proposed in this pull request? bring back https://github.com/apache/spark/pull/21443 This is a different approach: just change the check to count distinct columns with `toSet` ## How was this patch tested? a new test to verify the planner behavior. Author: Wenchen Fan Author: Takeshi Yamamuro Closes #21487 from cloud-fan/back. 
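For illustration only (not part of the patch), a minimal standalone Scala sketch of why comparing argument *sets* rather than ordered argument lists lets the planner's sanity check treat `corr(DISTINCT j, k)` and `corr(DISTINCT k, j)` as a single distinct group; the `children` value below is a hypothetical stand-in for the aggregate functions' child expressions, not Spark code:

```scala
object DistinctGroupCheckSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical children of two distinct aggregates over the same columns,
    // listed in different orders: corr(DISTINCT j, k) and corr(DISTINCT k, j).
    val children: Seq[Seq[String]] = Seq(Seq("j", "k"), Seq("k", "j"))

    // Order-sensitive comparison: sees two different argument lists.
    val distinctLists = children.distinct.length              // 2

    // Order-insensitive comparison (the shape this patch uses via toSet):
    // both aggregates share one argument set, so no error is raised.
    val distinctSets = children.map(_.toSet).distinct.length  // 1

    println(s"distinct lists = $distinctLists, distinct sets = $distinctSets")
  }
}
```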
(cherry picked from commit 416cd1fd96c0db9194e32ba877b1396b6dc13c8e) Signed-off-by: Xiao Li --- .../spark/sql/execution/SparkStrategies.scala | 4 ++-- .../resources/sql-tests/inputs/group-by.sql | 6 +++++- .../sql-tests/results/group-by.sql.out | 11 +++++++++- .../spark/sql/execution/PlannerSuite.scala | 21 +++++++++++++++++++ 4 files changed, 38 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 25436e1c8dd45..c6565fcf66559 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -372,9 +372,9 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { val (functionsWithDistinct, functionsWithoutDistinct) = aggregateExpressions.partition(_.isDistinct) - if (functionsWithDistinct.map(_.aggregateFunction.children).distinct.length > 1) { + if (functionsWithDistinct.map(_.aggregateFunction.children.toSet).distinct.length > 1) { // This is a sanity check. We should not reach here when we have multiple distinct - // column sets. Our MultipleDistinctRewriter should take care this case. + // column sets. Our `RewriteDistinctAggregates` should take care this case. sys.error("You hit a query analyzer bug. Please report your query to " + "Spark user mailing list.") } diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql index c5070b734d521..2c18d6aaabdba 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql @@ -68,4 +68,8 @@ SELECT 1 from ( FROM (select 1 as x) a WHERE false ) b -where b.z != b.z +where b.z != b.z; + +-- SPARK-24369 multiple distinct aggregations having the same argument set +SELECT corr(DISTINCT x, y), corr(DISTINCT y, x), count(*) + FROM (VALUES (1, 1), (2, 2), (2, 2)) t(x, y); diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out index c1abc6dff754b..581aa1754ce14 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 26 +-- Number of queries: 27 -- !query 0 @@ -241,3 +241,12 @@ where b.z != b.z struct<1:int> -- !query 25 output + + +-- !query 26 +SELECT corr(DISTINCT x, y), corr(DISTINCT y, x), count(*) + FROM (VALUES (1, 1), (2, 2), (2, 2)) t(x, y) +-- !query 26 schema +struct +-- !query 26 output +1.0 1.0 3 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index f8b26f5b28cc7..dfbc0346cb247 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -69,6 +69,27 @@ class PlannerSuite extends SharedSQLContext { testPartialAggregationPlan(query) } + test("mixed aggregates with same distinct columns") { + def assertNoExpand(plan: SparkPlan): Unit = { + assert(plan.collect { case e: ExpandExec => e }.isEmpty) + } + + withTempView("v") { + Seq((1, 1.0, 1.0), (1, 2.0, 2.0)).toDF("i", "j", "k").createTempView("v") + // one distinct column + val query1 = sql("SELECT 
sum(DISTINCT j), max(DISTINCT j) FROM v GROUP BY i") + assertNoExpand(query1.queryExecution.executedPlan) + + // 2 distinct columns + val query2 = sql("SELECT corr(DISTINCT j, k), count(DISTINCT j, k) FROM v GROUP BY i") + assertNoExpand(query2.queryExecution.executedPlan) + + // 2 distinct columns with different order + val query3 = sql("SELECT corr(DISTINCT j, k), count(DISTINCT k, j) FROM v GROUP BY i") + assertNoExpand(query3.queryExecution.executedPlan) + } + } + test("sizeInBytes estimation of limit operator for broadcast hash join optimization") { def checkPlan(fieldTypes: Seq[DataType]): Unit = { withTempView("testLimit") { From 1582945d0472bd16b07fd4e79ac872533a6885cd Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 8 Jun 2018 18:51:56 -0700 Subject: [PATCH 490/566] [SPARK-24468][SQL] Handle negative scale when adjusting precision for decimal operations ## What changes were proposed in this pull request? In SPARK-22036 we introduced the possibility to allow precision loss in arithmetic operations (according to the SQL standard). The implementation was drawn from Hive's one, where Decimals with a negative scale are not allowed in the operations. The PR handles the case when the scale is negative, removing the assertion that it is not. ## How was this patch tested? added UTs Author: Marco Gaido Closes #21499 from mgaido91/SPARK-24468. (cherry picked from commit f07c5064a3967cdddf57c2469635ee50a26d864c) Signed-off-by: Wenchen Fan --- .../apache/spark/sql/types/DecimalType.scala | 8 +- .../analysis/DecimalPrecisionSuite.scala | 9 + .../native/decimalArithmeticOperations.sql | 4 + .../decimalArithmeticOperations.sql.out | 164 +++++++++++------- 4 files changed, 117 insertions(+), 68 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala index ef3b67c0d48d0..dbf51c398fa47 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala @@ -161,13 +161,17 @@ object DecimalType extends AbstractDataType { * This method is used only when `spark.sql.decimalOperations.allowPrecisionLoss` is set to true. */ private[sql] def adjustPrecisionScale(precision: Int, scale: Int): DecimalType = { - // Assumptions: + // Assumption: assert(precision >= scale) - assert(scale >= 0) if (precision <= MAX_PRECISION) { // Adjustment only needed when we exceed max precision DecimalType(precision, scale) + } else if (scale < 0) { + // Decimal can have negative scale (SPARK-24468). In this case, we cannot allow a precision + // loss since we would cause a loss of digits in the integer part. + // In this case, we are likely to meet an overflow. + DecimalType(MAX_PRECISION, scale) } else { // Precision/scale exceed maximum precision. Result must be adjusted to MAX_PRECISION. 
val intDigits = precision - scale diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala index c86dc18dfa680..bd87ca6017e99 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala @@ -272,6 +272,15 @@ class DecimalPrecisionSuite extends AnalysisTest with BeforeAndAfter { } } + test("SPARK-24468: operations on decimals with negative scale") { + val a = AttributeReference("a", DecimalType(3, -10))() + val b = AttributeReference("b", DecimalType(1, -1))() + val c = AttributeReference("c", DecimalType(35, 1))() + checkType(Multiply(a, b), DecimalType(5, -11)) + checkType(Multiply(a, c), DecimalType(38, -9)) + checkType(Multiply(b, c), DecimalType(37, 0)) + } + /** strength reduction for integer/decimal comparisons */ def ruleTest(initial: Expression, transformed: Expression): Unit = { val testRelation = LocalRelation(AttributeReference("a", IntegerType)()) diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql index 9be7fcdadfea8..28a0e20c0f495 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql @@ -40,12 +40,14 @@ select 10.3000 * 3.0; select 10.30000 * 30.0; select 10.300000000000000000 * 3.000000000000000000; select 10.300000000000000000 * 3.0000000000000000000; +select 2.35E10 * 1.0; -- arithmetic operations causing an overflow return NULL select (5e36 + 0.1) + 5e36; select (-4e36 - 0.1) - 7e36; select 12345678901234567890.0 * 12345678901234567890.0; select 1e35 / 0.1; +select 1.2345678901234567890E30 * 1.2345678901234567890E25; -- arithmetic operations causing a precision loss are truncated select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345; @@ -67,12 +69,14 @@ select 10.3000 * 3.0; select 10.30000 * 30.0; select 10.300000000000000000 * 3.000000000000000000; select 10.300000000000000000 * 3.0000000000000000000; +select 2.35E10 * 1.0; -- arithmetic operations causing an overflow return NULL select (5e36 + 0.1) + 5e36; select (-4e36 - 0.1) - 7e36; select 12345678901234567890.0 * 12345678901234567890.0; select 1e35 / 0.1; +select 1.2345678901234567890E30 * 1.2345678901234567890E25; -- arithmetic operations causing a precision loss return NULL select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345; diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out index 6bfdb84548d4d..cbf44548b3cce 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 36 +-- Number of queries: 40 -- !query 0 @@ -114,190 +114,222 @@ struct<(CAST(10.300000000000000000 AS DECIMAL(21,19)) * CAST(3.00000000000000000 -- !query 13 -select (5e36 + 
0.1) + 5e36 +select 2.35E10 * 1.0 -- !query 13 schema -struct<(CAST((CAST(5E+36 AS DECIMAL(38,1)) + CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) + CAST(5E+36 AS DECIMAL(38,1))):decimal(38,1)> +struct<(CAST(2.35E+10 AS DECIMAL(12,1)) * CAST(1.0 AS DECIMAL(12,1))):decimal(6,-7)> -- !query 13 output -NULL +23500000000 -- !query 14 -select (-4e36 - 0.1) - 7e36 +select (5e36 + 0.1) + 5e36 -- !query 14 schema -struct<(CAST((CAST(-4E+36 AS DECIMAL(38,1)) - CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) - CAST(7E+36 AS DECIMAL(38,1))):decimal(38,1)> +struct<(CAST((CAST(5E+36 AS DECIMAL(38,1)) + CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) + CAST(5E+36 AS DECIMAL(38,1))):decimal(38,1)> -- !query 14 output NULL -- !query 15 -select 12345678901234567890.0 * 12345678901234567890.0 +select (-4e36 - 0.1) - 7e36 -- !query 15 schema -struct<(12345678901234567890.0 * 12345678901234567890.0):decimal(38,2)> +struct<(CAST((CAST(-4E+36 AS DECIMAL(38,1)) - CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) - CAST(7E+36 AS DECIMAL(38,1))):decimal(38,1)> -- !query 15 output NULL -- !query 16 -select 1e35 / 0.1 +select 12345678901234567890.0 * 12345678901234567890.0 -- !query 16 schema -struct<(CAST(1E+35 AS DECIMAL(37,1)) / CAST(0.1 AS DECIMAL(37,1))):decimal(38,6)> +struct<(12345678901234567890.0 * 12345678901234567890.0):decimal(38,2)> -- !query 16 output NULL -- !query 17 -select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345 +select 1e35 / 0.1 -- !query 17 schema -struct<(CAST(12345678912345678912345678912.1234567 AS DECIMAL(38,6)) + CAST(9999999999999999999999999999999.12345 AS DECIMAL(38,6))):decimal(38,6)> +struct<(CAST(1E+35 AS DECIMAL(37,1)) / CAST(0.1 AS DECIMAL(37,1))):decimal(38,6)> -- !query 17 output -10012345678912345678912345678911.246907 +NULL -- !query 18 -select 123456789123456789.1234567890 * 1.123456789123456789 +select 1.2345678901234567890E30 * 1.2345678901234567890E25 -- !query 18 schema -struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,18)> +struct<(CAST(1.2345678901234567890E+30 AS DECIMAL(25,-6)) * CAST(1.2345678901234567890E+25 AS DECIMAL(25,-6))):decimal(38,-17)> -- !query 18 output -138698367904130467.654320988515622621 +NULL -- !query 19 -select 12345678912345.123456789123 / 0.000000012345678 +select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345 -- !query 19 schema -struct<(CAST(12345678912345.123456789123 AS DECIMAL(29,15)) / CAST(1.2345678E-8 AS DECIMAL(29,15))):decimal(38,9)> +struct<(CAST(12345678912345678912345678912.1234567 AS DECIMAL(38,6)) + CAST(9999999999999999999999999999999.12345 AS DECIMAL(38,6))):decimal(38,6)> -- !query 19 output -1000000073899961059796.725866332 +10012345678912345678912345678911.246907 -- !query 20 -set spark.sql.decimalOperations.allowPrecisionLoss=false +select 123456789123456789.1234567890 * 1.123456789123456789 -- !query 20 schema -struct +struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,18)> -- !query 20 output -spark.sql.decimalOperations.allowPrecisionLoss false +138698367904130467.654320988515622621 -- !query 21 -select id, a+b, a-b, a*b, a/b from decimals_test order by id +select 12345678912345.123456789123 / 0.000000012345678 -- !query 21 schema -struct +struct<(CAST(12345678912345.123456789123 AS DECIMAL(29,15)) / CAST(1.2345678E-8 AS DECIMAL(29,15))):decimal(38,9)> -- !query 21 output -1 1099 -899 NULL 0.1001001001001001 -2 24690.246 0 
NULL 1 -3 1234.2234567891011 -1233.9765432108989 NULL 0.000100037913541123 -4 123456789123456790.123456789123456789 123456789123456787.876543210876543211 NULL 109890109097814272.043109406191131436 +1000000073899961059796.725866332 -- !query 22 -select id, a*10, b/10 from decimals_test order by id +set spark.sql.decimalOperations.allowPrecisionLoss=false -- !query 22 schema -struct +struct -- !query 22 output -1 1000 99.9 -2 123451.23 1234.5123 -3 1.234567891011 123.41 -4 1234567891234567890 0.1123456789123456789 +spark.sql.decimalOperations.allowPrecisionLoss false -- !query 23 -select 10.3 * 3.0 +select id, a+b, a-b, a*b, a/b from decimals_test order by id -- !query 23 schema -struct<(CAST(10.3 AS DECIMAL(3,1)) * CAST(3.0 AS DECIMAL(3,1))):decimal(6,2)> +struct -- !query 23 output -30.9 +1 1099 -899 NULL 0.1001001001001001 +2 24690.246 0 NULL 1 +3 1234.2234567891011 -1233.9765432108989 NULL 0.000100037913541123 +4 123456789123456790.123456789123456789 123456789123456787.876543210876543211 NULL 109890109097814272.043109406191131436 -- !query 24 -select 10.3000 * 3.0 +select id, a*10, b/10 from decimals_test order by id -- !query 24 schema -struct<(CAST(10.3000 AS DECIMAL(6,4)) * CAST(3.0 AS DECIMAL(6,4))):decimal(9,5)> +struct -- !query 24 output -30.9 +1 1000 99.9 +2 123451.23 1234.5123 +3 1.234567891011 123.41 +4 1234567891234567890 0.1123456789123456789 -- !query 25 -select 10.30000 * 30.0 +select 10.3 * 3.0 -- !query 25 schema -struct<(CAST(10.30000 AS DECIMAL(7,5)) * CAST(30.0 AS DECIMAL(7,5))):decimal(11,6)> +struct<(CAST(10.3 AS DECIMAL(3,1)) * CAST(3.0 AS DECIMAL(3,1))):decimal(6,2)> -- !query 25 output -309 +30.9 -- !query 26 -select 10.300000000000000000 * 3.000000000000000000 +select 10.3000 * 3.0 -- !query 26 schema -struct<(CAST(10.300000000000000000 AS DECIMAL(20,18)) * CAST(3.000000000000000000 AS DECIMAL(20,18))):decimal(38,36)> +struct<(CAST(10.3000 AS DECIMAL(6,4)) * CAST(3.0 AS DECIMAL(6,4))):decimal(9,5)> -- !query 26 output 30.9 -- !query 27 -select 10.300000000000000000 * 3.0000000000000000000 +select 10.30000 * 30.0 -- !query 27 schema -struct<(CAST(10.300000000000000000 AS DECIMAL(21,19)) * CAST(3.0000000000000000000 AS DECIMAL(21,19))):decimal(38,37)> +struct<(CAST(10.30000 AS DECIMAL(7,5)) * CAST(30.0 AS DECIMAL(7,5))):decimal(11,6)> -- !query 27 output -NULL +309 -- !query 28 -select (5e36 + 0.1) + 5e36 +select 10.300000000000000000 * 3.000000000000000000 -- !query 28 schema -struct<(CAST((CAST(5E+36 AS DECIMAL(38,1)) + CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) + CAST(5E+36 AS DECIMAL(38,1))):decimal(38,1)> +struct<(CAST(10.300000000000000000 AS DECIMAL(20,18)) * CAST(3.000000000000000000 AS DECIMAL(20,18))):decimal(38,36)> -- !query 28 output -NULL +30.9 -- !query 29 -select (-4e36 - 0.1) - 7e36 +select 10.300000000000000000 * 3.0000000000000000000 -- !query 29 schema -struct<(CAST((CAST(-4E+36 AS DECIMAL(38,1)) - CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) - CAST(7E+36 AS DECIMAL(38,1))):decimal(38,1)> +struct<(CAST(10.300000000000000000 AS DECIMAL(21,19)) * CAST(3.0000000000000000000 AS DECIMAL(21,19))):decimal(38,37)> -- !query 29 output NULL -- !query 30 -select 12345678901234567890.0 * 12345678901234567890.0 +select 2.35E10 * 1.0 -- !query 30 schema -struct<(12345678901234567890.0 * 12345678901234567890.0):decimal(38,2)> +struct<(CAST(2.35E+10 AS DECIMAL(12,1)) * CAST(1.0 AS DECIMAL(12,1))):decimal(6,-7)> -- !query 30 output -NULL +23500000000 -- !query 31 -select 1e35 / 0.1 +select (5e36 + 0.1) + 5e36 -- !query 31 schema -struct<(CAST(1E+35 AS 
DECIMAL(37,1)) / CAST(0.1 AS DECIMAL(37,1))):decimal(38,3)> +struct<(CAST((CAST(5E+36 AS DECIMAL(38,1)) + CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) + CAST(5E+36 AS DECIMAL(38,1))):decimal(38,1)> -- !query 31 output NULL -- !query 32 -select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345 +select (-4e36 - 0.1) - 7e36 -- !query 32 schema -struct<(CAST(12345678912345678912345678912.1234567 AS DECIMAL(38,7)) + CAST(9999999999999999999999999999999.12345 AS DECIMAL(38,7))):decimal(38,7)> +struct<(CAST((CAST(-4E+36 AS DECIMAL(38,1)) - CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) - CAST(7E+36 AS DECIMAL(38,1))):decimal(38,1)> -- !query 32 output NULL -- !query 33 -select 123456789123456789.1234567890 * 1.123456789123456789 +select 12345678901234567890.0 * 12345678901234567890.0 -- !query 33 schema -struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,28)> +struct<(12345678901234567890.0 * 12345678901234567890.0):decimal(38,2)> -- !query 33 output NULL -- !query 34 -select 12345678912345.123456789123 / 0.000000012345678 +select 1e35 / 0.1 -- !query 34 schema -struct<(CAST(12345678912345.123456789123 AS DECIMAL(29,15)) / CAST(1.2345678E-8 AS DECIMAL(29,15))):decimal(38,18)> +struct<(CAST(1E+35 AS DECIMAL(37,1)) / CAST(0.1 AS DECIMAL(37,1))):decimal(38,3)> -- !query 34 output NULL -- !query 35 -drop table decimals_test +select 1.2345678901234567890E30 * 1.2345678901234567890E25 -- !query 35 schema -struct<> +struct<(CAST(1.2345678901234567890E+30 AS DECIMAL(25,-6)) * CAST(1.2345678901234567890E+25 AS DECIMAL(25,-6))):decimal(38,-17)> -- !query 35 output +NULL + + +-- !query 36 +select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345 +-- !query 36 schema +struct<(CAST(12345678912345678912345678912.1234567 AS DECIMAL(38,7)) + CAST(9999999999999999999999999999999.12345 AS DECIMAL(38,7))):decimal(38,7)> +-- !query 36 output +NULL + + +-- !query 37 +select 123456789123456789.1234567890 * 1.123456789123456789 +-- !query 37 schema +struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,28)> +-- !query 37 output +NULL + + +-- !query 38 +select 12345678912345.123456789123 / 0.000000012345678 +-- !query 38 schema +struct<(CAST(12345678912345.123456789123 AS DECIMAL(29,15)) / CAST(1.2345678E-8 AS DECIMAL(29,15))):decimal(38,18)> +-- !query 38 output +NULL + + +-- !query 39 +drop table decimals_test +-- !query 39 schema +struct<> +-- !query 39 output From 4d4548a6eba0fb3e05c6e2457a51541475e456d3 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 12 Jun 2018 09:32:14 +0800 Subject: [PATCH 491/566] [SPARK-23732][DOCS] Fix source links in generated scaladoc. Apply the suggestion on the bug to fix source links. Tested with the 2.3.1 release docs. Author: Marcelo Vanzin Closes #21521 from vanzin/SPARK-23732. --- project/SparkBuild.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 7469f11df0294..c2e5137645d76 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -728,7 +728,8 @@ object Unidoc { scalacOptions in (ScalaUnidoc, unidoc) ++= Seq( "-groups", // Group similar methods together based on the @group annotation. 
- "-skip-packages", "org.apache.hadoop" + "-skip-packages", "org.apache.hadoop", + "-sourcepath", (baseDirectory in ThisBuild).value.getAbsolutePath ) ++ ( // Add links to sources when generating Scaladoc for a non-snapshot release if (!isSnapshot.value) { From bf58687572738ffc1a74bee9cffa310d4674ce31 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Mon, 11 Jun 2018 22:08:44 -0700 Subject: [PATCH 492/566] [SPARK-24502][SQL] flaky test: UnsafeRowSerializerSuite ## What changes were proposed in this pull request? `UnsafeRowSerializerSuite` calls `UnsafeProjection.create` which accesses `SQLConf.get`, while the current active SparkSession may already be stopped, and we may hit exception like this ``` sbt.ForkMain$ForkError: java.lang.IllegalStateException: LiveListenerBus is stopped. at org.apache.spark.scheduler.LiveListenerBus.addToQueue(LiveListenerBus.scala:97) at org.apache.spark.scheduler.LiveListenerBus.addToStatusQueue(LiveListenerBus.scala:80) at org.apache.spark.sql.internal.SharedState.(SharedState.scala:93) at org.apache.spark.sql.SparkSession$$anonfun$sharedState$1.apply(SparkSession.scala:120) at org.apache.spark.sql.SparkSession$$anonfun$sharedState$1.apply(SparkSession.scala:120) at scala.Option.getOrElse(Option.scala:121) at org.apache.spark.sql.SparkSession.sharedState$lzycompute(SparkSession.scala:120) at org.apache.spark.sql.SparkSession.sharedState(SparkSession.scala:119) at org.apache.spark.sql.internal.BaseSessionStateBuilder.build(BaseSessionStateBuilder.scala:286) at org.apache.spark.sql.test.TestSparkSession.sessionState$lzycompute(TestSQLContext.scala:42) at org.apache.spark.sql.test.TestSparkSession.sessionState(TestSQLContext.scala:41) at org.apache.spark.sql.SparkSession$$anonfun$1$$anonfun$apply$1.apply(SparkSession.scala:95) at org.apache.spark.sql.SparkSession$$anonfun$1$$anonfun$apply$1.apply(SparkSession.scala:95) at scala.Option.map(Option.scala:146) at org.apache.spark.sql.SparkSession$$anonfun$1.apply(SparkSession.scala:95) at org.apache.spark.sql.SparkSession$$anonfun$1.apply(SparkSession.scala:94) at org.apache.spark.sql.internal.SQLConf$.get(SQLConf.scala:126) at org.apache.spark.sql.catalyst.expressions.CodeGeneratorWithInterpretedFallback.createObject(CodeGeneratorWithInterpretedFallback.scala:54) at org.apache.spark.sql.catalyst.expressions.UnsafeProjection$.create(Projection.scala:157) at org.apache.spark.sql.catalyst.expressions.UnsafeProjection$.create(Projection.scala:150) at org.apache.spark.sql.execution.UnsafeRowSerializerSuite.org$apache$spark$sql$execution$UnsafeRowSerializerSuite$$unsafeRowConverter(UnsafeRowSerializerSuite.scala:54) at org.apache.spark.sql.execution.UnsafeRowSerializerSuite.org$apache$spark$sql$execution$UnsafeRowSerializerSuite$$toUnsafeRow(UnsafeRowSerializerSuite.scala:49) at org.apache.spark.sql.execution.UnsafeRowSerializerSuite$$anonfun$2.apply(UnsafeRowSerializerSuite.scala:63) at org.apache.spark.sql.execution.UnsafeRowSerializerSuite$$anonfun$2.apply(UnsafeRowSerializerSuite.scala:60) ... ``` ## How was this patch tested? N/A Author: Wenchen Fan Closes #21518 from cloud-fan/test. 
(cherry picked from commit 01452ea9c75ff027ceeb8314368c6bbedefdb2bf) Signed-off-by: Wenchen Fan --- .../apache/spark/sql/LocalSparkSession.scala | 4 + .../execution/UnsafeRowSerializerSuite.scala | 80 +++++++------------ 2 files changed, 35 insertions(+), 49 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/LocalSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/LocalSparkSession.scala index d66a6902b0510..cbef1c7828319 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/LocalSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/LocalSparkSession.scala @@ -30,11 +30,15 @@ trait LocalSparkSession extends BeforeAndAfterEach with BeforeAndAfterAll { self override def beforeAll() { super.beforeAll() InternalLoggerFactory.setDefaultFactory(Slf4JLoggerFactory.INSTANCE) + SparkSession.clearActiveSession() + SparkSession.clearDefaultSession() } override def afterEach() { try { resetSparkContext() + SparkSession.clearActiveSession() + SparkSession.clearDefaultSession() } finally { super.afterEach() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala index a3ae93810aa3c..d305ce3e698ae 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala @@ -21,15 +21,13 @@ import java.io.{ByteArrayInputStream, ByteArrayOutputStream, File} import java.util.Properties import org.apache.spark._ -import org.apache.spark.executor.TaskMetrics import org.apache.spark.memory.TaskMemoryManager import org.apache.spark.rdd.RDD -import org.apache.spark.sql.Row +import org.apache.spark.sql.{LocalSparkSession, Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow} import org.apache.spark.sql.types._ import org.apache.spark.storage.ShuffleBlockId -import org.apache.spark.util.Utils import org.apache.spark.util.collection.ExternalSorter /** @@ -43,7 +41,7 @@ class ClosableByteArrayInputStream(buf: Array[Byte]) extends ByteArrayInputStrea } } -class UnsafeRowSerializerSuite extends SparkFunSuite with LocalSparkContext { +class UnsafeRowSerializerSuite extends SparkFunSuite with LocalSparkSession { private def toUnsafeRow(row: Row, schema: Array[DataType]): UnsafeRow = { val converter = unsafeRowConverter(schema) @@ -58,7 +56,7 @@ class UnsafeRowSerializerSuite extends SparkFunSuite with LocalSparkContext { } test("toUnsafeRow() test helper method") { - // This currently doesnt work because the generic getter throws an exception. + // This currently doesn't work because the generic getter throws an exception. 
val row = Row("Hello", 123) val unsafeRow = toUnsafeRow(row, Array(StringType, IntegerType)) assert(row.getString(0) === unsafeRow.getUTF8String(0).toString) @@ -97,59 +95,43 @@ class UnsafeRowSerializerSuite extends SparkFunSuite with LocalSparkContext { } test("SPARK-10466: external sorter spilling with unsafe row serializer") { - var sc: SparkContext = null - var outputFile: File = null - val oldEnv = SparkEnv.get // save the old SparkEnv, as it will be overwritten - Utils.tryWithSafeFinally { - val conf = new SparkConf() - .set("spark.shuffle.spill.initialMemoryThreshold", "1") - .set("spark.shuffle.sort.bypassMergeThreshold", "0") - .set("spark.testing.memory", "80000") - - sc = new SparkContext("local", "test", conf) - outputFile = File.createTempFile("test-unsafe-row-serializer-spill", "") - // prepare data - val converter = unsafeRowConverter(Array(IntegerType)) - val data = (1 to 10000).iterator.map { i => - (i, converter(Row(i))) - } - val taskMemoryManager = new TaskMemoryManager(sc.env.memoryManager, 0) - val taskContext = new TaskContextImpl(0, 0, 0, 0, 0, taskMemoryManager, new Properties, null) - - val sorter = new ExternalSorter[Int, UnsafeRow, UnsafeRow]( - taskContext, - partitioner = Some(new HashPartitioner(10)), - serializer = new UnsafeRowSerializer(numFields = 1)) - - // Ensure we spilled something and have to merge them later - assert(sorter.numSpills === 0) - sorter.insertAll(data) - assert(sorter.numSpills > 0) + val conf = new SparkConf() + .set("spark.shuffle.spill.initialMemoryThreshold", "1") + .set("spark.shuffle.sort.bypassMergeThreshold", "0") + .set("spark.testing.memory", "80000") + spark = SparkSession.builder().master("local").appName("test").config(conf).getOrCreate() + val outputFile = File.createTempFile("test-unsafe-row-serializer-spill", "") + outputFile.deleteOnExit() + // prepare data + val converter = unsafeRowConverter(Array(IntegerType)) + val data = (1 to 10000).iterator.map { i => + (i, converter(Row(i))) + } + val taskMemoryManager = new TaskMemoryManager(spark.sparkContext.env.memoryManager, 0) + val taskContext = new TaskContextImpl(0, 0, 0, 0, 0, taskMemoryManager, new Properties, null) - // Merging spilled files should not throw assertion error - sorter.writePartitionedFile(ShuffleBlockId(0, 0, 0), outputFile) - } { - // Clean up - if (sc != null) { - sc.stop() - } + val sorter = new ExternalSorter[Int, UnsafeRow, UnsafeRow]( + taskContext, + partitioner = Some(new HashPartitioner(10)), + serializer = new UnsafeRowSerializer(numFields = 1)) - // restore the spark env - SparkEnv.set(oldEnv) + // Ensure we spilled something and have to merge them later + assert(sorter.numSpills === 0) + sorter.insertAll(data) + assert(sorter.numSpills > 0) - if (outputFile != null) { - outputFile.delete() - } - } + // Merging spilled files should not throw assertion error + sorter.writePartitionedFile(ShuffleBlockId(0, 0, 0), outputFile) } test("SPARK-10403: unsafe row serializer with SortShuffleManager") { val conf = new SparkConf().set("spark.shuffle.manager", "sort") - sc = new SparkContext("local", "test", conf) + spark = SparkSession.builder().master("local").appName("test").config(conf).getOrCreate() val row = Row("Hello", 123) val unsafeRow = toUnsafeRow(row, Array(StringType, IntegerType)) - val rowsRDD = sc.parallelize(Seq((0, unsafeRow), (1, unsafeRow), (0, unsafeRow))) - .asInstanceOf[RDD[Product2[Int, InternalRow]]] + val rowsRDD = spark.sparkContext.parallelize( + Seq((0, unsafeRow), (1, unsafeRow), (0, unsafeRow)) + 
).asInstanceOf[RDD[Product2[Int, InternalRow]]] val dependency = new ShuffleDependency[Int, InternalRow, InternalRow]( rowsRDD, From 63e1da16294e02affa27ab43a3ef0ae62e0c7672 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 12 Jun 2018 09:56:35 -0700 Subject: [PATCH 493/566] [SPARK-24531][TESTS] Remove version 2.2.0 from testing versions in HiveExternalCatalogVersionsSuite ## What changes were proposed in this pull request? Removing version 2.2.0 from testing versions in HiveExternalCatalogVersionsSuite as it is not present anymore in the mirrors and this is blocking all the open PRs. ## How was this patch tested? running UTs Author: Marco Gaido Closes #21540 from mgaido91/SPARK-24531. (cherry picked from commit 2824f1436bb0371b7216730455f02456ef8479ce) Signed-off-by: Xiao Li --- .../spark/sql/hive/HiveExternalCatalogVersionsSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index ea86ab9772bc7..6f904c937348d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -195,7 +195,7 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { object PROCESS_TABLES extends QueryTest with SQLTestUtils { // Tests the latest version of every release line. - val testingVersions = Seq("2.0.2", "2.1.2", "2.2.0", "2.2.1", "2.3.0") + val testingVersions = Seq("2.0.2", "2.1.2", "2.2.1", "2.3.0") protected var spark: SparkSession = _ From a55de387db901975e68e88c68ec464d49f240270 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 12 Jun 2018 16:42:44 -0700 Subject: [PATCH 494/566] [SPARK-24506][UI] Add UI filters to tabs added after binding ## What changes were proposed in this pull request? Currently, `spark.ui.filters` are not applied to the handlers added after binding the server. This means that every page which is added after starting the UI will not have the filters configured on it. This can allow unauthorized access to the pages. The PR adds the filters also to the handlers added after the UI starts. ## How was this patch tested? manual tests (without the patch, starting the thriftserver with `--conf spark.ui.filters=org.apache.hadoop.security.authentication.server.AuthenticationFilter --conf spark.org.apache.hadoop.security.authentication.server.AuthenticationFilter.params="type=simple"` you can access `http://localhost:4040/sqlserver`; with the patch, 401 is the response as for the other pages). Author: Marco Gaido Closes #21523 from mgaido91/SPARK-24506. 
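For reference, the manual test configuration quoted above could be expressed programmatically roughly as follows (a sketch only: the filter class and its `params` value are taken from the description, everything else is assumed).

```
import org.apache.spark.SparkConf

// UI filter configuration corresponding to the manual test in the description.
val authFilter = "org.apache.hadoop.security.authentication.server.AuthenticationFilter"
val conf = new SparkConf()
  .set("spark.ui.filters", authFilter)
  .set(s"spark.$authFilter.params", "type=simple")
```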
(cherry picked from commit f53818d35bdef5d20a2718b14a2fed4c468545c6) Signed-off-by: Marcelo Vanzin --- .../org/apache/spark/deploy/history/HistoryServer.scala | 1 - core/src/main/scala/org/apache/spark/ui/JettyUtils.scala | 8 +++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index 611fa563a7cd9..06540420399e0 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -152,7 +152,6 @@ class HistoryServer( assert(serverInfo.isDefined, "HistoryServer must be bound before attaching SparkUIs") handlers.synchronized { ui.getHandlers.foreach(attachHandler) - addFilters(ui.getHandlers, conf) } } diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala index 0e8a6307de6a8..ba98fa1548167 100644 --- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala @@ -263,7 +263,7 @@ private[spark] object JettyUtils extends Logging { filters.foreach { case filter : String => if (!filter.isEmpty) { - logInfo("Adding filter: " + filter) + logInfo(s"Adding filter $filter to ${handlers.map(_.getContextPath).mkString(", ")}.") val holder : FilterHolder = new FilterHolder() holder.setClassName(filter) // Get any parameters for each filter @@ -406,7 +406,7 @@ private[spark] object JettyUtils extends Logging { } pool.setMaxThreads(math.max(pool.getMaxThreads, minThreads)) - ServerInfo(server, httpPort, securePort, collection) + ServerInfo(server, httpPort, securePort, conf, collection) } catch { case e: Exception => server.stop() @@ -506,10 +506,12 @@ private[spark] case class ServerInfo( server: Server, boundPort: Int, securePort: Option[Int], + conf: SparkConf, private val rootHandler: ContextHandlerCollection) { - def addHandler(handler: ContextHandler): Unit = { + def addHandler(handler: ServletContextHandler): Unit = { handler.setVirtualHosts(JettyUtils.toVirtualHosts(JettyUtils.SPARK_CONNECTOR_NAME)) + JettyUtils.addFilters(Seq(handler), conf) rootHandler.addHandler(handler) if (!handler.isStarted()) { handler.start() From 470cacd4982ca369ffd294ee37abfa1864d39967 Mon Sep 17 00:00:00 2001 From: edorigatti Date: Wed, 13 Jun 2018 09:06:06 +0800 Subject: [PATCH 495/566] [SPARK-23754][PYTHON][FOLLOWUP][BACKPORT-2.3] Move UDF stop iteration wrapping from driver to executor SPARK-23754 was fixed in #21383 by changing the UDF code to wrap the user function, but this required a hack to save its argspec. This PR reverts this change and fixes the `StopIteration` bug in the worker. The root of the problem is that when an user-supplied function raises a `StopIteration`, pyspark might stop processing data, if this function is used in a for-loop. The solution is to catch `StopIteration`s exceptions and re-raise them as `RuntimeError`s, so that the execution fails and the error is reported to the user. This is done using the `fail_on_stopiteration` wrapper, in different ways depending on where the function is used: - In RDDs, the user function is wrapped in the driver, because this function is also called in the driver itself. - In SQL UDFs, the function is wrapped in the worker, since all processing happens there. 
Moreover, the worker needs the signature of the user function, which is lost when wrapping it, but passing this signature to the worker requires a not so nice hack. HyukjinKwon Author: edorigatti Author: e-dorigatti Closes #21538 from e-dorigatti/branch-2.3. --- python/pyspark/sql/tests.py | 54 ++++++++++++++++++++++++++----------- python/pyspark/sql/udf.py | 4 +-- python/pyspark/tests.py | 37 ++++++++++++++----------- python/pyspark/util.py | 2 +- python/pyspark/worker.py | 11 +++++--- 5 files changed, 70 insertions(+), 38 deletions(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 818ba833a1433..aa7d8eba1f692 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -853,22 +853,6 @@ def __call__(self, x): self.assertEqual(f, f_.func) self.assertEqual(return_type, f_.returnType) - def test_stopiteration_in_udf(self): - # test for SPARK-23754 - from pyspark.sql.functions import udf - from py4j.protocol import Py4JJavaError - - def foo(x): - raise StopIteration() - - with self.assertRaises(Py4JJavaError) as cm: - self.spark.range(0, 1000).withColumn('v', udf(foo)('id')).show() - - self.assertIn( - "Caught StopIteration thrown from user's code; failing the task", - cm.exception.java_exception.toString() - ) - def test_validate_column_types(self): from pyspark.sql.functions import udf, to_json from pyspark.sql.column import _to_java_column @@ -3917,6 +3901,44 @@ def foo(df): def foo(k, v): return k + def test_stopiteration_in_udf(self): + from pyspark.sql.functions import udf, pandas_udf, PandasUDFType + from py4j.protocol import Py4JJavaError + + def foo(x): + raise StopIteration() + + def foofoo(x, y): + raise StopIteration() + + exc_message = "Caught StopIteration thrown from user's code; failing the task" + df = self.spark.range(0, 100) + + # plain udf (test for SPARK-23754) + self.assertRaisesRegexp( + Py4JJavaError, + exc_message, + df.withColumn('v', udf(foo)('id')).collect + ) + + # pandas scalar udf + self.assertRaisesRegexp( + Py4JJavaError, + exc_message, + df.withColumn( + 'v', pandas_udf(foo, 'double', PandasUDFType.SCALAR)('id') + ).collect + ) + + # pandas grouped map + self.assertRaisesRegexp( + Py4JJavaError, + exc_message, + df.groupBy('id').apply( + pandas_udf(foo, df.schema, PandasUDFType.GROUPED_MAP) + ).collect + ) + @unittest.skipIf( not _have_pandas or not _have_pyarrow, diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 7d813af15cb6b..671e5680b8e7b 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -24,7 +24,6 @@ from pyspark.sql.column import Column, _to_java_column, _to_seq from pyspark.sql.types import StringType, DataType, StructType, _parse_datatype_string, \ to_arrow_type, to_arrow_schema -from pyspark.util import fail_on_stopiteration __all__ = ["UDFRegistration"] @@ -155,8 +154,7 @@ def _create_judf(self): spark = SparkSession.builder.getOrCreate() sc = spark.sparkContext - func = fail_on_stopiteration(self.func) - wrapped_func = _wrap_function(sc, func, self.returnType) + wrapped_func = _wrap_function(sc, self.func, self.returnType) jdt = spark._jsparkSession.parseDataType(self.returnType.json()) judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction( self._name, wrapped_func, jdt, self.evalType, self.deterministic) diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index af394504b1d7a..81bff4b253586 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -1270,27 +1270,34 @@ def test_pipe_functions(self): 
self.assertRaises(Py4JJavaError, rdd.pipe('grep 4', checkCode=True).collect) self.assertEqual([], rdd.pipe('grep 4').collect()) - def test_stopiteration_in_client_code(self): + def test_stopiteration_in_user_code(self): def stopit(*x): raise StopIteration() seq_rdd = self.sc.parallelize(range(10)) keyed_rdd = self.sc.parallelize((x % 2, x) for x in range(10)) - - self.assertRaises(Py4JJavaError, seq_rdd.map(stopit).collect) - self.assertRaises(Py4JJavaError, seq_rdd.filter(stopit).collect) - self.assertRaises(Py4JJavaError, seq_rdd.cartesian(seq_rdd).flatMap(stopit).collect) - self.assertRaises(Py4JJavaError, seq_rdd.foreach, stopit) - self.assertRaises(Py4JJavaError, keyed_rdd.reduceByKeyLocally, stopit) - self.assertRaises(Py4JJavaError, seq_rdd.reduce, stopit) - self.assertRaises(Py4JJavaError, seq_rdd.fold, 0, stopit) - - # the exception raised is non-deterministic - self.assertRaises((Py4JJavaError, RuntimeError), - seq_rdd.aggregate, 0, stopit, lambda *x: 1) - self.assertRaises((Py4JJavaError, RuntimeError), - seq_rdd.aggregate, 0, lambda *x: 1, stopit) + msg = "Caught StopIteration thrown from user's code; failing the task" + + self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.map(stopit).collect) + self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.filter(stopit).collect) + self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.foreach, stopit) + self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.reduce, stopit) + self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.fold, 0, stopit) + self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.foreach, stopit) + self.assertRaisesRegexp(Py4JJavaError, msg, + seq_rdd.cartesian(seq_rdd).flatMap(stopit).collect) + + # these methods call the user function both in the driver and in the executor + # the exception raised is different according to where the StopIteration happens + # RuntimeError is raised if in the driver + # Py4JJavaError is raised if in the executor (wraps the RuntimeError raised in the worker) + self.assertRaisesRegexp((Py4JJavaError, RuntimeError), msg, + keyed_rdd.reduceByKeyLocally, stopit) + self.assertRaisesRegexp((Py4JJavaError, RuntimeError), msg, + seq_rdd.aggregate, 0, stopit, lambda *x: 1) + self.assertRaisesRegexp((Py4JJavaError, RuntimeError), msg, + seq_rdd.aggregate, 0, lambda *x: 1, stopit) class ProfilerTests(PySparkTestCase): diff --git a/python/pyspark/util.py b/python/pyspark/util.py index 83d528f0e31f8..94f51eec9d71a 100644 --- a/python/pyspark/util.py +++ b/python/pyspark/util.py @@ -48,7 +48,7 @@ def _exception_message(excp): def fail_on_stopiteration(f): """ Wraps the input function to fail on 'StopIteration' by raising a 'RuntimeError' - prevents silent loss of data when 'f' is used in a for loop + prevents silent loss of data when 'f' is used in a for loop in Spark code """ def wrapper(*args, **kwargs): try: diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 44e9106a23523..788b3237e1799 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -35,6 +35,7 @@ write_long, read_int, SpecialLengths, UTF8Deserializer, PickleSerializer, \ BatchedSerializer, ArrowStreamPandasSerializer from pyspark.sql.types import to_arrow_type +from pyspark.util import fail_on_stopiteration from pyspark import shuffle pickleSer = PickleSerializer() @@ -122,13 +123,17 @@ def read_single_udf(pickleSer, infile, eval_type): else: row_func = chain(row_func, f) + # make sure StopIteration's raised in the user code are not ignored + # when they are processed in a for loop, raise them as 
RuntimeError's instead + func = fail_on_stopiteration(row_func) + # the last returnType will be the return type of UDF if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF: - return arg_offsets, wrap_scalar_pandas_udf(row_func, return_type) + return arg_offsets, wrap_scalar_pandas_udf(func, return_type) elif eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: - return arg_offsets, wrap_grouped_map_pandas_udf(row_func, return_type) + return arg_offsets, wrap_grouped_map_pandas_udf(func, return_type) else: - return arg_offsets, wrap_udf(row_func, return_type) + return arg_offsets, wrap_udf(func, return_type) def read_udfs(pickleSer, infile, eval_type): From a2f65eb794c123e967e45f00f8b74dff24d2a28c Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Thu, 14 Jun 2018 14:20:48 +0800 Subject: [PATCH 496/566] [MINOR][CORE][TEST] Remove unnecessary sort in UnsafeInMemorySorterSuite ## What changes were proposed in this pull request? We don't require specific ordering of the input data, the sort action is not necessary and misleading. ## How was this patch tested? Existing test suite. Author: Xingbo Jiang Closes #21536 from jiangxb1987/sorterSuite. (cherry picked from commit 534065efeb51ff0d308fa6cc9dea0715f8ce25ad) Signed-off-by: hyukjinkwon --- .../util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java index c145532328514..85ffdca436e14 100644 --- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java +++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java @@ -129,7 +129,6 @@ public int compare( final UnsafeSorterIterator iter = sorter.getSortedIterator(); int iterLength = 0; long prevPrefix = -1; - Arrays.sort(dataToSort); while (iter.hasNext()) { iter.loadNext(); final String str = From e6bf325de1ea101264925deadb0476a65059e04a Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Thu, 14 Jun 2018 09:20:41 -0700 Subject: [PATCH 497/566] [SPARK-24495][SQL] EnsureRequirement returns wrong plan when reordering equal keys `EnsureRequirement` in its `reorder` method currently assumes that the same key appears only once in the join condition. This of course might not be the case, and when it is not satisfied, it returns a wrong plan which produces a wrong result of the query. added UT Author: Marco Gaido Closes #21529 from mgaido91/SPARK-24495. 
(cherry picked from commit fdadc4be08dcf1a06383bbb05e53540da2092c63) Signed-off-by: Xiao Li --- .../execution/exchange/EnsureRequirements.scala | 14 ++++++++++++-- .../scala/org/apache/spark/sql/JoinSuite.scala | 11 +++++++++++ .../spark/sql/execution/PlannerSuite.scala | 17 +++++++++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index e3d28388c5470..ad95879d86f42 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution.exchange +import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.catalyst.expressions._ @@ -227,9 +228,16 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { currentOrderOfKeys: Seq[Expression]): (Seq[Expression], Seq[Expression]) = { val leftKeysBuffer = ArrayBuffer[Expression]() val rightKeysBuffer = ArrayBuffer[Expression]() + val pickedIndexes = mutable.Set[Int]() + val keysAndIndexes = currentOrderOfKeys.zipWithIndex expectedOrderOfKeys.foreach(expression => { - val index = currentOrderOfKeys.indexWhere(e => e.semanticEquals(expression)) + val index = keysAndIndexes.find { case (e, idx) => + // As we may have the same key used many times, we need to filter out its occurrence we + // have already used. + e.semanticEquals(expression) && !pickedIndexes.contains(idx) + }.map(_._2).get + pickedIndexes += index leftKeysBuffer.append(leftKeys(index)) rightKeysBuffer.append(rightKeys(index)) }) @@ -270,7 +278,7 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { * partitioning of the join nodes' children. 
*/ private def reorderJoinPredicates(plan: SparkPlan): SparkPlan = { - plan.transformUp { + plan match { case BroadcastHashJoinExec(leftKeys, rightKeys, joinType, buildSide, condition, left, right) => val (reorderedLeftKeys, reorderedRightKeys) = @@ -288,6 +296,8 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { val (reorderedLeftKeys, reorderedRightKeys) = reorderJoinKeys(leftKeys, rightKeys, left.outputPartitioning, right.outputPartitioning) SortMergeJoinExec(reorderedLeftKeys, reorderedRightKeys, joinType, condition, left, right) + + case other => other } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index 8fa747465cb1a..44767dfc92497 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -882,4 +882,15 @@ class JoinSuite extends QueryTest with SharedSQLContext { checkAnswer(df, Row(3, 8, 7, 2) :: Row(3, 8, 4, 2) :: Nil) } } + + test("SPARK-24495: Join may return wrong result when having duplicated equal-join keys") { + withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1", + SQLConf.CONSTRAINT_PROPAGATION_ENABLED.key -> "false", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + val df1 = spark.range(0, 100, 1, 2) + val df2 = spark.range(100).select($"id".as("b1"), (- $"id").as("b2")) + val res = df1.join(df2, $"id" === $"b1" && $"id" === $"b2").select($"b1", $"b2", $"id") + checkAnswer(res, Row(0, 0, 0)) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index dfbc0346cb247..4d3b2f6883e05 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -629,6 +629,23 @@ class PlannerSuite extends SharedSQLContext { requiredOrdering = Seq(orderingA, orderingB), shouldHaveSort = true) } + + test("SPARK-24495: EnsureRequirements can return wrong plan when reusing the same key in join") { + val plan1 = DummySparkPlan(outputOrdering = Seq(orderingA), + outputPartitioning = HashPartitioning(exprA :: exprA :: Nil, 5)) + val plan2 = DummySparkPlan(outputOrdering = Seq(orderingB), + outputPartitioning = HashPartitioning(exprB :: Nil, 5)) + val smjExec = SortMergeJoinExec( + exprA :: exprA :: Nil, exprB :: exprC :: Nil, Inner, None, plan1, plan2) + + val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(smjExec) + outputPlan match { + case SortMergeJoinExec(leftKeys, rightKeys, _, _, _, _) => + assert(leftKeys == Seq(exprA, exprA)) + assert(rightKeys == Seq(exprB, exprC)) + case _ => fail() + } + } } // Used for unit-testing EnsureRequirements From 7f1708a44759724b116742683e2d4290362a3b59 Mon Sep 17 00:00:00 2001 From: Ruben Berenguel Montoro Date: Fri, 15 Jun 2018 16:59:00 +0800 Subject: [PATCH 498/566] [PYTHON] Fix typo in serializer exception ## What changes were proposed in this pull request? Fix typo in exception raised in Python serializer ## How was this patch tested? No code changes Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Ruben Berenguel Montoro Closes #21566 from rberenguel/fix_typo_pyspark_serializers. 
(cherry picked from commit 6567fc43aca75b41900cde976594e21c8b0ca98a) Signed-off-by: hyukjinkwon --- python/pyspark/serializers.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 91a7f093cec19..6d107f3069dc5 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -33,8 +33,9 @@ [0, 2, 4, 6, 8, 10, 12, 14, 16, 18] >>> sc.stop() -PySpark serialize objects in batches; By default, the batch size is chosen based -on the size of objects, also configurable by SparkContext's C{batchSize} parameter: +PySpark serializes objects in batches; by default, the batch size is chosen based +on the size of objects and is also configurable by SparkContext's C{batchSize} +parameter: >>> sc = SparkContext('local', 'test', batchSize=2) >>> rdd = sc.parallelize(range(16), 4).map(lambda x: x) @@ -99,7 +100,7 @@ def load_stream(self, stream): def _load_stream_without_unbatching(self, stream): """ Return an iterator of deserialized batches (iterable) of objects from the input stream. - if the serializer does not operate on batches the default implementation returns an + If the serializer does not operate on batches the default implementation returns an iterator of single element lists. """ return map(lambda x: [x], self.load_stream(stream)) @@ -456,7 +457,7 @@ def dumps(self, obj): return obj -# Hook namedtuple, make it picklable +# Hack namedtuple, make it picklable __cls = {} @@ -520,15 +521,15 @@ def namedtuple(*args, **kwargs): cls = _old_namedtuple(*args, **kwargs) return _hack_namedtuple(cls) - # replace namedtuple with new one + # replace namedtuple with the new one collections.namedtuple.__globals__["_old_namedtuple_kwdefaults"] = _old_namedtuple_kwdefaults collections.namedtuple.__globals__["_old_namedtuple"] = _old_namedtuple collections.namedtuple.__globals__["_hack_namedtuple"] = _hack_namedtuple collections.namedtuple.__code__ = namedtuple.__code__ collections.namedtuple.__hijack = 1 - # hack the cls already generated by namedtuple - # those created in other module can be pickled as normal, + # hack the cls already generated by namedtuple. + # Those created in other modules can be pickled as normal, # so only hack those in __main__ module for n, o in sys.modules["__main__"].__dict__.items(): if (type(o) is type and o.__base__ is tuple @@ -611,7 +612,7 @@ def loads(self, obj): elif _type == b'P': return pickle.loads(obj[1:]) else: - raise ValueError("invalid sevialization type: %s" % _type) + raise ValueError("invalid serialization type: %s" % _type) class CompressedSerializer(FramedSerializer): From d3255a57109a5cea79948aa4192008b988961aa3 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 15 Jun 2018 14:33:17 +0200 Subject: [PATCH 499/566] revert [SPARK-21743][SQL] top-most limit should not cause memory leak ## What changes were proposed in this pull request? There is a performance regression in Spark 2.3. When we read a big compressed text file which is un-splittable(e.g. gz), and then take the first record, Spark will scan all the data in the text file which is very slow. For example, `spark.read.text("/tmp/test.csv.gz").head(1)`, we can check out the SQL UI and see that the file is fully scanned. ![image](https://user-images.githubusercontent.com/3182036/41445252-264b1e5a-6ffd-11e8-9a67-4c31d129a314.png) This is introduced by #18955 , which adds a LocalLimit to the query when executing `Dataset.head`. 
The foundamental problem is, `Limit` is not well whole-stage-codegened. It keeps consuming the input even if we have already hit the limitation. However, if we just fix LIMIT whole-stage-codegen, the memory leak test will fail, as we don't fully consume the inputs to trigger the resource cleanup. To fix it completely, we should do the following 1. fix LIMIT whole-stage-codegen, stop consuming inputs after hitting the limitation. 2. in whole-stage-codegen, provide a way to release resource of the parant operator, and apply it in LIMIT 3. automatically release resource when task ends. Howere this is a non-trivial change, and is risky to backport to Spark 2.3. This PR proposes to revert #18955 in Spark 2.3. The memory leak is not a big issue. When task ends, Spark will release all the pages allocated by this task, which is kind of releasing most of the resources. I'll submit a exhaustive fix to master later. ## How was this patch tested? N/A Author: Wenchen Fan Closes #21573 from cloud-fan/limit. --- .../org/apache/spark/sql/execution/SparkStrategies.scala | 7 +------ .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 5 ----- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index c6565fcf66559..a0a641bc9667e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -70,12 +70,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case Limit(IntegerLiteral(limit), Project(projectList, Sort(order, true, child))) => TakeOrderedAndProjectExec(limit, order, projectList, planLater(child)) :: Nil case Limit(IntegerLiteral(limit), child) => - // With whole stage codegen, Spark releases resources only when all the output data of the - // query plan are consumed. It's possible that `CollectLimitExec` only consumes a little - // data from child plan and finishes the query without releasing resources. Here we wrap - // the child plan with `LocalLimitExec`, to stop the processing of whole stage codegen and - // trigger the resource releasing work, after we consume `limit` rows. - CollectLimitExec(limit, LocalLimitExec(limit, planLater(child))) :: Nil + CollectLimitExec(limit, planLater(child)) :: Nil case other => planLater(other) :: Nil } case Limit(IntegerLiteral(limit), Sort(order, true, child)) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index ebebf62ca4cfa..bc57efeef69c1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2720,11 +2720,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } - test("SPARK-21743: top-most limit should not cause memory leak") { - // In unit test, Spark will fail the query if memory leak detected. 
- spark.range(100).groupBy("id").count().limit(1).collect() - } - test("SPARK-21652: rule confliction of InferFiltersFromConstraints and ConstantPropagation") { withTempView("t1", "t2") { Seq((1, 1)).toDF("col1", "col2").createOrReplaceTempView("t1") From a7d378e78d73503d4d1ad37d94641200a9ea1b2d Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Wed, 13 Jun 2018 15:18:19 -0700 Subject: [PATCH 500/566] [SPARK-24531][TESTS] Replace 2.3.0 version with 2.3.1 The PR updates the 2.3 version tested to the new release 2.3.1. existing UTs Author: Marco Gaido Closes #21543 from mgaido91/patch-1. (cherry picked from commit 3bf76918fb67fb3ee9aed254d4fb3b87a7e66117) Signed-off-by: Marcelo Vanzin --- .../spark/sql/hive/HiveExternalCatalogVersionsSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index 6f904c937348d..514921875f1f9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -195,7 +195,7 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { object PROCESS_TABLES extends QueryTest with SQLTestUtils { // Tests the latest version of every release line. - val testingVersions = Seq("2.0.2", "2.1.2", "2.2.1", "2.3.0") + val testingVersions = Seq("2.0.2", "2.1.2", "2.2.1", "2.3.1") protected var spark: SparkSession = _ From d42610440ac2e58ef77fcf42ad81ee4fdf5691ba Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Fri, 15 Jun 2018 13:47:48 -0700 Subject: [PATCH 501/566] [SPARK-24452][SQL][CORE] Avoid possible overflow in int add or multiple This PR fixes possible overflow in int add or multiply. In particular, their overflows in multiply are detected by [Spotbugs](https://spotbugs.github.io/) The following assignments may cause overflow in right hand side. As a result, the result may be negative. ``` long = int * int long = int + int ``` To avoid this problem, this PR performs cast from int to long in right hand side. Existing UTs. Author: Kazuaki Ishizaki Closes #21481 from kiszk/SPARK-24452. (cherry picked from commit 90da7dc241f8eec2348c0434312c97c116330bc4) Signed-off-by: Wenchen Fan --- .../spark/unsafe/map/BytesToBytesMap.java | 2 +- .../spark/deploy/worker/DriverRunner.scala | 2 +- .../apache/spark/rdd/AsyncRDDActions.scala | 2 +- .../apache/spark/storage/BlockManager.scala | 2 +- .../catalyst/expressions/UnsafeArrayData.java | 14 +-- .../VariableLengthRowBasedKeyValueBatch.java | 2 +- .../vectorized/OffHeapColumnVector.java | 106 +++++++++--------- .../vectorized/OnHeapColumnVector.java | 10 +- .../spark/sql/hive/client/HiveShim.scala | 2 +- .../util/FileBasedWriteAheadLog.scala | 2 +- 10 files changed, 72 insertions(+), 72 deletions(-) diff --git a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java index 5f0045507aaab..9a767dd739b91 100644 --- a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java +++ b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java @@ -703,7 +703,7 @@ public boolean append(Object kbase, long koff, int klen, Object vbase, long voff // must be stored in the same memory page. 
// (8 byte key length) (key) (value) (8 byte pointer to next value) int uaoSize = UnsafeAlignedOffset.getUaoSize(); - final long recordLength = (2 * uaoSize) + klen + vlen + 8; + final long recordLength = (2L * uaoSize) + klen + vlen + 8; if (currentPage == null || currentPage.size() - pageCursor < recordLength) { if (!acquireNewPage(recordLength + uaoSize)) { return false; diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala index 58a181128eb4d..a6d13d12fc28d 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala @@ -225,7 +225,7 @@ private[deploy] class DriverRunner( // check if attempting another run keepTrying = supervise && exitCode != 0 && !killed if (keepTrying) { - if (clock.getTimeMillis() - processStart > successfulRunDuration * 1000) { + if (clock.getTimeMillis() - processStart > successfulRunDuration * 1000L) { waitSeconds = 1 } logInfo(s"Command exited with status $exitCode, re-launching after $waitSeconds s.") diff --git a/core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala b/core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala index c9ed12f4e1bd4..47669a0aeb478 100644 --- a/core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala @@ -95,7 +95,7 @@ class AsyncRDDActions[T: ClassTag](self: RDD[T]) extends Serializable with Loggi // the left side of max is >=1 whenever partsScanned >= 2 numPartsToTry = Math.max(1, (1.5 * num * partsScanned / results.size).toInt - partsScanned) - numPartsToTry = Math.min(numPartsToTry, partsScanned * 4) + numPartsToTry = Math.min(numPartsToTry, partsScanned * 4L) } } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index e0276a4dc4224..df1a4bef616b2 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -291,7 +291,7 @@ private[spark] class BlockManager( case e: Exception if i < MAX_ATTEMPTS => logError(s"Failed to connect to external shuffle server, will retry ${MAX_ATTEMPTS - i}" + s" more times after waiting $SLEEP_TIME_SECS seconds...", e) - Thread.sleep(SLEEP_TIME_SECS * 1000) + Thread.sleep(SLEEP_TIME_SECS * 1000L) case NonFatal(e) => throw new SparkException("Unable to register with external shuffle server due to : " + e.getMessage, e) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java index d18542b188f71..bf7b98a62998e 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java @@ -72,7 +72,7 @@ public static int calculateHeaderPortionInBytes(int numFields) { private long elementOffset; private long getElementOffset(int ordinal, int elementSize) { - return elementOffset + ordinal * elementSize; + return elementOffset + ordinal * (long)elementSize; } public Object getBaseObject() { return baseObject; } @@ -402,7 +402,7 @@ public byte[] toByteArray() { public short[] toShortArray() { short[] values = new short[numElements]; Platform.copyMemory( - baseObject, elementOffset, values, 
Platform.SHORT_ARRAY_OFFSET, numElements * 2); + baseObject, elementOffset, values, Platform.SHORT_ARRAY_OFFSET, numElements * 2L); return values; } @@ -410,7 +410,7 @@ public short[] toShortArray() { public int[] toIntArray() { int[] values = new int[numElements]; Platform.copyMemory( - baseObject, elementOffset, values, Platform.INT_ARRAY_OFFSET, numElements * 4); + baseObject, elementOffset, values, Platform.INT_ARRAY_OFFSET, numElements * 4L); return values; } @@ -418,7 +418,7 @@ public int[] toIntArray() { public long[] toLongArray() { long[] values = new long[numElements]; Platform.copyMemory( - baseObject, elementOffset, values, Platform.LONG_ARRAY_OFFSET, numElements * 8); + baseObject, elementOffset, values, Platform.LONG_ARRAY_OFFSET, numElements * 8L); return values; } @@ -426,7 +426,7 @@ public long[] toLongArray() { public float[] toFloatArray() { float[] values = new float[numElements]; Platform.copyMemory( - baseObject, elementOffset, values, Platform.FLOAT_ARRAY_OFFSET, numElements * 4); + baseObject, elementOffset, values, Platform.FLOAT_ARRAY_OFFSET, numElements * 4L); return values; } @@ -434,14 +434,14 @@ public float[] toFloatArray() { public double[] toDoubleArray() { double[] values = new double[numElements]; Platform.copyMemory( - baseObject, elementOffset, values, Platform.DOUBLE_ARRAY_OFFSET, numElements * 8); + baseObject, elementOffset, values, Platform.DOUBLE_ARRAY_OFFSET, numElements * 8L); return values; } private static UnsafeArrayData fromPrimitiveArray( Object arr, int offset, int length, int elementSize) { final long headerInBytes = calculateHeaderPortionInBytes(length); - final long valueRegionInBytes = elementSize * length; + final long valueRegionInBytes = (long)elementSize * length; final long totalSizeInLongs = (headerInBytes + valueRegionInBytes + 7) / 8; if (totalSizeInLongs > Integer.MAX_VALUE / 8) { throw new UnsupportedOperationException("Cannot convert this array to unsafe format as " + diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/VariableLengthRowBasedKeyValueBatch.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/VariableLengthRowBasedKeyValueBatch.java index 905e6820ce6e2..c823de4810f2b 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/VariableLengthRowBasedKeyValueBatch.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/VariableLengthRowBasedKeyValueBatch.java @@ -41,7 +41,7 @@ public final class VariableLengthRowBasedKeyValueBatch extends RowBasedKeyValueB @Override public UnsafeRow appendRow(Object kbase, long koff, int klen, Object vbase, long voff, int vlen) { - final long recordLength = 8 + klen + vlen + 8; + final long recordLength = 8L + klen + vlen + 8; // if run out of max supported rows or page size, return null if (numRows >= capacity || page == null || page.size() - pageCursor < recordLength) { return null; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index 754c26579ff08..5e0cf7d370dd1 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -215,12 +215,12 @@ protected UTF8String getBytesAsUTF8String(int rowId, int count) { @Override public void putShort(int rowId, short value) { - Platform.putShort(null, data + 2 * 
rowId, value); + Platform.putShort(null, data + 2L * rowId, value); } @Override public void putShorts(int rowId, int count, short value) { - long offset = data + 2 * rowId; + long offset = data + 2L * rowId; for (int i = 0; i < count; ++i, offset += 2) { Platform.putShort(null, offset, value); } @@ -228,20 +228,20 @@ public void putShorts(int rowId, int count, short value) { @Override public void putShorts(int rowId, int count, short[] src, int srcIndex) { - Platform.copyMemory(src, Platform.SHORT_ARRAY_OFFSET + srcIndex * 2, - null, data + 2 * rowId, count * 2); + Platform.copyMemory(src, Platform.SHORT_ARRAY_OFFSET + srcIndex * 2L, + null, data + 2L * rowId, count * 2L); } @Override public void putShorts(int rowId, int count, byte[] src, int srcIndex) { Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET + srcIndex, - null, data + rowId * 2, count * 2); + null, data + rowId * 2L, count * 2L); } @Override public short getShort(int rowId) { if (dictionary == null) { - return Platform.getShort(null, data + 2 * rowId); + return Platform.getShort(null, data + 2L * rowId); } else { return (short) dictionary.decodeToInt(dictionaryIds.getDictId(rowId)); } @@ -251,7 +251,7 @@ public short getShort(int rowId) { public short[] getShorts(int rowId, int count) { assert(dictionary == null); short[] array = new short[count]; - Platform.copyMemory(null, data + rowId * 2, array, Platform.SHORT_ARRAY_OFFSET, count * 2); + Platform.copyMemory(null, data + rowId * 2L, array, Platform.SHORT_ARRAY_OFFSET, count * 2L); return array; } @@ -261,12 +261,12 @@ public short[] getShorts(int rowId, int count) { @Override public void putInt(int rowId, int value) { - Platform.putInt(null, data + 4 * rowId, value); + Platform.putInt(null, data + 4L * rowId, value); } @Override public void putInts(int rowId, int count, int value) { - long offset = data + 4 * rowId; + long offset = data + 4L * rowId; for (int i = 0; i < count; ++i, offset += 4) { Platform.putInt(null, offset, value); } @@ -274,24 +274,24 @@ public void putInts(int rowId, int count, int value) { @Override public void putInts(int rowId, int count, int[] src, int srcIndex) { - Platform.copyMemory(src, Platform.INT_ARRAY_OFFSET + srcIndex * 4, - null, data + 4 * rowId, count * 4); + Platform.copyMemory(src, Platform.INT_ARRAY_OFFSET + srcIndex * 4L, + null, data + 4L * rowId, count * 4L); } @Override public void putInts(int rowId, int count, byte[] src, int srcIndex) { Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET + srcIndex, - null, data + rowId * 4, count * 4); + null, data + rowId * 4L, count * 4L); } @Override public void putIntsLittleEndian(int rowId, int count, byte[] src, int srcIndex) { if (!bigEndianPlatform) { Platform.copyMemory(src, srcIndex + Platform.BYTE_ARRAY_OFFSET, - null, data + 4 * rowId, count * 4); + null, data + 4L * rowId, count * 4L); } else { int srcOffset = srcIndex + Platform.BYTE_ARRAY_OFFSET; - long offset = data + 4 * rowId; + long offset = data + 4L * rowId; for (int i = 0; i < count; ++i, offset += 4, srcOffset += 4) { Platform.putInt(null, offset, java.lang.Integer.reverseBytes(Platform.getInt(src, srcOffset))); @@ -302,7 +302,7 @@ public void putIntsLittleEndian(int rowId, int count, byte[] src, int srcIndex) @Override public int getInt(int rowId) { if (dictionary == null) { - return Platform.getInt(null, data + 4 * rowId); + return Platform.getInt(null, data + 4L * rowId); } else { return dictionary.decodeToInt(dictionaryIds.getDictId(rowId)); } @@ -312,7 +312,7 @@ public int getInt(int rowId) { public int[] 
getInts(int rowId, int count) { assert(dictionary == null); int[] array = new int[count]; - Platform.copyMemory(null, data + rowId * 4, array, Platform.INT_ARRAY_OFFSET, count * 4); + Platform.copyMemory(null, data + rowId * 4L, array, Platform.INT_ARRAY_OFFSET, count * 4L); return array; } @@ -324,7 +324,7 @@ public int[] getInts(int rowId, int count) { public int getDictId(int rowId) { assert(dictionary == null) : "A ColumnVector dictionary should not have a dictionary for itself."; - return Platform.getInt(null, data + 4 * rowId); + return Platform.getInt(null, data + 4L * rowId); } // @@ -333,12 +333,12 @@ public int getDictId(int rowId) { @Override public void putLong(int rowId, long value) { - Platform.putLong(null, data + 8 * rowId, value); + Platform.putLong(null, data + 8L * rowId, value); } @Override public void putLongs(int rowId, int count, long value) { - long offset = data + 8 * rowId; + long offset = data + 8L * rowId; for (int i = 0; i < count; ++i, offset += 8) { Platform.putLong(null, offset, value); } @@ -346,24 +346,24 @@ public void putLongs(int rowId, int count, long value) { @Override public void putLongs(int rowId, int count, long[] src, int srcIndex) { - Platform.copyMemory(src, Platform.LONG_ARRAY_OFFSET + srcIndex * 8, - null, data + 8 * rowId, count * 8); + Platform.copyMemory(src, Platform.LONG_ARRAY_OFFSET + srcIndex * 8L, + null, data + 8L * rowId, count * 8L); } @Override public void putLongs(int rowId, int count, byte[] src, int srcIndex) { Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET + srcIndex, - null, data + rowId * 8, count * 8); + null, data + rowId * 8L, count * 8L); } @Override public void putLongsLittleEndian(int rowId, int count, byte[] src, int srcIndex) { if (!bigEndianPlatform) { Platform.copyMemory(src, srcIndex + Platform.BYTE_ARRAY_OFFSET, - null, data + 8 * rowId, count * 8); + null, data + 8L * rowId, count * 8L); } else { int srcOffset = srcIndex + Platform.BYTE_ARRAY_OFFSET; - long offset = data + 8 * rowId; + long offset = data + 8L * rowId; for (int i = 0; i < count; ++i, offset += 8, srcOffset += 8) { Platform.putLong(null, offset, java.lang.Long.reverseBytes(Platform.getLong(src, srcOffset))); @@ -374,7 +374,7 @@ public void putLongsLittleEndian(int rowId, int count, byte[] src, int srcIndex) @Override public long getLong(int rowId) { if (dictionary == null) { - return Platform.getLong(null, data + 8 * rowId); + return Platform.getLong(null, data + 8L * rowId); } else { return dictionary.decodeToLong(dictionaryIds.getDictId(rowId)); } @@ -384,7 +384,7 @@ public long getLong(int rowId) { public long[] getLongs(int rowId, int count) { assert(dictionary == null); long[] array = new long[count]; - Platform.copyMemory(null, data + rowId * 8, array, Platform.LONG_ARRAY_OFFSET, count * 8); + Platform.copyMemory(null, data + rowId * 8L, array, Platform.LONG_ARRAY_OFFSET, count * 8L); return array; } @@ -394,12 +394,12 @@ public long[] getLongs(int rowId, int count) { @Override public void putFloat(int rowId, float value) { - Platform.putFloat(null, data + rowId * 4, value); + Platform.putFloat(null, data + rowId * 4L, value); } @Override public void putFloats(int rowId, int count, float value) { - long offset = data + 4 * rowId; + long offset = data + 4L * rowId; for (int i = 0; i < count; ++i, offset += 4) { Platform.putFloat(null, offset, value); } @@ -407,18 +407,18 @@ public void putFloats(int rowId, int count, float value) { @Override public void putFloats(int rowId, int count, float[] src, int srcIndex) { - 
Platform.copyMemory(src, Platform.FLOAT_ARRAY_OFFSET + srcIndex * 4, - null, data + 4 * rowId, count * 4); + Platform.copyMemory(src, Platform.FLOAT_ARRAY_OFFSET + srcIndex * 4L, + null, data + 4L * rowId, count * 4L); } @Override public void putFloats(int rowId, int count, byte[] src, int srcIndex) { if (!bigEndianPlatform) { Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET + srcIndex, - null, data + rowId * 4, count * 4); + null, data + rowId * 4L, count * 4L); } else { ByteBuffer bb = ByteBuffer.wrap(src).order(ByteOrder.LITTLE_ENDIAN); - long offset = data + 4 * rowId; + long offset = data + 4L * rowId; for (int i = 0; i < count; ++i, offset += 4) { Platform.putFloat(null, offset, bb.getFloat(srcIndex + (4 * i))); } @@ -428,7 +428,7 @@ public void putFloats(int rowId, int count, byte[] src, int srcIndex) { @Override public float getFloat(int rowId) { if (dictionary == null) { - return Platform.getFloat(null, data + rowId * 4); + return Platform.getFloat(null, data + rowId * 4L); } else { return dictionary.decodeToFloat(dictionaryIds.getDictId(rowId)); } @@ -438,7 +438,7 @@ public float getFloat(int rowId) { public float[] getFloats(int rowId, int count) { assert(dictionary == null); float[] array = new float[count]; - Platform.copyMemory(null, data + rowId * 4, array, Platform.FLOAT_ARRAY_OFFSET, count * 4); + Platform.copyMemory(null, data + rowId * 4L, array, Platform.FLOAT_ARRAY_OFFSET, count * 4L); return array; } @@ -449,12 +449,12 @@ public float[] getFloats(int rowId, int count) { @Override public void putDouble(int rowId, double value) { - Platform.putDouble(null, data + rowId * 8, value); + Platform.putDouble(null, data + rowId * 8L, value); } @Override public void putDoubles(int rowId, int count, double value) { - long offset = data + 8 * rowId; + long offset = data + 8L * rowId; for (int i = 0; i < count; ++i, offset += 8) { Platform.putDouble(null, offset, value); } @@ -462,18 +462,18 @@ public void putDoubles(int rowId, int count, double value) { @Override public void putDoubles(int rowId, int count, double[] src, int srcIndex) { - Platform.copyMemory(src, Platform.DOUBLE_ARRAY_OFFSET + srcIndex * 8, - null, data + 8 * rowId, count * 8); + Platform.copyMemory(src, Platform.DOUBLE_ARRAY_OFFSET + srcIndex * 8L, + null, data + 8L * rowId, count * 8L); } @Override public void putDoubles(int rowId, int count, byte[] src, int srcIndex) { if (!bigEndianPlatform) { Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET + srcIndex, - null, data + rowId * 8, count * 8); + null, data + rowId * 8L, count * 8L); } else { ByteBuffer bb = ByteBuffer.wrap(src).order(ByteOrder.LITTLE_ENDIAN); - long offset = data + 8 * rowId; + long offset = data + 8L * rowId; for (int i = 0; i < count; ++i, offset += 8) { Platform.putDouble(null, offset, bb.getDouble(srcIndex + (8 * i))); } @@ -483,7 +483,7 @@ public void putDoubles(int rowId, int count, byte[] src, int srcIndex) { @Override public double getDouble(int rowId) { if (dictionary == null) { - return Platform.getDouble(null, data + rowId * 8); + return Platform.getDouble(null, data + rowId * 8L); } else { return dictionary.decodeToDouble(dictionaryIds.getDictId(rowId)); } @@ -493,7 +493,7 @@ public double getDouble(int rowId) { public double[] getDoubles(int rowId, int count) { assert(dictionary == null); double[] array = new double[count]; - Platform.copyMemory(null, data + rowId * 8, array, Platform.DOUBLE_ARRAY_OFFSET, count * 8); + Platform.copyMemory(null, data + rowId * 8L, array, Platform.DOUBLE_ARRAY_OFFSET, count * 8L); return array; 
} @@ -503,26 +503,26 @@ public double[] getDoubles(int rowId, int count) { @Override public void putArray(int rowId, int offset, int length) { assert(offset >= 0 && offset + length <= childColumns[0].capacity); - Platform.putInt(null, lengthData + 4 * rowId, length); - Platform.putInt(null, offsetData + 4 * rowId, offset); + Platform.putInt(null, lengthData + 4L * rowId, length); + Platform.putInt(null, offsetData + 4L * rowId, offset); } @Override public int getArrayLength(int rowId) { - return Platform.getInt(null, lengthData + 4 * rowId); + return Platform.getInt(null, lengthData + 4L * rowId); } @Override public int getArrayOffset(int rowId) { - return Platform.getInt(null, offsetData + 4 * rowId); + return Platform.getInt(null, offsetData + 4L * rowId); } // APIs dealing with ByteArrays @Override public int putByteArray(int rowId, byte[] value, int offset, int length) { int result = arrayData().appendBytes(length, value, offset); - Platform.putInt(null, lengthData + 4 * rowId, length); - Platform.putInt(null, offsetData + 4 * rowId, result); + Platform.putInt(null, lengthData + 4L * rowId, length); + Platform.putInt(null, offsetData + 4L * rowId, result); return result; } @@ -532,19 +532,19 @@ protected void reserveInternal(int newCapacity) { int oldCapacity = (nulls == 0L) ? 0 : capacity; if (isArray() || type instanceof MapType) { this.lengthData = - Platform.reallocateMemory(lengthData, oldCapacity * 4, newCapacity * 4); + Platform.reallocateMemory(lengthData, oldCapacity * 4L, newCapacity * 4L); this.offsetData = - Platform.reallocateMemory(offsetData, oldCapacity * 4, newCapacity * 4); + Platform.reallocateMemory(offsetData, oldCapacity * 4L, newCapacity * 4L); } else if (type instanceof ByteType || type instanceof BooleanType) { this.data = Platform.reallocateMemory(data, oldCapacity, newCapacity); } else if (type instanceof ShortType) { - this.data = Platform.reallocateMemory(data, oldCapacity * 2, newCapacity * 2); + this.data = Platform.reallocateMemory(data, oldCapacity * 2L, newCapacity * 2L); } else if (type instanceof IntegerType || type instanceof FloatType || type instanceof DateType || DecimalType.is32BitDecimalType(type)) { - this.data = Platform.reallocateMemory(data, oldCapacity * 4, newCapacity * 4); + this.data = Platform.reallocateMemory(data, oldCapacity * 4L, newCapacity * 4L); } else if (type instanceof LongType || type instanceof DoubleType || DecimalType.is64BitDecimalType(type) || type instanceof TimestampType) { - this.data = Platform.reallocateMemory(data, oldCapacity * 8, newCapacity * 8); + this.data = Platform.reallocateMemory(data, oldCapacity * 8L, newCapacity * 8L); } else if (childColumns != null) { // Nothing to store. 
} else { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java index 23dcc104e67c4..577eab6ed14c8 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java @@ -231,7 +231,7 @@ public void putShorts(int rowId, int count, short[] src, int srcIndex) { @Override public void putShorts(int rowId, int count, byte[] src, int srcIndex) { Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET + srcIndex, shortData, - Platform.SHORT_ARRAY_OFFSET + rowId * 2, count * 2); + Platform.SHORT_ARRAY_OFFSET + rowId * 2L, count * 2L); } @Override @@ -276,7 +276,7 @@ public void putInts(int rowId, int count, int[] src, int srcIndex) { @Override public void putInts(int rowId, int count, byte[] src, int srcIndex) { Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET + srcIndex, intData, - Platform.INT_ARRAY_OFFSET + rowId * 4, count * 4); + Platform.INT_ARRAY_OFFSET + rowId * 4L, count * 4L); } @Override @@ -342,7 +342,7 @@ public void putLongs(int rowId, int count, long[] src, int srcIndex) { @Override public void putLongs(int rowId, int count, byte[] src, int srcIndex) { Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET + srcIndex, longData, - Platform.LONG_ARRAY_OFFSET + rowId * 8, count * 8); + Platform.LONG_ARRAY_OFFSET + rowId * 8L, count * 8L); } @Override @@ -394,7 +394,7 @@ public void putFloats(int rowId, int count, float[] src, int srcIndex) { public void putFloats(int rowId, int count, byte[] src, int srcIndex) { if (!bigEndianPlatform) { Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET + srcIndex, floatData, - Platform.DOUBLE_ARRAY_OFFSET + rowId * 4, count * 4); + Platform.DOUBLE_ARRAY_OFFSET + rowId * 4L, count * 4L); } else { ByteBuffer bb = ByteBuffer.wrap(src).order(ByteOrder.LITTLE_ENDIAN); for (int i = 0; i < count; ++i) { @@ -443,7 +443,7 @@ public void putDoubles(int rowId, int count, double[] src, int srcIndex) { public void putDoubles(int rowId, int count, byte[] src, int srcIndex) { if (!bigEndianPlatform) { Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET + srcIndex, doubleData, - Platform.DOUBLE_ARRAY_OFFSET + rowId * 8, count * 8); + Platform.DOUBLE_ARRAY_OFFSET + rowId * 8L, count * 8L); } else { ByteBuffer bb = ByteBuffer.wrap(src).order(ByteOrder.LITTLE_ENDIAN); for (int i = 0; i < count; ++i) { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 1eac70dbf19cd..60fe31f3542a5 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -343,7 +343,7 @@ private[client] class Shim_v0_12 extends Shim with Logging { } override def getMetastoreClientConnectRetryDelayMillis(conf: HiveConf): Long = { - conf.getIntVar(HiveConf.ConfVars.METASTORE_CLIENT_CONNECT_RETRY_DELAY) * 1000 + conf.getIntVar(HiveConf.ConfVars.METASTORE_CLIENT_CONNECT_RETRY_DELAY) * 1000L } override def loadPartition( diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala index ab7c8558321c8..2e8599026ea1d 100644 --- 
a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala @@ -222,7 +222,7 @@ private[streaming] class FileBasedWriteAheadLog( pastLogs += LogInfo(currentLogWriterStartTime, currentLogWriterStopTime, _) } currentLogWriterStartTime = currentTime - currentLogWriterStopTime = currentTime + (rollingIntervalSecs * 1000) + currentLogWriterStopTime = currentTime + (rollingIntervalSecs * 1000L) val newLogPath = new Path(logDirectory, timeToLogFile(currentLogWriterStartTime, currentLogWriterStopTime)) currentLogPath = Some(newLogPath.toString) From 9d63e540e00bc655faf6d8fe1d0035bc0b9a9192 Mon Sep 17 00:00:00 2001 From: Fangshi Li Date: Tue, 12 Jun 2018 12:10:08 -0700 Subject: [PATCH 502/566] [SPARK-24216][SQL] Spark TypedAggregateExpression uses getSimpleName that is not safe in scala When a user creates an aggregator object in Scala and passes the aggregator to Spark Dataset's agg() method, Spark will initialize TypedAggregateExpression with the nodeName field as aggregator.getClass.getSimpleName. However, getSimpleName is not safe in a Scala environment, depending on how the user creates the aggregator object. For example, if the aggregator class's fully qualified name is "com.my.company.MyUtils$myAgg$2$", getSimpleName will throw java.lang.InternalError "Malformed class name". This has been reported in scalatest https://github.com/scalatest/scalatest/pull/1044 and discussed in many Scala upstream JIRAs such as SI-8110 and SI-5425. To fix this issue, we follow the solution in https://github.com/scalatest/scalatest/pull/1044 to add a safer version of getSimpleName as a util method, and TypedAggregateExpression will invoke this util method rather than getClass.getSimpleName. Added a unit test. Author: Fangshi Li Closes #21276 from fangshil/SPARK-24216.
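For a quick feel of the approach, the sketch below is a simplified illustration of such a fallback (the object and method names are placeholders, and the real `Utils.getSimpleName` added in this patch additionally handles REPL-generated `$iw` classes and repeated dollar-sign runs):

```scala
object SafeSimpleName {
  // Prefer getSimpleName, and fall back to parsing the full class name ourselves when the
  // JVM throws InternalError("Malformed class name") for Scala-generated names such as
  // "com.my.company.MyUtils$myAgg$2$".
  def of(cls: Class[_]): String = {
    try {
      cls.getSimpleName
    } catch {
      case _: InternalError =>
        // Strip the package, then any trailing '$' signs:
        // "com.my.company.MyUtils$myAgg$2$" -> "MyUtils$myAgg$2"
        cls.getName.split("\\.").last.reverse.dropWhile(_ == '$').reverse
    }
  }
}
```

`AccumulatorV2.toString`, `Instrumentation`, and `TypedAggregateExpression.nodeName` below are switched to the new helper instead of calling `getClass.getSimpleName` directly.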
(cherry picked from commit cc88d7fad16e8b5cbf7b6b9bfe412908782b4a45) Signed-off-by: Wenchen Fan --- .../org/apache/spark/util/AccumulatorV2.scala | 6 +- .../scala/org/apache/spark/util/Utils.scala | 59 ++++++++++++++++++- .../org/apache/spark/util/UtilsSuite.scala | 16 +++++ .../spark/ml/util/Instrumentation.scala | 5 +- .../aggregate/TypedAggregateExpression.scala | 5 +- 5 files changed, 86 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala index 3b469a69437b9..bf618b4afbce0 100644 --- a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala +++ b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala @@ -200,10 +200,12 @@ abstract class AccumulatorV2[IN, OUT] extends Serializable { } override def toString: String = { + // getClass.getSimpleName can cause Malformed class name error, + // call safer `Utils.getSimpleName` instead if (metadata == null) { - "Un-registered Accumulator: " + getClass.getSimpleName + "Un-registered Accumulator: " + Utils.getSimpleName(getClass) } else { - getClass.getSimpleName + s"(id: $id, name: $name, value: $value)" + Utils.getSimpleName(getClass) + s"(id: $id, name: $name, value: $value)" } } } diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 12d0934be9204..d4b72e8474626 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -19,6 +19,7 @@ package org.apache.spark.util import java.io._ import java.lang.{Byte => JByte} +import java.lang.InternalError import java.lang.management.{LockInfo, ManagementFactory, MonitorInfo, ThreadInfo} import java.lang.reflect.InvocationTargetException import java.math.{MathContext, RoundingMode} @@ -1875,7 +1876,7 @@ private[spark] object Utils extends Logging { /** Return the class name of the given object, removing all dollar signs */ def getFormattedClassName(obj: AnyRef): String = { - obj.getClass.getSimpleName.replace("$", "") + getSimpleName(obj.getClass).replace("$", "") } /** Return an option that translates JNothing to None */ @@ -2817,6 +2818,62 @@ private[spark] object Utils extends Logging { HashCodes.fromBytes(secretBytes).toString() } + /** + * Safer than Class obj's getSimpleName which may throw Malformed class name error in scala. + * This method mimicks scalatest's getSimpleNameOfAnObjectsClass. 
+ */ + def getSimpleName(cls: Class[_]): String = { + try { + return cls.getSimpleName + } catch { + case err: InternalError => return stripDollars(stripPackages(cls.getName)) + } + } + + /** + * Remove the packages from full qualified class name + */ + private def stripPackages(fullyQualifiedName: String): String = { + fullyQualifiedName.split("\\.").takeRight(1)(0) + } + + /** + * Remove trailing dollar signs from qualified class name, + * and return the trailing part after the last dollar sign in the middle + */ + private def stripDollars(s: String): String = { + val lastDollarIndex = s.lastIndexOf('$') + if (lastDollarIndex < s.length - 1) { + // The last char is not a dollar sign + if (lastDollarIndex == -1 || !s.contains("$iw")) { + // The name does not have dollar sign or is not an intepreter + // generated class, so we should return the full string + s + } else { + // The class name is intepreter generated, + // return the part after the last dollar sign + // This is the same behavior as getClass.getSimpleName + s.substring(lastDollarIndex + 1) + } + } + else { + // The last char is a dollar sign + // Find last non-dollar char + val lastNonDollarChar = s.reverse.find(_ != '$') + lastNonDollarChar match { + case None => s + case Some(c) => + val lastNonDollarIndex = s.lastIndexOf(c) + if (lastNonDollarIndex == -1) { + s + } else { + // Strip the trailing dollar signs + // Invoke stripDollars again to get the simple name + stripDollars(s.substring(0, lastNonDollarIndex + 1)) + } + } + } + } } private[util] object CallerContext extends Logging { diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index eaea6b030c154..cde250ca65660 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -1167,6 +1167,22 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { Utils.checkAndGetK8sMasterUrl("k8s://foo://host:port") } } + + object MalformedClassObject { + class MalformedClass + } + + test("Safe getSimpleName") { + // getSimpleName on class of MalformedClass will result in error: Malformed class name + // Utils.getSimpleName works + val err = intercept[java.lang.InternalError] { + classOf[MalformedClassObject.MalformedClass].getSimpleName + } + assert(err.getMessage === "Malformed class name") + + assert(Utils.getSimpleName(classOf[MalformedClassObject.MalformedClass]) === + "UtilsSuite$MalformedClassObject$MalformedClass") + } } private class SimpleExtension diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala index 7c46f45c59717..8920e615c8d72 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala @@ -28,6 +28,7 @@ import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param.Param import org.apache.spark.rdd.RDD import org.apache.spark.sql.Dataset +import org.apache.spark.util.Utils /** * A small wrapper that defines a training session for an estimator, and some methods to log @@ -44,7 +45,9 @@ private[spark] class Instrumentation[E <: Estimator[_]] private ( private val id = Instrumentation.counter.incrementAndGet() private val prefix = { - val className = estimator.getClass.getSimpleName + // estimator.getClass.getSimpleName can cause Malformed class name error, + // call safer 
`Utils.getSimpleName` instead + val className = Utils.getSimpleName(estimator.getClass) s"$className-${estimator.uid}-${dataset.hashCode()}-$id: " } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TypedAggregateExpression.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TypedAggregateExpression.scala index aab8cc50b9526..6d44890704f49 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TypedAggregateExpression.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TypedAggregateExpression.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection import org.apache.spark.sql.catalyst.expressions.objects.Invoke import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils object TypedAggregateExpression { def apply[BUF : Encoder, OUT : Encoder]( @@ -109,7 +110,9 @@ trait TypedAggregateExpression extends AggregateFunction { s"$nodeName($input)" } - override def nodeName: String = aggregator.getClass.getSimpleName.stripSuffix("$") + // aggregator.getClass.getSimpleName can cause Malformed class name error, + // call safer `Utils.getSimpleName` instead + override def nodeName: String = Utils.getSimpleName(aggregator.getClass).stripSuffix("$"); } // TODO: merge these 2 implementations once we refactor the `AggregateFunction` interface. From b8dbfcc572bf5f386d21ae0be7b3439d1abed860 Mon Sep 17 00:00:00 2001 From: Fabrizio Cucci Date: Mon, 18 Jun 2018 14:40:24 -0700 Subject: [PATCH 503/566] Fix issue in 'docker-image-tool.sh' Because of the missing assignment of the variable `BUILD_ARGS` the command `./bin/docker-image-tool.sh -r docker.io/myrepo -t v2.3.1 build` fails: ``` docker build" requires exactly 1 argument. See 'docker build --help'. Usage: docker build [OPTIONS] PATH | URL | - [flags] Build an image from a Dockerfile ``` This has been fixed on the `master` already but, apparently, it has not been ported back to the branch `2.3`, leading to the same error even on the latest `2.3.1` release (dated 8 June 2018). Author: Fabrizio Cucci Closes #21551 from fabriziocucci/patch-1. --- bin/docker-image-tool.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/docker-image-tool.sh b/bin/docker-image-tool.sh index 071406336d1b1..0d0f564bb8b9b 100755 --- a/bin/docker-image-tool.sh +++ b/bin/docker-image-tool.sh @@ -57,6 +57,7 @@ function build { else # Not passed as an argument to docker, but used to validate the Spark directory. IMG_PATH="kubernetes/dockerfiles" + BUILD_ARGS=() fi if [ ! -d "$IMG_PATH" ]; then From 50cdb4138e5cb0e0d15f739db8066f3ea86ef037 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Mon, 18 Jun 2018 20:17:04 -0700 Subject: [PATCH 504/566] [SPARK-24542][SQL] UDF series UDFXPathXXXX allow users to pass carefully crafted XML to access arbitrary files ## What changes were proposed in this pull request? UDF series UDFXPathXXXX allow users to pass carefully crafted XML to access arbitrary files. Spark does not have built-in access control. When users use the external access control library, users might bypass them and access the file contents. This PR basically patches the Hive fix to Apache Spark. https://issues.apache.org/jira/browse/HIVE-18879 ## How was this patch tested? A unit test case Author: Xiao Li Closes #21549 from gatorsmile/xpathSecurity. 
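The hardening applied here follows the standard XXE mitigation: parse the XML with external entity resolution disabled, then evaluate the XPath against the resulting document. A minimal sketch of that pattern (the object and method names are illustrative, not Spark API):

```scala
import javax.xml.parsers.{DocumentBuilder, DocumentBuilderFactory}

object SecureXmlParsing {
  // Build a DOM parser that refuses to resolve external entities, so a payload such as
  //   <!DOCTYPE r [<!ENTITY e SYSTEM "file:///some/secret">]><r>&e;</r>
  // cannot pull local file contents into an XPath result.
  def secureBuilder(): DocumentBuilder = {
    val dbf = DocumentBuilderFactory.newInstance()
    dbf.setFeature("http://xml.org/sax/features/external-general-entities", false)
    dbf.setFeature("http://xml.org/sax/features/external-parameter-entities", false)
    dbf.newDocumentBuilder()
  }
}
```

In the patch itself, `UDFXPathUtil` builds such a `DocumentBuilder` lazily and evaluates the compiled XPath expression against `builder.parse(inputSource)` rather than against the raw `InputSource`.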
(cherry picked from commit 9a75c18290fff7d116cf88a44f9120bf67d8bd27) Signed-off-by: Wenchen Fan --- .../expressions/xml/UDFXPathUtil.java | 28 ++++++++++++++++++- .../expressions/xml/UDFXPathUtilSuite.scala | 21 ++++++++++++++ .../xml/XPathExpressionSuite.scala | 5 ++-- 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtil.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtil.java index d224332d8a6c9..023ec139652c5 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtil.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtil.java @@ -21,6 +21,9 @@ import java.io.Reader; import javax.xml.namespace.QName; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; @@ -37,9 +40,15 @@ * This is based on Hive's UDFXPathUtil implementation. */ public class UDFXPathUtil { + public static final String SAX_FEATURE_PREFIX = "http://xml.org/sax/features/"; + public static final String EXTERNAL_GENERAL_ENTITIES_FEATURE = "external-general-entities"; + public static final String EXTERNAL_PARAMETER_ENTITIES_FEATURE = "external-parameter-entities"; + private DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); + private DocumentBuilder builder = null; private XPath xpath = XPathFactory.newInstance().newXPath(); private ReusableStringReader reader = new ReusableStringReader(); private InputSource inputSource = new InputSource(reader); + private XPathExpression expression = null; private String oldPath = null; @@ -65,14 +74,31 @@ public Object eval(String xml, String path, QName qname) throws XPathExpressionE return null; } + if (builder == null){ + try { + initializeDocumentBuilderFactory(); + builder = dbf.newDocumentBuilder(); + } catch (ParserConfigurationException e) { + throw new RuntimeException( + "Error instantiating DocumentBuilder, cannot build xml parser", e); + } + } + reader.set(xml); try { - return expression.evaluate(inputSource, qname); + return expression.evaluate(builder.parse(inputSource), qname); } catch (XPathExpressionException e) { throw new RuntimeException("Invalid XML document: " + e.getMessage() + "\n" + xml, e); + } catch (Exception e) { + throw new RuntimeException("Error loading expression '" + oldPath + "'", e); } } + private void initializeDocumentBuilderFactory() throws ParserConfigurationException { + dbf.setFeature(SAX_FEATURE_PREFIX + EXTERNAL_GENERAL_ENTITIES_FEATURE, false); + dbf.setFeature(SAX_FEATURE_PREFIX + EXTERNAL_PARAMETER_ENTITIES_FEATURE, false); + } + public Boolean evalBoolean(String xml, String path) throws XPathExpressionException { return (Boolean) eval(xml, path, XPathConstants.BOOLEAN); } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtilSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtilSuite.scala index c4cde7091154b..0fec15bc42c17 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtilSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtilSuite.scala @@ -77,6 +77,27 @@ class UDFXPathUtilSuite extends SparkFunSuite { assert(ret == "foo") } 
+ test("embedFailure") { + import org.apache.commons.io.FileUtils + import java.io.File + val secretValue = String.valueOf(Math.random) + val tempFile = File.createTempFile("verifyembed", ".tmp") + tempFile.deleteOnExit() + val fname = tempFile.getAbsolutePath + + FileUtils.writeStringToFile(tempFile, secretValue) + + val xml = + s""" + | + |]> + |&embed; + """.stripMargin + val evaled = new UDFXPathUtil().evalString(xml, "/foo") + assert(evaled.isEmpty) + } + test("number eval") { var ret = util.evalNumber("truefalseb3c1-77", "a/c[2]") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/XPathExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/XPathExpressionSuite.scala index bfa18a0919e45..c6f6d3abb860c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/XPathExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/XPathExpressionSuite.scala @@ -40,8 +40,9 @@ class XPathExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { // Test error message for invalid XML document val e1 = intercept[RuntimeException] { testExpr("/a>", "a", null.asInstanceOf[T]) } - assert(e1.getCause.getMessage.contains("Invalid XML document") && - e1.getCause.getMessage.contains("/a>")) + assert(e1.getCause.getCause.getMessage.contains( + "XML document structures must start and end within the same entity.")) + assert(e1.getMessage.contains("/a>")) // Test error message for invalid xpath val e2 = intercept[RuntimeException] { testExpr("", "!#$", null.asInstanceOf[T]) } From d687d97b116beafa7f4375f1876049f5da4f5ba7 Mon Sep 17 00:00:00 2001 From: Maryann Xue Date: Tue, 19 Jun 2018 15:27:20 -0700 Subject: [PATCH 505/566] [SPARK-24583][SQL] Wrong schema type in InsertIntoDataSourceCommand ## What changes were proposed in this pull request? Change insert input schema type: "insertRelationType" -> "insertRelationType.asNullable", in order to avoid nullable being overridden. ## How was this patch tested? Added one test in InsertSuite. Author: Maryann Xue Closes #21585 from maryannxue/spark-24583. (cherry picked from commit bc0498d5820ded2b428277e396502e74ef0ce36d) Signed-off-by: Xiao Li --- .../InsertIntoDataSourceCommand.scala | 5 +- .../spark/sql/sources/InsertSuite.scala | 51 ++++++++++++++++++- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoDataSourceCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoDataSourceCommand.scala index a813829d50cb1..80d7608a22891 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoDataSourceCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoDataSourceCommand.scala @@ -38,9 +38,8 @@ case class InsertIntoDataSourceCommand( override def run(sparkSession: SparkSession): Seq[Row] = { val relation = logicalRelation.relation.asInstanceOf[InsertableRelation] val data = Dataset.ofRows(sparkSession, query) - // Apply the schema of the existing table to the new data. - val df = sparkSession.internalCreateDataFrame(data.queryExecution.toRdd, logicalRelation.schema) - relation.insert(df, overwrite) + // Data has been casted to the target relation's schema by the PreprocessTableInsertion rule. 
+ relation.insert(data, overwrite) // Re-cache all cached plans(including this relation itself, if it's cached) that refer to this // data source relation. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index fef01c860db6e..438d5d8176b8b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -20,12 +20,36 @@ package org.apache.spark.sql.sources import java.io.File import org.apache.spark.SparkException -import org.apache.spark.sql.{AnalysisException, Row} +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.PartitionOverwriteMode import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.types._ import org.apache.spark.util.Utils +class SimpleInsertSource extends SchemaRelationProvider { + override def createRelation( + sqlContext: SQLContext, + parameters: Map[String, String], + schema: StructType): BaseRelation = { + SimpleInsert(schema)(sqlContext.sparkSession) + } +} + +case class SimpleInsert(userSpecifiedSchema: StructType)(@transient val sparkSession: SparkSession) + extends BaseRelation with InsertableRelation { + + override def sqlContext: SQLContext = sparkSession.sqlContext + + override def schema: StructType = userSpecifiedSchema + + override def insert(input: DataFrame, overwrite: Boolean): Unit = { + input.collect + } +} + class InsertSuite extends DataSourceTest with SharedSQLContext { import testImplicits._ @@ -520,4 +544,29 @@ class InsertSuite extends DataSourceTest with SharedSQLContext { } } } + + test("SPARK-24583 Wrong schema type in InsertIntoDataSourceCommand") { + withTable("test_table") { + val schema = new StructType() + .add("i", LongType, false) + .add("s", StringType, false) + val newTable = CatalogTable( + identifier = TableIdentifier("test_table", None), + tableType = CatalogTableType.EXTERNAL, + storage = CatalogStorageFormat( + locationUri = None, + inputFormat = None, + outputFormat = None, + serde = None, + compressed = false, + properties = Map.empty), + schema = schema, + provider = Some(classOf[SimpleInsertSource].getName)) + + spark.sessionState.catalog.createTable(newTable, false) + + sql("INSERT INTO TABLE test_table SELECT 1, 'a'") + sql("INSERT INTO TABLE test_table SELECT 2, null") + } + } } From 8928de3cd448922d43f9ae80cea7138ecbea0d24 Mon Sep 17 00:00:00 2001 From: Wenbo Zhao Date: Wed, 20 Jun 2018 14:26:04 -0700 Subject: [PATCH 506/566] [SPARK-24578][CORE] Cap sub-region's size of returned nio buffer ## What changes were proposed in this pull request? This PR tries to fix the performance regression introduced by SPARK-21517. In our production job, we performed many parallel computations, with high possibility, some task could be scheduled to a host-2 where it needs to read the cache block data from host-1. Often, this big transfer makes the cluster suffer time out issue (it will retry 3 times, each with 120s timeout, and then do recompute to put the cache block into the local MemoryStore). The root cause is that we don't do `consolidateIfNeeded` anymore as we are using ``` Unpooled.wrappedBuffer(chunks.length, getChunks(): _*) ``` in ChunkedByteBuffer. 
If we have many small chunks, it could cause `buf.nioBuffer(...)` to have very bad performance when we have to call `copyByteBuf(...)` many times. ## How was this patch tested? Existing unit tests and also tested in production. Author: Wenbo Zhao Closes #21593 from WenboZhao/spark-24578. (cherry picked from commit 3f4bda7289f1bfbbe8b9bc4b516007f569c44d2e) Signed-off-by: Shixiong Zhu --- .../network/protocol/MessageWithHeader.java | 25 ++++--------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageWithHeader.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageWithHeader.java index a5337656cbd84..e7b66a6f33a82 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageWithHeader.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageWithHeader.java @@ -137,30 +137,15 @@ protected void deallocate() { } private int copyByteBuf(ByteBuf buf, WritableByteChannel target) throws IOException { - ByteBuffer buffer = buf.nioBuffer(); - int written = (buffer.remaining() <= NIO_BUFFER_LIMIT) ? - target.write(buffer) : writeNioBuffer(target, buffer); + // SPARK-24578: cap the sub-region's size of returned nio buffer to improve the performance + // for the case that the passed-in buffer has too many components. + int length = Math.min(buf.readableBytes(), NIO_BUFFER_LIMIT); + ByteBuffer buffer = buf.nioBuffer(buf.readerIndex(), length); + int written = target.write(buffer); buf.skipBytes(written); return written; } - private int writeNioBuffer( - WritableByteChannel writeCh, - ByteBuffer buf) throws IOException { - int originalLimit = buf.limit(); - int ret = 0; - - try { - int ioSize = Math.min(buf.remaining(), NIO_BUFFER_LIMIT); - buf.limit(buf.position() + ioSize); - ret = writeCh.write(buf); - } finally { - buf.limit(originalLimit); - } - - return ret; - } - @Override public MessageWithHeader touch(Object o) { super.touch(o); From 3a4b6f3be50fe2b400565c303675f6d9c23ecf41 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 21 Jun 2018 13:25:15 -0500 Subject: [PATCH 507/566] [SPARK-24589][CORE] Correctly identify tasks in output commit coordinator. When an output stage is retried, it's possible that tasks from the previous attempt are still running. In that case, there would be a new task for the same partition in the new attempt, and the coordinator would allow both tasks to commit their output since it did not keep track of stage attempts. The change adds more information to the stage state tracked by the coordinator, so that only one task is allowed to commit the output in the above case. The stage state in the coordinator is also maintained across stage retries, so that a stray speculative task from a previous stage attempt is not allowed to commit. This also removes some code added in SPARK-18113 that allowed for duplicate commit requests; with the RPC code used in Spark 2, that situation cannot happen, so there is no need to handle it. Author: Marcelo Vanzin Closes #21577 from vanzin/SPARK-24552.
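To make the new bookkeeping concrete, here is a toy model of the attempt-aware "first committer wins" rule described above (a sketch only; the real `OutputCommitCoordinator` below also tracks failed attempts, clears state when stages end, and sits behind an RPC endpoint):

```scala
// A committer is identified by both the stage attempt and the task attempt, so a stray task
// from an older stage attempt can no longer steal the commit for a partition that a newer
// stage attempt is also writing.
case class TaskIdentifier(stageAttempt: Int, taskAttempt: Int)

class ToyCommitCoordinator(numPartitions: Int) {
  private val authorized = Array.fill[Option[TaskIdentifier]](numPartitions)(None)

  def canCommit(stageAttempt: Int, partition: Int, taskAttempt: Int): Boolean = synchronized {
    val candidate = TaskIdentifier(stageAttempt, taskAttempt)
    authorized(partition) match {
      case None =>
        authorized(partition) = Some(candidate) // first successful ask wins the commit lock
        true
      case Some(winner) =>
        winner == candidate // only the already-authorized attempt may commit
    }
  }
}
```

On the executor side, `SparkHadoopMapRedUtil.commitTask` now passes `ctx.stageAttemptNumber()` along with the stage, partition, and task attempt number when asking for permission to commit.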
(cherry picked from commit c8e909cd498b67b121fa920ceee7631c652dac38) Signed-off-by: Thomas Graves --- .../spark/mapred/SparkHadoopMapRedUtil.scala | 8 +- .../apache/spark/scheduler/DAGScheduler.scala | 23 ++-- .../scheduler/OutputCommitCoordinator.scala | 128 ++++++++++-------- .../OutputCommitCoordinatorSuite.scala | 116 +++++++++++----- 4 files changed, 173 insertions(+), 102 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala index 764735dc4eae7..db8aff94ea1e1 100644 --- a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala +++ b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala @@ -69,9 +69,9 @@ object SparkHadoopMapRedUtil extends Logging { if (shouldCoordinateWithDriver) { val outputCommitCoordinator = SparkEnv.get.outputCommitCoordinator - val taskAttemptNumber = TaskContext.get().attemptNumber() - val stageId = TaskContext.get().stageId() - val canCommit = outputCommitCoordinator.canCommit(stageId, splitId, taskAttemptNumber) + val ctx = TaskContext.get() + val canCommit = outputCommitCoordinator.canCommit(ctx.stageId(), ctx.stageAttemptNumber(), + splitId, ctx.attemptNumber()) if (canCommit) { performCommit() @@ -81,7 +81,7 @@ object SparkHadoopMapRedUtil extends Logging { logInfo(message) // We need to abort the task so that the driver can reschedule new attempts, if necessary committer.abortTask(mrTaskContext) - throw new CommitDeniedException(message, stageId, splitId, taskAttemptNumber) + throw new CommitDeniedException(message, ctx.stageId(), splitId, ctx.attemptNumber()) } } else { // Speculation is disabled or a user has chosen to manually bypass the commit coordination diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 78b6b34b5d2bb..7029e2237a866 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -1173,6 +1173,7 @@ class DAGScheduler( outputCommitCoordinator.taskCompleted( stageId, + task.stageAttemptId, task.partitionId, event.taskInfo.attemptNumber, // this is a task attempt number event.reason) @@ -1332,23 +1333,24 @@ class DAGScheduler( s" ${task.stageAttemptId} and there is a more recent attempt for that stage " + s"(attempt ${failedStage.latestInfo.attemptNumber}) running") } else { + failedStage.fetchFailedAttemptIds.add(task.stageAttemptId) + val shouldAbortStage = + failedStage.fetchFailedAttemptIds.size >= maxConsecutiveStageAttempts || + disallowStageRetryForTest + // It is likely that we receive multiple FetchFailed for a single stage (because we have // multiple tasks running concurrently on different executors). In that case, it is // possible the fetch failure has already been handled by the scheduler. 
if (runningStages.contains(failedStage)) { logInfo(s"Marking $failedStage (${failedStage.name}) as failed " + s"due to a fetch failure from $mapStage (${mapStage.name})") - markStageAsFinished(failedStage, Some(failureMessage)) + markStageAsFinished(failedStage, errorMessage = Some(failureMessage), + willRetry = !shouldAbortStage) } else { logDebug(s"Received fetch failure from $task, but its from $failedStage which is no " + s"longer running") } - failedStage.fetchFailedAttemptIds.add(task.stageAttemptId) - val shouldAbortStage = - failedStage.fetchFailedAttemptIds.size >= maxConsecutiveStageAttempts || - disallowStageRetryForTest - if (shouldAbortStage) { val abortMessage = if (disallowStageRetryForTest) { "Fetch failure will not retry stage due to testing config" @@ -1547,7 +1549,10 @@ class DAGScheduler( /** * Marks a stage as finished and removes it from the list of running stages. */ - private def markStageAsFinished(stage: Stage, errorMessage: Option[String] = None): Unit = { + private def markStageAsFinished( + stage: Stage, + errorMessage: Option[String] = None, + willRetry: Boolean = false): Unit = { val serviceTime = stage.latestInfo.submissionTime match { case Some(t) => "%.03f".format((clock.getTimeMillis() - t) / 1000.0) case _ => "Unknown" @@ -1566,7 +1571,9 @@ class DAGScheduler( logInfo(s"$stage (${stage.name}) failed in $serviceTime s due to ${errorMessage.get}") } - outputCommitCoordinator.stageEnd(stage.id) + if (!willRetry) { + outputCommitCoordinator.stageEnd(stage.id) + } listenerBus.post(SparkListenerStageCompleted(stage.latestInfo)) runningStages -= stage } diff --git a/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala index 83d87b548a430..b382d623806e2 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala @@ -27,7 +27,11 @@ import org.apache.spark.util.{RpcUtils, ThreadUtils} private sealed trait OutputCommitCoordinationMessage extends Serializable private case object StopCoordinator extends OutputCommitCoordinationMessage -private case class AskPermissionToCommitOutput(stage: Int, partition: Int, attemptNumber: Int) +private case class AskPermissionToCommitOutput( + stage: Int, + stageAttempt: Int, + partition: Int, + attemptNumber: Int) /** * Authority that decides whether tasks can commit output to HDFS. Uses a "first committer wins" @@ -45,13 +49,15 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) // Initialized by SparkEnv var coordinatorRef: Option[RpcEndpointRef] = None - private type StageId = Int - private type PartitionId = Int - private type TaskAttemptNumber = Int - private val NO_AUTHORIZED_COMMITTER: TaskAttemptNumber = -1 + // Class used to identify a committer. The task ID for a committer is implicitly defined by + // the partition being processed, but the coordinator needs to keep track of both the stage + // attempt and the task attempt, because in some situations the same task may be running + // concurrently in two different attempts of the same stage. 
+ private case class TaskIdentifier(stageAttempt: Int, taskAttempt: Int) + private case class StageState(numPartitions: Int) { - val authorizedCommitters = Array.fill[TaskAttemptNumber](numPartitions)(NO_AUTHORIZED_COMMITTER) - val failures = mutable.Map[PartitionId, mutable.Set[TaskAttemptNumber]]() + val authorizedCommitters = Array.fill[TaskIdentifier](numPartitions)(null) + val failures = mutable.Map[Int, mutable.Set[TaskIdentifier]]() } /** @@ -64,7 +70,7 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) * * Access to this map should be guarded by synchronizing on the OutputCommitCoordinator instance. */ - private val stageStates = mutable.Map[StageId, StageState]() + private val stageStates = mutable.Map[Int, StageState]() /** * Returns whether the OutputCommitCoordinator's internal data structures are all empty. @@ -87,10 +93,11 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) * @return true if this task is authorized to commit, false otherwise */ def canCommit( - stage: StageId, - partition: PartitionId, - attemptNumber: TaskAttemptNumber): Boolean = { - val msg = AskPermissionToCommitOutput(stage, partition, attemptNumber) + stage: Int, + stageAttempt: Int, + partition: Int, + attemptNumber: Int): Boolean = { + val msg = AskPermissionToCommitOutput(stage, stageAttempt, partition, attemptNumber) coordinatorRef match { case Some(endpointRef) => ThreadUtils.awaitResult(endpointRef.ask[Boolean](msg), @@ -103,26 +110,35 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) } /** - * Called by the DAGScheduler when a stage starts. + * Called by the DAGScheduler when a stage starts. Initializes the stage's state if it hasn't + * yet been initialized. * * @param stage the stage id. * @param maxPartitionId the maximum partition id that could appear in this stage's tasks (i.e. * the maximum possible value of `context.partitionId`). 
*/ - private[scheduler] def stageStart(stage: StageId, maxPartitionId: Int): Unit = synchronized { - stageStates(stage) = new StageState(maxPartitionId + 1) + private[scheduler] def stageStart(stage: Int, maxPartitionId: Int): Unit = synchronized { + stageStates.get(stage) match { + case Some(state) => + require(state.authorizedCommitters.length == maxPartitionId + 1) + logInfo(s"Reusing state from previous attempt of stage $stage.") + + case _ => + stageStates(stage) = new StageState(maxPartitionId + 1) + } } // Called by DAGScheduler - private[scheduler] def stageEnd(stage: StageId): Unit = synchronized { + private[scheduler] def stageEnd(stage: Int): Unit = synchronized { stageStates.remove(stage) } // Called by DAGScheduler private[scheduler] def taskCompleted( - stage: StageId, - partition: PartitionId, - attemptNumber: TaskAttemptNumber, + stage: Int, + stageAttempt: Int, + partition: Int, + attemptNumber: Int, reason: TaskEndReason): Unit = synchronized { val stageState = stageStates.getOrElse(stage, { logDebug(s"Ignoring task completion for completed stage") @@ -131,16 +147,17 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) reason match { case Success => // The task output has been committed successfully - case denied: TaskCommitDenied => - logInfo(s"Task was denied committing, stage: $stage, partition: $partition, " + - s"attempt: $attemptNumber") - case otherReason => + case _: TaskCommitDenied => + logInfo(s"Task was denied committing, stage: $stage.$stageAttempt, " + + s"partition: $partition, attempt: $attemptNumber") + case _ => // Mark the attempt as failed to blacklist from future commit protocol - stageState.failures.getOrElseUpdate(partition, mutable.Set()) += attemptNumber - if (stageState.authorizedCommitters(partition) == attemptNumber) { + val taskId = TaskIdentifier(stageAttempt, attemptNumber) + stageState.failures.getOrElseUpdate(partition, mutable.Set()) += taskId + if (stageState.authorizedCommitters(partition) == taskId) { logDebug(s"Authorized committer (attemptNumber=$attemptNumber, stage=$stage, " + s"partition=$partition) failed; clearing lock") - stageState.authorizedCommitters(partition) = NO_AUTHORIZED_COMMITTER + stageState.authorizedCommitters(partition) = null } } } @@ -155,47 +172,41 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) // Marked private[scheduler] instead of private so this can be mocked in tests private[scheduler] def handleAskPermissionToCommit( - stage: StageId, - partition: PartitionId, - attemptNumber: TaskAttemptNumber): Boolean = synchronized { + stage: Int, + stageAttempt: Int, + partition: Int, + attemptNumber: Int): Boolean = synchronized { stageStates.get(stage) match { - case Some(state) if attemptFailed(state, partition, attemptNumber) => - logInfo(s"Denying attemptNumber=$attemptNumber to commit for stage=$stage," + - s" partition=$partition as task attempt $attemptNumber has already failed.") + case Some(state) if attemptFailed(state, stageAttempt, partition, attemptNumber) => + logInfo(s"Commit denied for stage=$stage.$stageAttempt, partition=$partition: " + + s"task attempt $attemptNumber already marked as failed.") false case Some(state) => - state.authorizedCommitters(partition) match { - case NO_AUTHORIZED_COMMITTER => - logDebug(s"Authorizing attemptNumber=$attemptNumber to commit for stage=$stage, " + - s"partition=$partition") - state.authorizedCommitters(partition) = attemptNumber - true - case existingCommitter => - // Coordinator should be 
idempotent when receiving AskPermissionToCommit. - if (existingCommitter == attemptNumber) { - logWarning(s"Authorizing duplicate request to commit for " + - s"attemptNumber=$attemptNumber to commit for stage=$stage," + - s" partition=$partition; existingCommitter = $existingCommitter." + - s" This can indicate dropped network traffic.") - true - } else { - logDebug(s"Denying attemptNumber=$attemptNumber to commit for stage=$stage, " + - s"partition=$partition; existingCommitter = $existingCommitter") - false - } + val existing = state.authorizedCommitters(partition) + if (existing == null) { + logDebug(s"Commit allowed for stage=$stage.$stageAttempt, partition=$partition, " + + s"task attempt $attemptNumber") + state.authorizedCommitters(partition) = TaskIdentifier(stageAttempt, attemptNumber) + true + } else { + logDebug(s"Commit denied for stage=$stage.$stageAttempt, partition=$partition: " + + s"already committed by $existing") + false } case None => - logDebug(s"Stage $stage has completed, so not allowing" + - s" attempt number $attemptNumber of partition $partition to commit") + logDebug(s"Commit denied for stage=$stage.$stageAttempt, partition=$partition: " + + "stage already marked as completed.") false } } private def attemptFailed( stageState: StageState, - partition: PartitionId, - attempt: TaskAttemptNumber): Boolean = synchronized { - stageState.failures.get(partition).exists(_.contains(attempt)) + stageAttempt: Int, + partition: Int, + attempt: Int): Boolean = synchronized { + val failInfo = TaskIdentifier(stageAttempt, attempt) + stageState.failures.get(partition).exists(_.contains(failInfo)) } } @@ -215,9 +226,10 @@ private[spark] object OutputCommitCoordinator { } override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { - case AskPermissionToCommitOutput(stage, partition, attemptNumber) => + case AskPermissionToCommitOutput(stage, stageAttempt, partition, attemptNumber) => context.reply( - outputCommitCoordinator.handleAskPermissionToCommit(stage, partition, attemptNumber)) + outputCommitCoordinator.handleAskPermissionToCommit(stage, stageAttempt, partition, + attemptNumber)) } } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala index 03b1903902491..158c9eb75f2b6 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala @@ -35,6 +35,7 @@ import org.scalatest.BeforeAndAfter import org.apache.spark._ import org.apache.spark.internal.io.{FileCommitProtocol, HadoopMapRedCommitProtocol, SparkHadoopWriterUtils} import org.apache.spark.rdd.{FakeOutputCommitter, RDD} +import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.util.{ThreadUtils, Utils} /** @@ -153,7 +154,7 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter { test("Job should not complete if all commits are denied") { // Create a mock OutputCommitCoordinator that denies all attempts to commit doReturn(false).when(outputCommitCoordinator).handleAskPermissionToCommit( - Matchers.any(), Matchers.any(), Matchers.any()) + Matchers.any(), Matchers.any(), Matchers.any(), Matchers.any()) val rdd: RDD[Int] = sc.parallelize(Seq(1), 1) def resultHandler(x: Int, y: Unit): Unit = {} val futureAction: SimpleFutureAction[Unit] = sc.submitJob[Int, Unit, Unit](rdd, @@ -169,45 +170,106 @@ class 
OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter { test("Only authorized committer failures can clear the authorized committer lock (SPARK-6614)") { val stage: Int = 1 + val stageAttempt: Int = 1 val partition: Int = 2 val authorizedCommitter: Int = 3 val nonAuthorizedCommitter: Int = 100 outputCommitCoordinator.stageStart(stage, maxPartitionId = 2) - assert(outputCommitCoordinator.canCommit(stage, partition, authorizedCommitter)) - assert(!outputCommitCoordinator.canCommit(stage, partition, nonAuthorizedCommitter)) + assert(outputCommitCoordinator.canCommit(stage, stageAttempt, partition, authorizedCommitter)) + assert(!outputCommitCoordinator.canCommit(stage, stageAttempt, partition, + nonAuthorizedCommitter)) // The non-authorized committer fails - outputCommitCoordinator.taskCompleted( - stage, partition, attemptNumber = nonAuthorizedCommitter, reason = TaskKilled("test")) + outputCommitCoordinator.taskCompleted(stage, stageAttempt, partition, + attemptNumber = nonAuthorizedCommitter, reason = TaskKilled("test")) // New tasks should still not be able to commit because the authorized committer has not failed - assert( - !outputCommitCoordinator.canCommit(stage, partition, nonAuthorizedCommitter + 1)) + assert(!outputCommitCoordinator.canCommit(stage, stageAttempt, partition, + nonAuthorizedCommitter + 1)) // The authorized committer now fails, clearing the lock - outputCommitCoordinator.taskCompleted( - stage, partition, attemptNumber = authorizedCommitter, reason = TaskKilled("test")) + outputCommitCoordinator.taskCompleted(stage, stageAttempt, partition, + attemptNumber = authorizedCommitter, reason = TaskKilled("test")) // A new task should now be allowed to become the authorized committer - assert( - outputCommitCoordinator.canCommit(stage, partition, nonAuthorizedCommitter + 2)) + assert(outputCommitCoordinator.canCommit(stage, stageAttempt, partition, + nonAuthorizedCommitter + 2)) // There can only be one authorized committer - assert( - !outputCommitCoordinator.canCommit(stage, partition, nonAuthorizedCommitter + 3)) - } - - test("Duplicate calls to canCommit from the authorized committer gets idempotent responses.") { - val rdd = sc.parallelize(Seq(1), 1) - sc.runJob(rdd, OutputCommitFunctions(tempDir.getAbsolutePath).callCanCommitMultipleTimes _, - 0 until rdd.partitions.size) + assert(!outputCommitCoordinator.canCommit(stage, stageAttempt, partition, + nonAuthorizedCommitter + 3)) } test("SPARK-19631: Do not allow failed attempts to be authorized for committing") { val stage: Int = 1 + val stageAttempt: Int = 1 val partition: Int = 1 val failedAttempt: Int = 0 outputCommitCoordinator.stageStart(stage, maxPartitionId = 1) - outputCommitCoordinator.taskCompleted(stage, partition, attemptNumber = failedAttempt, + outputCommitCoordinator.taskCompleted(stage, stageAttempt, partition, + attemptNumber = failedAttempt, reason = ExecutorLostFailure("0", exitCausedByApp = true, None)) - assert(!outputCommitCoordinator.canCommit(stage, partition, failedAttempt)) - assert(outputCommitCoordinator.canCommit(stage, partition, failedAttempt + 1)) + assert(!outputCommitCoordinator.canCommit(stage, stageAttempt, partition, failedAttempt)) + assert(outputCommitCoordinator.canCommit(stage, stageAttempt, partition, failedAttempt + 1)) + } + + test("SPARK-24589: Differentiate tasks from different stage attempts") { + var stage = 1 + val taskAttempt = 1 + val partition = 1 + + outputCommitCoordinator.stageStart(stage, maxPartitionId = 1) + 
assert(outputCommitCoordinator.canCommit(stage, 1, partition, taskAttempt)) + assert(!outputCommitCoordinator.canCommit(stage, 2, partition, taskAttempt)) + + // Fail the task in the first attempt, the task in the second attempt should succeed. + stage += 1 + outputCommitCoordinator.stageStart(stage, maxPartitionId = 1) + outputCommitCoordinator.taskCompleted(stage, 1, partition, taskAttempt, + ExecutorLostFailure("0", exitCausedByApp = true, None)) + assert(!outputCommitCoordinator.canCommit(stage, 1, partition, taskAttempt)) + assert(outputCommitCoordinator.canCommit(stage, 2, partition, taskAttempt)) + + // Commit the 1st attempt, fail the 2nd attempt, make sure 3rd attempt cannot commit, + // then fail the 1st attempt and make sure the 4th one can commit again. + stage += 1 + outputCommitCoordinator.stageStart(stage, maxPartitionId = 1) + assert(outputCommitCoordinator.canCommit(stage, 1, partition, taskAttempt)) + outputCommitCoordinator.taskCompleted(stage, 2, partition, taskAttempt, + ExecutorLostFailure("0", exitCausedByApp = true, None)) + assert(!outputCommitCoordinator.canCommit(stage, 3, partition, taskAttempt)) + outputCommitCoordinator.taskCompleted(stage, 1, partition, taskAttempt, + ExecutorLostFailure("0", exitCausedByApp = true, None)) + assert(outputCommitCoordinator.canCommit(stage, 4, partition, taskAttempt)) + } + + test("SPARK-24589: Make sure stage state is cleaned up") { + // Normal application without stage failures. + sc.parallelize(1 to 100, 100) + .map { i => (i % 10, i) } + .reduceByKey(_ + _) + .collect() + + assert(sc.dagScheduler.outputCommitCoordinator.isEmpty) + + // Force failures in a few tasks so that a stage is retried. Collect the ID of the failing + // stage so that we can check the state of the output committer. + val retriedStage = sc.parallelize(1 to 100, 10) + .map { i => (i % 10, i) } + .reduceByKey { case (_, _) => + val ctx = TaskContext.get() + if (ctx.stageAttemptNumber() == 0) { + throw new FetchFailedException(SparkEnv.get.blockManager.blockManagerId, 1, 1, 1, + new Exception("Failure for test.")) + } else { + ctx.stageId() + } + } + .collect() + .map { case (k, v) => v } + .toSet + + assert(retriedStage.size === 1) + assert(sc.dagScheduler.outputCommitCoordinator.isEmpty) + verify(sc.env.outputCommitCoordinator, times(2)) + .stageStart(Matchers.eq(retriedStage.head), Matchers.any()) + verify(sc.env.outputCommitCoordinator).stageEnd(Matchers.eq(retriedStage.head)) } } @@ -243,16 +305,6 @@ private case class OutputCommitFunctions(tempDirPath: String) { if (ctx.attemptNumber == 0) failingOutputCommitter else successfulOutputCommitter) } - // Receiver should be idempotent for AskPermissionToCommitOutput - def callCanCommitMultipleTimes(iter: Iterator[Int]): Unit = { - val ctx = TaskContext.get() - val canCommit1 = SparkEnv.get.outputCommitCoordinator - .canCommit(ctx.stageId(), ctx.partitionId(), ctx.attemptNumber()) - val canCommit2 = SparkEnv.get.outputCommitCoordinator - .canCommit(ctx.stageId(), ctx.partitionId(), ctx.attemptNumber()) - assert(canCommit1 && canCommit2) - } - private def runCommitWithProvidedCommitter( ctx: TaskContext, iter: Iterator[Int], From a1e9640079d4a567adb87a78eee8e4bcb1431a14 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 21 Jun 2018 15:38:46 -0700 Subject: [PATCH 508/566] [SPARK-24588][SS] streaming join should require HashClusteredPartitioning from children ## What changes were proposed in this pull request? 
In https://github.com/apache/spark/pull/19080 we simplified the distribution/partitioning framework, and make all the join-like operators require `HashClusteredDistribution` from children. Unfortunately streaming join operator was missed. This can cause wrong result. Think about ``` val input1 = MemoryStream[Int] val input2 = MemoryStream[Int] val df1 = input1.toDF.select('value as 'a, 'value * 2 as 'b) val df2 = input2.toDF.select('value as 'a, 'value * 2 as 'b).repartition('b) val joined = df1.join(df2, Seq("a", "b")).select('a) ``` The physical plan is ``` *(3) Project [a#5] +- StreamingSymmetricHashJoin [a#5, b#6], [a#10, b#11], Inner, condition = [ leftOnly = null, rightOnly = null, both = null, full = null ], state info [ checkpoint = , runId = 54e31fce-f055-4686-b75d-fcd2b076f8d8, opId = 0, ver = 0, numPartitions = 5], 0, state cleanup [ left = null, right = null ] :- Exchange hashpartitioning(a#5, b#6, 5) : +- *(1) Project [value#1 AS a#5, (value#1 * 2) AS b#6] : +- StreamingRelation MemoryStream[value#1], [value#1] +- Exchange hashpartitioning(b#11, 5) +- *(2) Project [value#3 AS a#10, (value#3 * 2) AS b#11] +- StreamingRelation MemoryStream[value#3], [value#3] ``` The left table is hash partitioned by `a, b`, while the right table is hash partitioned by `b`. This means, we may have a matching record that is in different partitions, which should be in the output but not. ## How was this patch tested? N/A Author: Wenchen Fan Closes #21587 from cloud-fan/join. (cherry picked from commit dc8a6befa5dad861a731b4d7865f3ccf37482ae0) Signed-off-by: Xiao Li --- .../plans/physical/partitioning.scala | 53 +++-- .../sql/catalyst/DistributionSuite.scala | 201 +++++++++++++++--- .../v2/DataSourcePartitioning.scala | 4 +- .../StreamingSymmetricHashJoinExec.scala | 4 +- .../sql/streaming/StreamingJoinSuite.scala | 14 ++ 5 files changed, 217 insertions(+), 59 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala index 4d9a9925fe3ff..cc1a5e835d9cd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala @@ -99,16 +99,19 @@ case class ClusteredDistribution( * This is a strictly stronger guarantee than [[ClusteredDistribution]]. Given a tuple and the * number of partitions, this distribution strictly requires which partition the tuple should be in. */ -case class HashClusteredDistribution(expressions: Seq[Expression]) extends Distribution { +case class HashClusteredDistribution( + expressions: Seq[Expression], + requiredNumPartitions: Option[Int] = None) extends Distribution { require( expressions != Nil, - "The expressions for hash of a HashPartitionedDistribution should not be Nil. " + + "The expressions for hash of a HashClusteredDistribution should not be Nil. 
" + "An AllTuples should be used to represent a distribution that only has " + "a single partition.") - override def requiredNumPartitions: Option[Int] = None - override def createPartitioning(numPartitions: Int): Partitioning = { + assert(requiredNumPartitions.isEmpty || requiredNumPartitions.get == numPartitions, + s"This HashClusteredDistribution requires ${requiredNumPartitions.get} partitions, but " + + s"the actual number of partitions is $numPartitions.") HashPartitioning(expressions, numPartitions) } } @@ -163,11 +166,22 @@ trait Partitioning { * i.e. the current dataset does not need to be re-partitioned for the `required` * Distribution (it is possible that tuples within a partition need to be reorganized). * + * A [[Partitioning]] can never satisfy a [[Distribution]] if its `numPartitions` does't match + * [[Distribution.requiredNumPartitions]]. + */ + final def satisfies(required: Distribution): Boolean = { + required.requiredNumPartitions.forall(_ == numPartitions) && satisfies0(required) + } + + /** + * The actual method that defines whether this [[Partitioning]] can satisfy the given + * [[Distribution]], after the `numPartitions` check. + * * By default a [[Partitioning]] can satisfy [[UnspecifiedDistribution]], and [[AllTuples]] if - * the [[Partitioning]] only have one partition. Implementations can overwrite this method with - * special logic. + * the [[Partitioning]] only have one partition. Implementations can also overwrite this method + * with special logic. */ - def satisfies(required: Distribution): Boolean = required match { + protected def satisfies0(required: Distribution): Boolean = required match { case UnspecifiedDistribution => true case AllTuples => numPartitions == 1 case _ => false @@ -186,9 +200,8 @@ case class RoundRobinPartitioning(numPartitions: Int) extends Partitioning case object SinglePartition extends Partitioning { val numPartitions = 1 - override def satisfies(required: Distribution): Boolean = required match { + override def satisfies0(required: Distribution): Boolean = required match { case _: BroadcastDistribution => false - case ClusteredDistribution(_, Some(requiredNumPartitions)) => requiredNumPartitions == 1 case _ => true } } @@ -205,16 +218,15 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int) override def nullable: Boolean = false override def dataType: DataType = IntegerType - override def satisfies(required: Distribution): Boolean = { - super.satisfies(required) || { + override def satisfies0(required: Distribution): Boolean = { + super.satisfies0(required) || { required match { case h: HashClusteredDistribution => expressions.length == h.expressions.length && expressions.zip(h.expressions).forall { case (l, r) => l.semanticEquals(r) } - case ClusteredDistribution(requiredClustering, requiredNumPartitions) => - expressions.forall(x => requiredClustering.exists(_.semanticEquals(x))) && - (requiredNumPartitions.isEmpty || requiredNumPartitions.get == numPartitions) + case ClusteredDistribution(requiredClustering, _) => + expressions.forall(x => requiredClustering.exists(_.semanticEquals(x))) case _ => false } } @@ -246,15 +258,14 @@ case class RangePartitioning(ordering: Seq[SortOrder], numPartitions: Int) override def nullable: Boolean = false override def dataType: DataType = IntegerType - override def satisfies(required: Distribution): Boolean = { - super.satisfies(required) || { + override def satisfies0(required: Distribution): Boolean = { + super.satisfies0(required) || { required match { case 
OrderedDistribution(requiredOrdering) => val minSize = Seq(requiredOrdering.size, ordering.size).min requiredOrdering.take(minSize) == ordering.take(minSize) - case ClusteredDistribution(requiredClustering, requiredNumPartitions) => - ordering.map(_.child).forall(x => requiredClustering.exists(_.semanticEquals(x))) && - (requiredNumPartitions.isEmpty || requiredNumPartitions.get == numPartitions) + case ClusteredDistribution(requiredClustering, _) => + ordering.map(_.child).forall(x => requiredClustering.exists(_.semanticEquals(x))) case _ => false } } @@ -295,7 +306,7 @@ case class PartitioningCollection(partitionings: Seq[Partitioning]) * Returns true if any `partitioning` of this collection satisfies the given * [[Distribution]]. */ - override def satisfies(required: Distribution): Boolean = + override def satisfies0(required: Distribution): Boolean = partitionings.exists(_.satisfies(required)) override def toString: String = { @@ -310,7 +321,7 @@ case class PartitioningCollection(partitionings: Seq[Partitioning]) case class BroadcastPartitioning(mode: BroadcastMode) extends Partitioning { override val numPartitions: Int = 1 - override def satisfies(required: Distribution): Boolean = required match { + override def satisfies0(required: Distribution): Boolean = required match { case BroadcastDistribution(m) if m == mode => true case _ => false } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/DistributionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/DistributionSuite.scala index b47b8adfe5d55..39228102682b9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/DistributionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/DistributionSuite.scala @@ -41,34 +41,127 @@ class DistributionSuite extends SparkFunSuite { } } - test("HashPartitioning (with nullSafe = true) is the output partitioning") { - // Cases which do not need an exchange between two data properties. + test("UnspecifiedDistribution and AllTuples") { + // except `BroadcastPartitioning`, all other partitioning can satisfy UnspecifiedDistribution checkSatisfied( - HashPartitioning(Seq('a, 'b, 'c), 10), + UnknownPartitioning(-1), UnspecifiedDistribution, true) checkSatisfied( - HashPartitioning(Seq('a, 'b, 'c), 10), - ClusteredDistribution(Seq('a, 'b, 'c)), + RoundRobinPartitioning(10), + UnspecifiedDistribution, true) checkSatisfied( - HashPartitioning(Seq('b, 'c), 10), - ClusteredDistribution(Seq('a, 'b, 'c)), + SinglePartition, + UnspecifiedDistribution, + true) + + checkSatisfied( + HashPartitioning(Seq('a), 10), + UnspecifiedDistribution, + true) + + checkSatisfied( + RangePartitioning(Seq('a.asc), 10), + UnspecifiedDistribution, + true) + + checkSatisfied( + BroadcastPartitioning(IdentityBroadcastMode), + UnspecifiedDistribution, + false) + + // except `BroadcastPartitioning`, all other partitioning can satisfy AllTuples if they have + // only one partition. 
+ checkSatisfied( + UnknownPartitioning(1), + AllTuples, + true) + + checkSatisfied( + UnknownPartitioning(10), + AllTuples, + false) + + checkSatisfied( + RoundRobinPartitioning(1), + AllTuples, + true) + + checkSatisfied( + RoundRobinPartitioning(10), + AllTuples, + false) + + checkSatisfied( + SinglePartition, + AllTuples, + true) + + checkSatisfied( + HashPartitioning(Seq('a), 1), + AllTuples, true) + checkSatisfied( + HashPartitioning(Seq('a), 10), + AllTuples, + false) + + checkSatisfied( + RangePartitioning(Seq('a.asc), 1), + AllTuples, + true) + + checkSatisfied( + RangePartitioning(Seq('a.asc), 10), + AllTuples, + false) + + checkSatisfied( + BroadcastPartitioning(IdentityBroadcastMode), + AllTuples, + false) + } + + test("SinglePartition is the output partitioning") { + // SinglePartition can satisfy all the distributions except `BroadcastDistribution` checkSatisfied( SinglePartition, ClusteredDistribution(Seq('a, 'b, 'c)), true) + checkSatisfied( + SinglePartition, + HashClusteredDistribution(Seq('a, 'b, 'c)), + true) + checkSatisfied( SinglePartition, OrderedDistribution(Seq('a.asc, 'b.asc, 'c.asc)), true) - // Cases which need an exchange between two data properties. + checkSatisfied( + SinglePartition, + BroadcastDistribution(IdentityBroadcastMode), + false) + } + + test("HashPartitioning is the output partitioning") { + // HashPartitioning can satisfy ClusteredDistribution iff its hash expressions are a subset of + // the required clustering expressions. + checkSatisfied( + HashPartitioning(Seq('a, 'b, 'c), 10), + ClusteredDistribution(Seq('a, 'b, 'c)), + true) + + checkSatisfied( + HashPartitioning(Seq('b, 'c), 10), + ClusteredDistribution(Seq('a, 'b, 'c)), + true) + checkSatisfied( HashPartitioning(Seq('a, 'b, 'c), 10), ClusteredDistribution(Seq('b, 'c)), @@ -79,37 +172,43 @@ class DistributionSuite extends SparkFunSuite { ClusteredDistribution(Seq('d, 'e)), false) + // HashPartitioning can satisfy HashClusteredDistribution iff its hash expressions are exactly + // same with the required hash clustering expressions. checkSatisfied( HashPartitioning(Seq('a, 'b, 'c), 10), - AllTuples, + HashClusteredDistribution(Seq('a, 'b, 'c)), + true) + + checkSatisfied( + HashPartitioning(Seq('c, 'b, 'a), 10), + HashClusteredDistribution(Seq('a, 'b, 'c)), false) + checkSatisfied( + HashPartitioning(Seq('a, 'b), 10), + HashClusteredDistribution(Seq('a, 'b, 'c)), + false) + + // HashPartitioning cannot satisfy OrderedDistribution checkSatisfied( HashPartitioning(Seq('a, 'b, 'c), 10), OrderedDistribution(Seq('a.asc, 'b.asc, 'c.asc)), false) checkSatisfied( - HashPartitioning(Seq('b, 'c), 10), + HashPartitioning(Seq('a, 'b, 'c), 1), OrderedDistribution(Seq('a.asc, 'b.asc, 'c.asc)), - false) + false) // TODO: this can be relaxed. - // TODO: We should check functional dependencies - /* checkSatisfied( - ClusteredDistribution(Seq('b)), - ClusteredDistribution(Seq('b + 1)), - true) - */ + HashPartitioning(Seq('b, 'c), 10), + OrderedDistribution(Seq('a.asc, 'b.asc, 'c.asc)), + false) } test("RangePartitioning is the output partitioning") { - // Cases which do not need an exchange between two data properties. - checkSatisfied( - RangePartitioning(Seq('a.asc, 'b.asc, 'c.asc), 10), - UnspecifiedDistribution, - true) - + // RangePartitioning can satisfy OrderedDistribution iff its ordering is a prefix + // of the required ordering, or the required ordering is a prefix of its ordering. 
checkSatisfied( RangePartitioning(Seq('a.asc, 'b.asc, 'c.asc), 10), OrderedDistribution(Seq('a.asc, 'b.asc, 'c.asc)), @@ -125,6 +224,27 @@ class DistributionSuite extends SparkFunSuite { OrderedDistribution(Seq('a.asc, 'b.asc, 'c.asc, 'd.desc)), true) + // TODO: We can have an optimization to first sort the dataset + // by a.asc and then sort b, and c in a partition. This optimization + // should tradeoff the benefit of a less number of Exchange operators + // and the parallelism. + checkSatisfied( + RangePartitioning(Seq('a.asc, 'b.asc, 'c.asc), 10), + OrderedDistribution(Seq('a.asc, 'b.desc, 'c.asc)), + false) + + checkSatisfied( + RangePartitioning(Seq('a.asc, 'b.asc, 'c.asc), 10), + OrderedDistribution(Seq('b.asc, 'a.asc)), + false) + + checkSatisfied( + RangePartitioning(Seq('a.asc, 'b.asc, 'c.asc), 10), + OrderedDistribution(Seq('a.asc, 'b.asc, 'd.desc)), + false) + + // RangePartitioning can satisfy ClusteredDistribution iff its ordering expressions are a subset + // of the required clustering expressions. checkSatisfied( RangePartitioning(Seq('a.asc, 'b.asc, 'c.asc), 10), ClusteredDistribution(Seq('a, 'b, 'c)), @@ -140,34 +260,47 @@ class DistributionSuite extends SparkFunSuite { ClusteredDistribution(Seq('b, 'c, 'a, 'd)), true) - // Cases which need an exchange between two data properties. - // TODO: We can have an optimization to first sort the dataset - // by a.asc and then sort b, and c in a partition. This optimization - // should tradeoff the benefit of a less number of Exchange operators - // and the parallelism. checkSatisfied( RangePartitioning(Seq('a.asc, 'b.asc, 'c.asc), 10), - OrderedDistribution(Seq('a.asc, 'b.desc, 'c.asc)), + ClusteredDistribution(Seq('a, 'b)), false) checkSatisfied( RangePartitioning(Seq('a.asc, 'b.asc, 'c.asc), 10), - OrderedDistribution(Seq('b.asc, 'a.asc)), + ClusteredDistribution(Seq('c, 'd)), false) + // RangePartitioning cannot satisfy HashClusteredDistribution checkSatisfied( RangePartitioning(Seq('a.asc, 'b.asc, 'c.asc), 10), - ClusteredDistribution(Seq('a, 'b)), + HashClusteredDistribution(Seq('a, 'b, 'c)), false) + } + test("Partitioning.numPartitions must match Distribution.requiredNumPartitions to satisfy it") { checkSatisfied( - RangePartitioning(Seq('a.asc, 'b.asc, 'c.asc), 10), - ClusteredDistribution(Seq('c, 'd)), + SinglePartition, + ClusteredDistribution(Seq('a, 'b, 'c), Some(10)), + false) + + checkSatisfied( + SinglePartition, + HashClusteredDistribution(Seq('a, 'b, 'c), Some(10)), + false) + + checkSatisfied( + HashPartitioning(Seq('a, 'b, 'c), 10), + ClusteredDistribution(Seq('a, 'b, 'c), Some(5)), + false) + + checkSatisfied( + HashPartitioning(Seq('a, 'b, 'c), 10), + HashClusteredDistribution(Seq('a, 'b, 'c), Some(5)), false) checkSatisfied( RangePartitioning(Seq('a.asc, 'b.asc, 'c.asc), 10), - AllTuples, + ClusteredDistribution(Seq('a, 'b, 'c), Some(5)), false) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourcePartitioning.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourcePartitioning.scala index 017a6737161a6..33079d5912506 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourcePartitioning.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourcePartitioning.scala @@ -30,8 +30,8 @@ class DataSourcePartitioning( override val numPartitions: Int = partitioning.numPartitions() - override def satisfies(required: physical.Distribution): Boolean = { - 
super.satisfies(required) || { + override def satisfies0(required: physical.Distribution): Boolean = { + super.satisfies0(required) || { required match { case d: physical.ClusteredDistribution if isCandidate(d.clustering) => val attrs = d.clustering.map(_.asInstanceOf[Attribute]) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala index fa7c8ee906ecd..1402eba8bbad2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala @@ -167,8 +167,8 @@ case class StreamingSymmetricHashJoinExec( val nullRight = new GenericInternalRow(right.output.map(_.withNullability(true)).length) override def requiredChildDistribution: Seq[Distribution] = - ClusteredDistribution(leftKeys, stateInfo.map(_.numPartitions)) :: - ClusteredDistribution(rightKeys, stateInfo.map(_.numPartitions)) :: Nil + HashClusteredDistribution(leftKeys, stateInfo.map(_.numPartitions)) :: + HashClusteredDistribution(rightKeys, stateInfo.map(_.numPartitions)) :: Nil override def output: Seq[Attribute] = joinType match { case _: InnerLike => left.output ++ right.output diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala index 92087f68ad74a..4af6dc378d998 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala @@ -406,6 +406,20 @@ class StreamingInnerJoinSuite extends StreamTest with StateStoreMetricsTest with AddData(input3, 5, 10), CheckLastBatch((5, 10, 5, 15, 5, 25))) } + + test("streaming join should require HashClusteredDistribution from children") { + val input1 = MemoryStream[Int] + val input2 = MemoryStream[Int] + + val df1 = input1.toDF.select('value as 'a, 'value * 2 as 'b) + val df2 = input2.toDF.select('value as 'a, 'value * 2 as 'b).repartition('b) + val joined = df1.join(df2, Seq("a", "b")).select('a) + + testStream(joined)( + AddData(input1, 1.to(1000): _*), + AddData(input2, 1.to(1000): _*), + CheckAnswer(1.to(1000): _*)) + } } class StreamingOuterJoinSuite extends StreamTest with StateStoreMetricsTest with BeforeAndAfter { From db538b25ae9016c624ed7c70a34dee2036d80d3a Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 25 Jun 2018 16:55:41 -0700 Subject: [PATCH 509/566] [SPARK-24552][CORE][SQL][BRANCH-2.3] Use unique id instead of attempt number for writes . This passes a unique attempt id instead of attempt number to v2 data sources and hadoop APIs, because attempt number is reused when stages are retried. When attempt numbers are reused, sources that track data by partition id and attempt number may incorrectly clean up data because the same attempt number can be both committed and aborted. Author: Marcelo Vanzin Closes #21615 from vanzin/SPARK-24552-2.3. 
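For illustration, a minimal sketch of the id encoding described above, assuming (as the patch notes) that neither attempt number exceeds Short.MaxValue:

```scala
// Sketch only (not part of the diff below): pack the stage attempt number into
// the high 16 bits so a retried stage cannot reuse an earlier attempt's id.
def uniqueAttemptId(stageAttemptNumber: Int, taskAttemptNumber: Int): Int =
  (stageAttemptNumber << 16) | taskAttemptNumber

uniqueAttemptId(0, 3)  // 3
uniqueAttemptId(1, 3)  // 65539 -- same task attempt number, different id after a stage retry
```

The diff below applies the same expression directly to `TaskContext` in both the Hadoop write path and the v2 data source write path.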
--- .../org/apache/spark/internal/io/SparkHadoopWriter.scala | 6 +++++- .../sql/execution/datasources/v2/WriteToDataSourceV2.scala | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala b/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala index abf39213fa0d2..9ebd0aa301592 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala @@ -76,13 +76,17 @@ object SparkHadoopWriter extends Logging { // Try to write all RDD partitions as a Hadoop OutputFormat. try { val ret = sparkContext.runJob(rdd, (context: TaskContext, iter: Iterator[(K, V)]) => { + // SPARK-24552: Generate a unique "attempt ID" based on the stage and task attempt numbers. + // Assumes that there won't be more than Short.MaxValue attempts, at least not concurrently. + val attemptId = (context.stageAttemptNumber << 16) | context.attemptNumber + executeTask( context = context, config = config, jobTrackerId = jobTrackerId, commitJobId = commitJobId, sparkPartitionId = context.partitionId, - sparkAttemptNumber = context.attemptNumber, + sparkAttemptNumber = attemptId, committer = committer, iterator = iter) }) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala index d02faacd9c19a..e4d2639632174 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala @@ -123,7 +123,10 @@ object DataWritingSparkTask extends Logging { writeTask: DataWriterFactory[InternalRow], context: TaskContext, iter: Iterator[InternalRow]): WriterCommitMessage = { - val dataWriter = writeTask.createDataWriter(context.partitionId(), context.attemptNumber()) + // SPARK-24552: Generate a unique "attempt ID" based on the stage and task attempt numbers. + // Assumes that there won't be more than Short.MaxValue attempts, at least not concurrently. + val attemptId = (context.stageAttemptNumber << 16) | context.attemptNumber + val dataWriter = writeTask.createDataWriter(context.partitionId(), attemptId) // write the data and commit this writer. Utils.tryWithSafeFinallyAndFailureCallbacks(block = { From 6e1f5e0182e1d111f7252c24c576674d1d2c7b91 Mon Sep 17 00:00:00 2001 From: Maryann Xue Date: Thu, 21 Jun 2018 11:45:30 -0700 Subject: [PATCH 510/566] [SPARK-24613][SQL] Cache with UDF could not be matched with subsequent dependent caches Wrap the logical plan with a `AnalysisBarrier` for execution plan compilation in CacheManager, in order to avoid the plan being analyzed again. Add one test in `DatasetCacheSuite` Author: Maryann Xue Closes #21602 from maryannxue/cache-mismatch. 
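For illustration, a hedged sketch of the user-facing scenario the new `DatasetCacheSuite` test exercises (variable names here are illustrative):

```scala
// Assumes spark.implicits._ and org.apache.spark.sql.functions._ are imported.
// df2 depends on the cached df, so compiling df2's cache should reuse df's
// InMemoryRelation rather than recomputing the UDF column.
val addOne = udf { x: Int => x + 1 }
val df  = spark.range(0, 10).toDF("a").withColumn("b", addOne($"a"))
val df2 = df.agg(sum(df("b")))

df.cache()
df.count()   // materialize the first cache
df2.cache()  // before this fix, the plan was analyzed again here, so the df
             // subtree no longer matched df's cache entry
```

Wrapping the plan in `AnalysisBarrier` keeps the already-analyzed plan intact during this second compilation, so the cache lookup succeeds.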
--- .../spark/sql/execution/CacheManager.scala | 6 +++--- .../org/apache/spark/sql/DatasetCacheSuite.scala | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala index d68aeb275afda..dbdda27d533f4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala @@ -26,7 +26,7 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.internal.Logging import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.catalyst.expressions.SubqueryExpression -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, ResolvedHint} +import org.apache.spark.sql.catalyst.plans.logical.{AnalysisBarrier, LogicalPlan, ResolvedHint} import org.apache.spark.sql.execution.columnar.InMemoryRelation import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.storage.StorageLevel @@ -97,7 +97,7 @@ class CacheManager extends Logging { val inMemoryRelation = InMemoryRelation( sparkSession.sessionState.conf.useCompression, sparkSession.sessionState.conf.columnBatchSize, storageLevel, - sparkSession.sessionState.executePlan(planToCache).executedPlan, + sparkSession.sessionState.executePlan(AnalysisBarrier(planToCache)).executedPlan, tableName, planToCache.stats) cachedData.add(CachedData(planToCache, inMemoryRelation)) @@ -146,7 +146,7 @@ class CacheManager extends Logging { useCompression = cd.cachedRepresentation.useCompression, batchSize = cd.cachedRepresentation.batchSize, storageLevel = cd.cachedRepresentation.storageLevel, - child = spark.sessionState.executePlan(cd.plan).executedPlan, + child = spark.sessionState.executePlan(AnalysisBarrier(cd.plan)).executedPlan, tableName = cd.cachedRepresentation.tableName, statsOfPlanToCache = cd.plan.stats) needToRecache += cd.copy(cachedRepresentation = newCache) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala index e0561ee2797a5..f6c760ed492a3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql +import org.apache.spark.sql.execution.columnar.{InMemoryRelation, InMemoryTableScanExec} import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.storage.StorageLevel @@ -96,4 +97,19 @@ class DatasetCacheSuite extends QueryTest with SharedSQLContext { agged.unpersist() assert(agged.storageLevel == StorageLevel.NONE, "The Dataset agged should not be cached.") } + + test("SPARK-24613 Cache with UDF could not be matched with subsequent dependent caches") { + val udf1 = udf({x: Int => x + 1}) + val df = spark.range(0, 10).toDF("a").withColumn("b", udf1($"a")) + val df2 = df.agg(sum(df("b"))) + + df.cache() + df.count() + df2.cache() + + val plan = df2.queryExecution.withCachedData + assert(plan.isInstanceOf[InMemoryRelation]) + val internalPlan = plan.asInstanceOf[InMemoryRelation].child + assert(internalPlan.find(_.isInstanceOf[InMemoryTableScanExec]).isDefined) + } } From 0f534d3da284947f52b62154daaddfa77b42873c Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Thu, 28 Jun 2018 09:59:00 +0800 Subject: 
[PATCH 511/566] [SPARK-24603][SQL] Fix findTightestCommonType reference in comments findTightestCommonTypeOfTwo has been renamed to findTightestCommonType ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Fokko Driesprong Closes #21597 from Fokko/fd-typo. (cherry picked from commit 6a97e8eb31da76fe5af912a6304c07b63735062f) Signed-off-by: hyukjinkwon --- .../sql/execution/datasources/json/JsonInferSchema.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonInferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonInferSchema.scala index a270a6451d5dd..1702fb6ec414d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonInferSchema.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonInferSchema.scala @@ -329,8 +329,8 @@ private[sql] object JsonInferSchema { ArrayType(compatibleType(elementType1, elementType2), containsNull1 || containsNull2) // The case that given `DecimalType` is capable of given `IntegralType` is handled in - // `findTightestCommonTypeOfTwo`. Both cases below will be executed only when - // the given `DecimalType` is not capable of the given `IntegralType`. + // `findTightestCommonType`. Both cases below will be executed only when the given + // `DecimalType` is not capable of the given `IntegralType`. case (t1: IntegralType, t2: DecimalType) => compatibleType(DecimalType.forType(t1), t2) case (t1: DecimalType, t2: IntegralType) => From 8ff4b97274e58f5944506b25481c6eb44238a4cd Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Fri, 29 Jun 2018 23:51:13 -0700 Subject: [PATCH 512/566] simplify rand in dsl/package.scala (cherry picked from commit d54d8b86301581142293341af25fd78b3278a2e8) Signed-off-by: Xiao Li --- .../main/scala/org/apache/spark/sql/catalyst/dsl/package.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index efb2eba655e15..89e8c998f740d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -149,6 +149,7 @@ package object dsl { } } + def rand(e: Long): Expression = Rand(e) def sum(e: Expression): Expression = Sum(e).toAggregateExpression() def sumDistinct(e: Expression): Expression = Sum(e).toAggregateExpression(isDistinct = true) def count(e: Expression): Expression = Count(e).toAggregateExpression() From 3c0af793f9e050f5d8dfb2f5daab6c0043c39748 Mon Sep 17 00:00:00 2001 From: maryannxue Date: Fri, 29 Jun 2018 23:46:12 -0700 Subject: [PATCH 513/566] [SPARK-24696][SQL] ColumnPruning rule fails to remove extra Project The ColumnPruning rule tries adding an extra Project if an input node produces fields more than needed, but as a post-processing step, it needs to remove the lower Project in the form of "Project - Filter - Project" otherwise it would conflict with PushPredicatesThroughProject and would thus cause a infinite optimization loop. 
The current post-processing method is defined as: ``` private def removeProjectBeforeFilter(plan: LogicalPlan): LogicalPlan = plan transform { case p1 Project(_, f Filter(_, p2 Project(_, child))) if p2.outputSet.subsetOf(child.outputSet) => p1.copy(child = f.copy(child = child)) } ``` This method works well when there is only one Filter but would not if there's two or more Filters. In this case, there is a deterministic filter and a non-deterministic filter so they stay as separate filter nodes and cannot be combined together. An simplified illustration of the optimization process that forms the infinite loop is shown below (F1 stands for the 1st filter, F2 for the 2nd filter, P for project, S for scan of relation, PredicatePushDown as abbrev. of PushPredicatesThroughProject): ``` F1 - F2 - P - S PredicatePushDown => F1 - P - F2 - S ColumnPruning => F1 - P - F2 - P - S => F1 - P - F2 - S (Project removed) PredicatePushDown => P - F1 - F2 - S ColumnPruning => P - F1 - P - F2 - S => P - F1 - P - F2 - P - S => P - F1 - F2 - P - S (only one Project removed) RemoveRedundantProject => F1 - F2 - P - S (goes back to the loop start) ``` So the problem is the ColumnPruning rule adds a Project under a Filter (and fails to remove it in the end), and that new Project triggers PushPredicateThroughProject. Once the filters have been push through the Project, a new Project will be added by the ColumnPruning rule and this goes on and on. The fix should be when adding Projects, the rule applies top-down, but later when removing extra Projects, the process should go bottom-up to ensure all extra Projects can be matched. Added a optimization rule test in ColumnPruningSuite; and a end-to-end test in SQLQuerySuite. Author: maryannxue Closes #21674 from maryannxue/spark-24696. --- .../sql/catalyst/optimizer/Optimizer.scala | 5 +++-- .../optimizer/ColumnPruningSuite.scala | 9 +++++++- .../org/apache/spark/sql/SQLQuerySuite.scala | 21 +++++++++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index c77e0f82dc253..38799c1d594a8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -524,9 +524,10 @@ object ColumnPruning extends Rule[LogicalPlan] { /** * The Project before Filter is not necessary but conflict with PushPredicatesThroughProject, - * so remove it. + * so remove it. Since the Projects have been added top-down, we need to remove in bottom-up + * order, otherwise lower Projects can be missed. 
*/ - private def removeProjectBeforeFilter(plan: LogicalPlan): LogicalPlan = plan transform { + private def removeProjectBeforeFilter(plan: LogicalPlan): LogicalPlan = plan transformUp { case p1 @ Project(_, f @ Filter(_, p2 @ Project(_, child))) if p2.outputSet.subsetOf(child.outputSet) => p1.copy(child = f.copy(child = child)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala index 3f41f4b144096..8b05ba32e6eef 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.optimizer import scala.reflect.runtime.universe.TypeTag -import org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder @@ -370,5 +369,13 @@ class ColumnPruningSuite extends PlanTest { comparePlans(optimized2, expected2.analyze) } + test("SPARK-24696 ColumnPruning rule fails to remove extra Project") { + val input = LocalRelation('key.int, 'value.string) + val query = input.select('key).where(rand(0L) > 0.5).where('key < 10).analyze + val optimized = Optimize.execute(query) + val expected = input.where(rand(0L) > 0.5).where('key < 10).select('key).analyze + comparePlans(optimized, expected) + } + // todo: add more tests for column pruning } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index bc57efeef69c1..5aabc98b714cb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2785,4 +2785,25 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } } + + test("SPARK-24696 ColumnPruning rule fails to remove extra Project") { + withTable("fact_stats", "dim_stats") { + val factData = Seq((1, 1, 99, 1), (2, 2, 99, 2), (3, 1, 99, 3), (4, 2, 99, 4)) + val storeData = Seq((1, "BW", "DE"), (2, "AZ", "US")) + spark.udf.register("filterND", udf((value: Int) => value > 2).asNondeterministic) + factData.toDF("date_id", "store_id", "product_id", "units_sold") + .write.mode("overwrite").partitionBy("store_id").format("parquet").saveAsTable("fact_stats") + storeData.toDF("store_id", "state_province", "country") + .write.mode("overwrite").format("parquet").saveAsTable("dim_stats") + val df = sql( + """ + |SELECT f.date_id, f.product_id, f.store_id FROM + |(SELECT date_id, product_id, store_id + | FROM fact_stats WHERE filterND(date_id)) AS f + |JOIN dim_stats s + |ON f.store_id = s.store_id WHERE s.country = 'DE' + """.stripMargin) + checkAnswer(df, Seq(Row(3, 99, 1))) + } + } } From 1cba0505ef36b83611c1f2ba0ad5e638980670ac Mon Sep 17 00:00:00 2001 From: Rekha Joshi Date: Mon, 2 Jul 2018 22:39:00 +0800 Subject: [PATCH 514/566] [SPARK-24507][DOCUMENTATION] Update streaming guide ## What changes were proposed in this pull request? Updated streaming guide for direct stream and link to integration guide. ## How was this patch tested? jekyll build Author: Rekha Joshi Closes #21683 from rekhajoshm/SPARK-24507. 
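As a small, hedged illustration of the distinction the added sentence draws (code not taken from the guide; names and numbers are illustrative):

```scala
// Receiver-based input stream: spread received batches over more partitions
// before further processing, as the surrounding paragraph describes.
val lines = ssc.socketTextStream("localhost", 9999)
val spread = lines.repartition(16)

// Direct Kafka stream: partitioning follows the Kafka topic's partitions,
// so see the Kafka integration guide linked by this change instead.
```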
(cherry picked from commit f599cde69506a5aedeeec449cba9a8b5ab128282) Signed-off-by: hyukjinkwon --- docs/streaming-programming-guide.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index ffda36d64a770..9efa9fcc86b3a 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -2176,6 +2176,8 @@ the input data stream (using `inputStream.repartition()`). This distributes the received batches of data across the specified number of machines in the cluster before further processing. +For direct stream, please refer to [Spark Streaming + Kafka Integration Guide](streaming-kafka-integration.html) + ### Level of Parallelism in Data Processing {:.no_toc} Cluster resources can be under-utilized if the number of parallel tasks used in any stage of the From bc7ee758698911d75ffda386ee8048b42b0f2681 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 3 Jul 2018 12:20:03 +0800 Subject: [PATCH 515/566] [SPARK-24385][SQL] Resolve self-join condition ambiguity for EqualNullSafe ## What changes were proposed in this pull request? In Dataset.join we have a small hack for resolving ambiguity in the column name for self-joins. The current code supports only `EqualTo`. The PR extends the fix to `EqualNullSafe`. Credit for this PR should be given to daniel-shields. ## How was this patch tested? added UT Author: Marco Gaido Closes #21605 from mgaido91/SPARK-24385_2. (cherry picked from commit a7c8f0c8cb144a026ea21e8780107e363ceacb8d) Signed-off-by: Wenchen Fan --- .../src/main/scala/org/apache/spark/sql/Dataset.scala | 5 +++++ .../scala/org/apache/spark/sql/DataFrameJoinSuite.scala | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 21c92506692e1..784abc8251995 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -995,6 +995,11 @@ class Dataset[T] private[sql]( catalyst.expressions.EqualTo( withPlan(plan.left).resolve(a.name), withPlan(plan.right).resolve(b.name)) + case catalyst.expressions.EqualNullSafe(a: AttributeReference, b: AttributeReference) + if a.sameRef(b) => + catalyst.expressions.EqualNullSafe( + withPlan(plan.left).resolve(a.name), + withPlan(plan.right).resolve(b.name)) }} withPlan { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala index 0d9eeabb397a1..10d9a11d2ee79 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala @@ -287,4 +287,12 @@ class DataFrameJoinSuite extends QueryTest with SharedSQLContext { dfOne.join(dfTwo, $"a" === $"b", "left").queryExecution.optimizedPlan } } + + test("SPARK-24385: Resolve ambiguity in self-joins with EqualNullSafe") { + withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "false") { + val df = spark.range(2) + // this throws an exception before the fix + df.join(df, df("id") <=> df("id")).queryExecution.optimizedPlan + } + } } From e5cc5f699bfc25e043d0da7c3e1a8c53a2fe550a Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Fri, 6 Jul 2018 00:08:03 -0700 Subject: [PATCH 516/566] [SPARK-24535][SPARKR] fix tests on java check error ## What changes were proposed in this pull request? 
change to skip tests if - couldn't determine java version fix problem on windows ## How was this patch tested? unit test, manual, win-builder Author: Felix Cheung Closes #21666 from felixcheung/rjavaskip. (cherry picked from commit 141953f4c44dbad1c2a7059e92bec5fe770af932) Signed-off-by: Felix Cheung --- R/pkg/R/client.R | 24 +++++++++++++++--------- R/pkg/R/sparkR.R | 2 +- R/pkg/inst/tests/testthat/test_basic.R | 8 ++++++++ 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R index 4c87f64e7f0e1..660f0864403e0 100644 --- a/R/pkg/R/client.R +++ b/R/pkg/R/client.R @@ -71,15 +71,20 @@ checkJavaVersion <- function() { # If java is missing from PATH, we get an error in Unix and a warning in Windows javaVersionOut <- tryCatch( - launchScript(javaBin, "-version", wait = TRUE, stdout = TRUE, stderr = TRUE), - error = function(e) { - stop("Java version check failed. Please make sure Java is installed", - " and set JAVA_HOME to point to the installation directory.", e) - }, - warning = function(w) { - stop("Java version check failed. Please make sure Java is installed", - " and set JAVA_HOME to point to the installation directory.", w) - }) + if (is_windows()) { + # See SPARK-24535 + system2(javaBin, "-version", wait = TRUE, stdout = TRUE, stderr = TRUE) + } else { + launchScript(javaBin, "-version", wait = TRUE, stdout = TRUE, stderr = TRUE) + }, + error = function(e) { + stop("Java version check failed. Please make sure Java is installed", + " and set JAVA_HOME to point to the installation directory.", e) + }, + warning = function(w) { + stop("Java version check failed. Please make sure Java is installed", + " and set JAVA_HOME to point to the installation directory.", w) + }) javaVersionFilter <- Filter( function(x) { grepl(" version", x) @@ -93,6 +98,7 @@ checkJavaVersion <- function() { stop(paste("Java version", sparkJavaVersion, "is required for this package; found version:", javaVersionStr)) } + return(javaVersionNum) } launchBackend <- function(args, sparkHome, jars, sparkSubmitOpts, packages) { diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 266fa46525e0a..ac7fdedcbbc52 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -170,7 +170,7 @@ sparkR.sparkContext <- function( submitOps <- getClientModeSparkSubmitOpts( Sys.getenv("SPARKR_SUBMIT_ARGS", "sparkr-shell"), sparkEnvirMap) - checkJavaVersion() + invisible(checkJavaVersion()) launchBackend( args = path, sparkHome = sparkHome, diff --git a/R/pkg/inst/tests/testthat/test_basic.R b/R/pkg/inst/tests/testthat/test_basic.R index 823d26f12feee..243f5f0298284 100644 --- a/R/pkg/inst/tests/testthat/test_basic.R +++ b/R/pkg/inst/tests/testthat/test_basic.R @@ -18,6 +18,10 @@ context("basic tests for CRAN") test_that("create DataFrame from list or data.frame", { + tryCatch( checkJavaVersion(), + error = function(e) { skip("error on Java check") }, + warning = function(e) { skip("warning on Java check") } ) + sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE, sparkConfig = sparkRTestConfig) @@ -50,6 +54,10 @@ test_that("create DataFrame from list or data.frame", { }) test_that("spark.glm and predict", { + tryCatch( checkJavaVersion(), + error = function(e) { skip("error on Java check") }, + warning = function(e) { skip("warning on Java check") } ) + sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE, sparkConfig = sparkRTestConfig) From 64c72b4de00b241fb4785e89ec3c4bbe90818293 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 7 Jul 2018 11:37:41 +0800 
Subject: [PATCH 517/566] [SPARK-24739][PYTHON] Make PySpark compatible with Python 3.7 ## What changes were proposed in this pull request? This PR proposes to make PySpark compatible with Python 3.7. There are rather radical change in semantic of `StopIteration` within a generator. It now throws it as a `RuntimeError`. To make it compatible, we should fix it: ```python try: next(...) except StopIteration return ``` See [release note](https://docs.python.org/3/whatsnew/3.7.html#porting-to-python-3-7) and [PEP 479](https://www.python.org/dev/peps/pep-0479/). ## How was this patch tested? Manually tested: ``` $ ./run-tests --python-executables=python3.7 Running PySpark tests. Output is in /.../spark/python/unit-tests.log Will test against the following Python executables: ['python3.7'] Will test the following Python modules: ['pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-sql', 'pyspark-streaming'] Starting test(python3.7): pyspark.mllib.tests Starting test(python3.7): pyspark.sql.tests Starting test(python3.7): pyspark.streaming.tests Starting test(python3.7): pyspark.tests Finished test(python3.7): pyspark.streaming.tests (130s) Starting test(python3.7): pyspark.accumulators Finished test(python3.7): pyspark.accumulators (8s) Starting test(python3.7): pyspark.broadcast Finished test(python3.7): pyspark.broadcast (9s) Starting test(python3.7): pyspark.conf Finished test(python3.7): pyspark.conf (6s) Starting test(python3.7): pyspark.context Finished test(python3.7): pyspark.context (27s) Starting test(python3.7): pyspark.ml.classification Finished test(python3.7): pyspark.tests (200s) ... 3 tests were skipped Starting test(python3.7): pyspark.ml.clustering Finished test(python3.7): pyspark.mllib.tests (244s) Starting test(python3.7): pyspark.ml.evaluation Finished test(python3.7): pyspark.ml.classification (63s) Starting test(python3.7): pyspark.ml.feature Finished test(python3.7): pyspark.ml.clustering (48s) Starting test(python3.7): pyspark.ml.fpm Finished test(python3.7): pyspark.ml.fpm (0s) Starting test(python3.7): pyspark.ml.image Finished test(python3.7): pyspark.ml.evaluation (23s) Starting test(python3.7): pyspark.ml.linalg.__init__ Finished test(python3.7): pyspark.ml.linalg.__init__ (0s) Starting test(python3.7): pyspark.ml.recommendation Finished test(python3.7): pyspark.ml.image (20s) Starting test(python3.7): pyspark.ml.regression Finished test(python3.7): pyspark.ml.regression (58s) Starting test(python3.7): pyspark.ml.stat Finished test(python3.7): pyspark.ml.feature (90s) Starting test(python3.7): pyspark.ml.tests Finished test(python3.7): pyspark.ml.recommendation (82s) Starting test(python3.7): pyspark.ml.tuning Finished test(python3.7): pyspark.ml.stat (27s) Starting test(python3.7): pyspark.mllib.classification Finished test(python3.7): pyspark.sql.tests (362s) ... 
102 tests were skipped Starting test(python3.7): pyspark.mllib.clustering Finished test(python3.7): pyspark.ml.tuning (29s) Starting test(python3.7): pyspark.mllib.evaluation Finished test(python3.7): pyspark.mllib.classification (39s) Starting test(python3.7): pyspark.mllib.feature Finished test(python3.7): pyspark.mllib.evaluation (30s) Starting test(python3.7): pyspark.mllib.fpm Finished test(python3.7): pyspark.mllib.feature (44s) Starting test(python3.7): pyspark.mllib.linalg.__init__ Finished test(python3.7): pyspark.mllib.linalg.__init__ (0s) Starting test(python3.7): pyspark.mllib.linalg.distributed Finished test(python3.7): pyspark.mllib.clustering (78s) Starting test(python3.7): pyspark.mllib.random Finished test(python3.7): pyspark.mllib.fpm (33s) Starting test(python3.7): pyspark.mllib.recommendation Finished test(python3.7): pyspark.mllib.random (12s) Starting test(python3.7): pyspark.mllib.regression Finished test(python3.7): pyspark.mllib.linalg.distributed (45s) Starting test(python3.7): pyspark.mllib.stat.KernelDensity Finished test(python3.7): pyspark.mllib.stat.KernelDensity (0s) Starting test(python3.7): pyspark.mllib.stat._statistics Finished test(python3.7): pyspark.mllib.recommendation (41s) Starting test(python3.7): pyspark.mllib.tree Finished test(python3.7): pyspark.mllib.regression (44s) Starting test(python3.7): pyspark.mllib.util Finished test(python3.7): pyspark.mllib.stat._statistics (20s) Starting test(python3.7): pyspark.profiler Finished test(python3.7): pyspark.mllib.tree (26s) Starting test(python3.7): pyspark.rdd Finished test(python3.7): pyspark.profiler (11s) Starting test(python3.7): pyspark.serializers Finished test(python3.7): pyspark.mllib.util (24s) Starting test(python3.7): pyspark.shuffle Finished test(python3.7): pyspark.shuffle (0s) Starting test(python3.7): pyspark.sql.catalog Finished test(python3.7): pyspark.serializers (15s) Starting test(python3.7): pyspark.sql.column Finished test(python3.7): pyspark.rdd (27s) Starting test(python3.7): pyspark.sql.conf Finished test(python3.7): pyspark.sql.catalog (24s) Starting test(python3.7): pyspark.sql.context Finished test(python3.7): pyspark.sql.conf (8s) Starting test(python3.7): pyspark.sql.dataframe Finished test(python3.7): pyspark.sql.column (29s) Starting test(python3.7): pyspark.sql.functions Finished test(python3.7): pyspark.sql.context (26s) Starting test(python3.7): pyspark.sql.group Finished test(python3.7): pyspark.sql.dataframe (51s) Starting test(python3.7): pyspark.sql.readwriter Finished test(python3.7): pyspark.ml.tests (266s) Starting test(python3.7): pyspark.sql.session Finished test(python3.7): pyspark.sql.group (36s) Starting test(python3.7): pyspark.sql.streaming Finished test(python3.7): pyspark.sql.functions (57s) Starting test(python3.7): pyspark.sql.types Finished test(python3.7): pyspark.sql.session (25s) Starting test(python3.7): pyspark.sql.udf Finished test(python3.7): pyspark.sql.types (10s) Starting test(python3.7): pyspark.sql.window Finished test(python3.7): pyspark.sql.readwriter (31s) Starting test(python3.7): pyspark.streaming.util Finished test(python3.7): pyspark.sql.streaming (22s) Starting test(python3.7): pyspark.util Finished test(python3.7): pyspark.util (0s) Finished test(python3.7): pyspark.streaming.util (0s) Finished test(python3.7): pyspark.sql.udf (16s) Finished test(python3.7): pyspark.sql.window (12s) ``` In my local (I have two Macs but both have the same issues), I currently faced some issues for now to install both extra dependencies PyArrow 
and Pandas same as Jenkins's, against Python 3.7. Author: hyukjinkwon Closes #21714 from HyukjinKwon/SPARK-24739. (cherry picked from commit 74f6a92fcea9196d62c2d531c11ec7efd580b760) Signed-off-by: hyukjinkwon --- python/pyspark/rdd.py | 5 ++++- python/setup.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 8dc30a42f74e0..eadb5aba89a03 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -1368,7 +1368,10 @@ def takeUpToNumLeft(iterator): iterator = iter(iterator) taken = 0 while taken < left: - yield next(iterator) + try: + yield next(iterator) + except StopIteration: + return taken += 1 p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts)) diff --git a/python/setup.py b/python/setup.py index 80cf0d7b329bd..e599fe607d63f 100644 --- a/python/setup.py +++ b/python/setup.py @@ -219,6 +219,7 @@ def _supports_symlinks(): 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: PyPy'] ) From 4df06b45160241dbb331153efbb25703f913c192 Mon Sep 17 00:00:00 2001 From: Saisai Shao Date: Sun, 8 Jul 2018 01:24:42 +0000 Subject: [PATCH 518/566] Preparing Spark release v2.3.2-rc1 --- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 2 +- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 40 files changed, 40 insertions(+), 40 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index 02bf39bcb96f3..57485fc003097 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 646fdfb23ef19..53e58c2f5d374 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 76c7dcf52203f..d05647c601807 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/network-shuffle/pom.xml 
b/common/network-shuffle/pom.xml index f2661fedd6f75..8d46761151abe 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 229d4667169d2..11c290d64590a 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index febec1897c59c..786e959a48cce 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 4dec96f86d9c3..78d5a5eaf2335 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 7187313c5c2f7..11259d923900e 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index a9cc91f87566b..276cf10aa205a 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index 8579166c2e635..bca466d19fa36 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,7 +14,7 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 2.3.2-SNAPSHOT +SPARK_VERSION: 2.3.2 SPARK_VERSION_SHORT: 2.3.2 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" diff --git a/examples/pom.xml b/examples/pom.xml index b873bc9b20322..7324f3a780e49 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index b7dc48e4a7001..a7a743129e2c4 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 161fd0faac82a..5b8661c350554 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 26abc31f9620d..5b924e4d73c5c 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index bcc13ebb359b3..70e0c423faa3e 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 020e0ba7d568b..3c99f4821d4f9 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git 
a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index e12115ee6e20c..65b8a3924b172 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 07d205b01dfde..b0e857ba7534e 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 11d971a0de046..6f299a2d7b6c7 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 70eb580d474af..de2023837594a 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 0728baede26f0..5ee942480a62a 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index c6925851ebc23..b8a15c5b10dbd 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 8b95f1be216c7..535104b32328c 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index b5253fd8954f3..01d140b3c7de8 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index dacd42126393b..33f18dbe59a1a 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 2fcb367cf301c..77970b2f18303 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 8ecd126ebb9b2..be5d767e4c84a 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index f52b83475c515..a4b6f95c9254b 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/pom.xml b/pom.xml index 0e9354891f1a3..381b0f7c84aad 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index ed1cbdd58f252..94a128ecb10af 100644 --- a/python/pyspark/version.py +++ 
b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.3.2.dev0" +__version__ = "2.3.2" diff --git a/repl/pom.xml b/repl/pom.xml index bc10cce2d1aea..a64270a800e4d 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 3e0aa0872b99b..8b6232e6772dc 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index cf153476ce9f9..b68f817b44f69 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 71ddbb5d4db08..2972d61b6b2d4 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 04a3380b02115..be9d087b6393f 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index c9a05cc9e43f6..2638b56da1f8b 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 8a0c6fba66cda..43cfd529ebdd1 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 2eeff747a68c6..b26f5bb942824 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index d16f48dec205f..77e6435fe9f44 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index f5927c5e41939..24d4e1d47b744 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2-SNAPSHOT + 2.3.2 ../pom.xml From 72eb97ce96c27e296aa6ed8198246000a78fb10b Mon Sep 17 00:00:00 2001 From: Saisai Shao Date: Sun, 8 Jul 2018 01:24:55 +0000 Subject: [PATCH 519/566] Preparing development version 2.3.3-SNAPSHOT --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 
2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 8df26351614ec..6ec496620c8bd 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.2 +Version: 2.3.3 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index 57485fc003097..f8b15cc3632ab 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 53e58c2f5d374..e412a4727b385 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index d05647c601807..d8f9a3d2015c1 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 8d46761151abe..a1a4f87987119 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 11c290d64590a..e650978f1a2e5 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 786e959a48cce..350e3cb8fef99 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 78d5a5eaf2335..e7fea4125cf69 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 11259d923900e..601cc5d998b59 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 276cf10aa205a..2a7e6442f2f6f 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index bca466d19fa36..7629f5f8c5818 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These 
allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 2.3.2 -SPARK_VERSION_SHORT: 2.3.2 +SPARK_VERSION: 2.3.3-SNAPSHOT +SPARK_VERSION_SHORT: 2.3.3 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 7324f3a780e49..7ecec708ac6c1 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index a7a743129e2c4..30816bba29627 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 5b8661c350554..4a2cb1361db27 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 5b924e4d73c5c..5199aba2f5f8a 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 70e0c423faa3e..3f8098887b64f 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 3c99f4821d4f9..f9874cd223938 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 65b8a3924b172..499f1bc4c0907 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index b0e857ba7534e..c4caff853ca1a 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 6f299a2d7b6c7..a09d23d917379 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index de2023837594a..6660c73a677c8 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 5ee942480a62a..f08fe460f9d9b 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index b8a15c5b10dbd..3fb0def7d45e5 100644 --- a/external/kinesis-asl/pom.xml +++ 
b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 535104b32328c..4a0aa06e621c3 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 01d140b3c7de8..5de97fe7b7824 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 33f18dbe59a1a..d1f40132f2c6f 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 77970b2f18303..96f4081cc5f94 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index be5d767e4c84a..43ef77facca8c 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index a4b6f95c9254b..01ebc1ae90aa0 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 381b0f7c84aad..ea6f27c367445 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 94a128ecb10af..ca37f0e98320d 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.2" +__version__ = "2.3.3.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index a64270a800e4d..9c16fe4fb546d 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 8b6232e6772dc..d7fb35f4559df 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index b68f817b44f69..84ce020dff77b 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 2972d61b6b2d4..b09d0031f4cf9 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index be9d087b6393f..2a86f6f1d42d3 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 2638b56da1f8b..64aeb7ef10d92 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 43cfd529ebdd1..a9fccdd327880 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index b26f5bb942824..5faa52541da3b 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 77e6435fe9f44..f73d618b6e41c 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 24d4e1d47b744..8d98ba7161979 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml From 19542f5de390f6096745e478dd9339958db899e8 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Wed, 11 Jul 2018 10:10:07 +0800 Subject: [PATCH 520/566] [SPARK-24530][PYTHON] Add a control to force Python version in Sphinx via environment variable, SPHINXPYTHON ## What changes were proposed in this pull request? This PR proposes to add `SPHINXPYTHON` environment variable to control the Python version used by Sphinx. The motivation of this environment variable is, it seems not properly rendering some signatures in the Python documentation when Python 2 is used by Sphinx. See the JIRA's case. It should be encouraged to use Python 3, but looks we will probably live with this problem for a long while in any event. For the default case of `make html`, it keeps previous behaviour and use `SPHINXBUILD` as it was. If `SPHINXPYTHON` is set, then it forces Sphinx to use the specific Python version. 
``` $ SPHINXPYTHON=python3 make html python3 -msphinx -b html -d _build/doctrees . _build/html Running Sphinx v1.7.5 ... ``` 1. if `SPHINXPYTHON` is set, use Python. If `SPHINXBUILD` is set, use sphinx-build. 2. If both are set, `SPHINXBUILD` has a higher priority over `SPHINXPYTHON` 3. By default, `SPHINXBUILD` is used as 'sphinx-build'. Probably, we can somehow work around this via explicitly setting `SPHINXBUILD` but `sphinx-build` can't be easily distinguished since it (at least in my environment and up to my knowledge) doesn't replace `sphinx-build` when newer Sphinx is installed in different Python version. It confuses and doesn't warn for its Python version. ## How was this patch tested? Manually tested: **`python` (Python 2.7) in the path with Sphinx:** ``` $ make html sphinx-build -b html -d _build/doctrees . _build/html Running Sphinx v1.7.5 ... ``` **`python` (Python 2.7) in the path without Sphinx:** ``` $ make html Makefile:8: *** The 'sphinx-build' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the 'sphinx-build' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/. Stop. ``` **`SPHINXPYTHON` set `python` (Python 2.7) with Sphinx:** ``` $ SPHINXPYTHON=python make html Makefile:35: *** Note that Python 3 is required to generate PySpark documentation correctly for now. Current Python executable was less than Python 3. See SPARK-24530. To force Sphinx to use a specific Python executable, please set SPHINXPYTHON to point to the Python 3 executable.. Stop. ``` **`SPHINXPYTHON` set `python` (Python 2.7) without Sphinx:** ``` $ SPHINXPYTHON=python make html Makefile:35: *** Note that Python 3 is required to generate PySpark documentation correctly for now. Current Python executable was less than Python 3. See SPARK-24530. To force Sphinx to use a specific Python executable, please set SPHINXPYTHON to point to the Python 3 executable.. Stop. ``` **`SPHINXPYTHON` set `python3` with Sphinx:** ``` $ SPHINXPYTHON=python3 make html python3 -msphinx -b html -d _build/doctrees . _build/html Running Sphinx v1.7.5 ... ``` **`SPHINXPYTHON` set `python3` without Sphinx:** ``` $ SPHINXPYTHON=python3 make html Makefile:39: *** Python executable 'python3' did not have Sphinx installed. Make sure you have Sphinx installed, then set the SPHINXPYTHON environment variable to point to the Python executable having Sphinx installed. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/. Stop. ``` **`SPHINXBUILD` set:** ``` $ SPHINXBUILD=sphinx-build make html sphinx-build -b html -d _build/doctrees . _build/html Running Sphinx v1.7.5 ... ``` **Both `SPHINXPYTHON` and `SPHINXBUILD` are set:** ``` $ SPHINXBUILD=sphinx-build SPHINXPYTHON=python make html sphinx-build -b html -d _build/doctrees . _build/html Running Sphinx v1.7.5 ... ``` Author: hyukjinkwon Closes #21659 from HyukjinKwon/SPARK-24530. 
(cherry picked from commit 1f94bf492c3bce3b61f7fec6132b50e06dea94a8) Signed-off-by: hyukjinkwon --- python/docs/Makefile | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/python/docs/Makefile b/python/docs/Makefile index b8e079483c90c..1ed1f33af2326 100644 --- a/python/docs/Makefile +++ b/python/docs/Makefile @@ -1,19 +1,44 @@ # Makefile for Sphinx documentation # +ifndef SPHINXBUILD +ifndef SPHINXPYTHON +SPHINXBUILD = sphinx-build +endif +endif + +ifdef SPHINXBUILD +# User-friendly check for sphinx-build. +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif +else +# Note that there is an issue with Python version and Sphinx in PySpark documentation generation. +# Please remove this check below when this issue is fixed. See SPARK-24530 for more details. +PYTHON_VERSION_CHECK = $(shell $(SPHINXPYTHON) -c 'import sys; print(sys.version_info < (3, 0, 0))') +ifeq ($(PYTHON_VERSION_CHECK), True) +$(error Note that Python 3 is required to generate PySpark documentation correctly for now. Current Python executable was less than Python 3. See SPARK-24530. To force Sphinx to use a specific Python executable, please set SPHINXPYTHON to point to the Python 3 executable.) +endif +# Check if Sphinx is installed. +ifeq ($(shell $(SPHINXPYTHON) -c 'import sphinx' >/dev/null 2>&1; echo $$?), 1) +$(error Python executable '$(SPHINXPYTHON)' did not have Sphinx installed. Make sure you have Sphinx installed, then set the SPHINXPYTHON environment variable to point to the Python executable having Sphinx installed. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif +# Use 'SPHINXPYTHON -msphinx' instead of 'sphinx-build'. See https://github.com/sphinx-doc/sphinx/pull/3523 for more details. +SPHINXBUILD = $(SPHINXPYTHON) -msphinx +endif + # You can set these variables from the command line. SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build PAPER ?= BUILDDIR ?= _build +# You can set SPHINXBUILD to specify Sphinx build executable or SPHINXPYTHON to specify the Python executable used in Sphinx. +# They follow: +# 1. if SPHINXPYTHON is set, use Python. If SPHINXBUILD is set, use sphinx-build. +# 2. If both are set, SPHINXBUILD has a higher priority over SPHINXPYTHON +# 3. By default, SPHINXBUILD is used as 'sphinx-build'. export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.10.7-src.zip) -# User-friendly check for sphinx-build -ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) -$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) -endif - # Internal variables. 
PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter From 307499e1a99c6ad3ce0b978626894ea2c1e3807e Mon Sep 17 00:00:00 2001 From: Saisai Shao Date: Wed, 11 Jul 2018 05:27:02 +0000 Subject: [PATCH 521/566] Preparing Spark release v2.3.2-rc2 --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 6ec496620c8bd..8df26351614ec 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.3 +Version: 2.3.2 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. 
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index f8b15cc3632ab..57485fc003097 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index e412a4727b385..53e58c2f5d374 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index d8f9a3d2015c1..d05647c601807 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index a1a4f87987119..8d46761151abe 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index e650978f1a2e5..11c290d64590a 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 350e3cb8fef99..786e959a48cce 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index e7fea4125cf69..78d5a5eaf2335 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 601cc5d998b59..11259d923900e 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 2a7e6442f2f6f..276cf10aa205a 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index 7629f5f8c5818..bca466d19fa36 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.3-SNAPSHOT -SPARK_VERSION_SHORT: 2.3.3 +SPARK_VERSION: 2.3.2 +SPARK_VERSION_SHORT: 2.3.2 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 7ecec708ac6c1..7324f3a780e49 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 30816bba29627..a7a743129e2c4 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 4a2cb1361db27..5b8661c350554 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 5199aba2f5f8a..5b924e4d73c5c 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 3f8098887b64f..70e0c423faa3e 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index f9874cd223938..3c99f4821d4f9 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 499f1bc4c0907..65b8a3924b172 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index c4caff853ca1a..b0e857ba7534e 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index a09d23d917379..6f299a2d7b6c7 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 6660c73a677c8..de2023837594a 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index f08fe460f9d9b..5ee942480a62a 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 3fb0def7d45e5..b8a15c5b10dbd 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 4a0aa06e621c3..535104b32328c 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 5de97fe7b7824..01d140b3c7de8 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index d1f40132f2c6f..33f18dbe59a1a 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 96f4081cc5f94..77970b2f18303 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 43ef77facca8c..be5d767e4c84a 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 01ebc1ae90aa0..a4b6f95c9254b 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/pom.xml b/pom.xml index ea6f27c367445..381b0f7c84aad 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index ca37f0e98320d..94a128ecb10af 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.3.dev0" +__version__ = "2.3.2" diff --git a/repl/pom.xml b/repl/pom.xml index 9c16fe4fb546d..a64270a800e4d 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index d7fb35f4559df..8b6232e6772dc 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 84ce020dff77b..b68f817b44f69 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index b09d0031f4cf9..2972d61b6b2d4 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 2a86f6f1d42d3..be9d087b6393f 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 64aeb7ef10d92..2638b56da1f8b 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index a9fccdd327880..43cfd529ebdd1 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 5faa52541da3b..b26f5bb942824 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index f73d618b6e41c..77e6435fe9f44 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 8d98ba7161979..24d4e1d47b744 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml From 86457a16de20eb126d0569d12f51b2e427fd03c3 Mon Sep 17 00:00:00 2001 From: Saisai Shao Date: Wed, 11 Jul 2018 05:27:12 +0000 Subject: [PATCH 522/566] Preparing development version 2.3.3-SNAPSHOT --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- 
external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 8df26351614ec..6ec496620c8bd 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.2 +Version: 2.3.3 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index 57485fc003097..f8b15cc3632ab 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 53e58c2f5d374..e412a4727b385 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index d05647c601807..d8f9a3d2015c1 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 8d46761151abe..a1a4f87987119 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 11c290d64590a..e650978f1a2e5 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 786e959a48cce..350e3cb8fef99 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 78d5a5eaf2335..e7fea4125cf69 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 11259d923900e..601cc5d998b59 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 276cf10aa205a..2a7e6442f2f6f 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index bca466d19fa36..7629f5f8c5818 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.2 -SPARK_VERSION_SHORT: 2.3.2 +SPARK_VERSION: 2.3.3-SNAPSHOT +SPARK_VERSION_SHORT: 2.3.3 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 7324f3a780e49..7ecec708ac6c1 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index a7a743129e2c4..30816bba29627 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 5b8661c350554..4a2cb1361db27 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 5b924e4d73c5c..5199aba2f5f8a 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 70e0c423faa3e..3f8098887b64f 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 3c99f4821d4f9..f9874cd223938 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 65b8a3924b172..499f1bc4c0907 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index b0e857ba7534e..c4caff853ca1a 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 6f299a2d7b6c7..a09d23d917379 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index de2023837594a..6660c73a677c8 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 5ee942480a62a..f08fe460f9d9b 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index b8a15c5b10dbd..3fb0def7d45e5 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 535104b32328c..4a0aa06e621c3 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 01d140b3c7de8..5de97fe7b7824 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 33f18dbe59a1a..d1f40132f2c6f 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 77970b2f18303..96f4081cc5f94 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index be5d767e4c84a..43ef77facca8c 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index a4b6f95c9254b..01ebc1ae90aa0 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 381b0f7c84aad..ea6f27c367445 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 94a128ecb10af..ca37f0e98320d 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.2" +__version__ = "2.3.3.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index a64270a800e4d..9c16fe4fb546d 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 8b6232e6772dc..d7fb35f4559df 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index b68f817b44f69..84ce020dff77b 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 2972d61b6b2d4..b09d0031f4cf9 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index be9d087b6393f..2a86f6f1d42d3 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 2638b56da1f8b..64aeb7ef10d92 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 43cfd529ebdd1..a9fccdd327880 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index b26f5bb942824..5faa52541da3b 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 77e6435fe9f44..f73d618b6e41c 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 24d4e1d47b744..8d98ba7161979 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml From 32429256f3e659c648462e5b2740747645740c97 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Wed, 11 Jul 2018 09:29:19 -0700 Subject: [PATCH 523/566] [SPARK-24208][SQL] Fix attribute deduplication for FlatMapGroupsInPandas A self-join on a dataset which contains a `FlatMapGroupsInPandas` fails because of duplicate attributes. This happens because we are not dealing with this specific case in our `dedupAttr` rules. The PR fix the issue by adding the management of the specific case added UT + manual tests Author: Marco Gaido Author: Marco Gaido Closes #21737 from mgaido91/SPARK-24208. 
(cherry picked from commit ebf4bfb966389342bfd9bdb8e3b612828c18730c) Signed-off-by: Xiao Li --- python/pyspark/sql/tests.py | 16 ++++++++++++++++ .../spark/sql/catalyst/analysis/Analyzer.scala | 4 ++++ .../apache/spark/sql/GroupedDatasetSuite.scala | 12 ++++++++++++ 3 files changed, 32 insertions(+) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index aa7d8eba1f692..6bfb329b5c81d 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -4691,6 +4691,22 @@ def test_timestamp_dst(self): result = df.groupby('time').apply(foo_udf).sort('time') self.assertPandasEqual(df.toPandas(), result.toPandas()) + def test_self_join_with_pandas(self): + import pyspark.sql.functions as F + + @F.pandas_udf('key long, col string', F.PandasUDFType.GROUPED_MAP) + def dummy_pandas_udf(df): + return df[['key', 'col']] + + df = self.spark.createDataFrame([Row(key=1, col='A'), Row(key=1, col='B'), + Row(key=2, col='C')]) + dfWithPandas = df.groupBy('key').apply(dummy_pandas_udf) + + # this was throwing an AnalysisException before SPARK-24208 + res = dfWithPandas.alias('temp0').join(dfWithPandas.alias('temp1'), + F.col('temp0.key') == F.col('temp1.key')) + self.assertEquals(res.count(), 5) + if __name__ == "__main__": from pyspark.sql.tests import * diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 8597d83d83000..a584cb8d7a7c8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -723,6 +723,10 @@ class Analyzer( if findAliases(aggregateExpressions).intersect(conflictingAttributes).nonEmpty => (oldVersion, oldVersion.copy(aggregateExpressions = newAliases(aggregateExpressions))) + case oldVersion @ FlatMapGroupsInPandas(_, _, output, _) + if oldVersion.outputSet.intersect(conflictingAttributes).nonEmpty => + (oldVersion, oldVersion.copy(output = output.map(_.newInstance()))) + case oldVersion: Generate if oldVersion.producedAttributes.intersect(conflictingAttributes).nonEmpty => val newOutput = oldVersion.generatorOutput.map(_.newInstance()) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/GroupedDatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/GroupedDatasetSuite.scala index 218a1b7248f12..9699fadb1db5d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/GroupedDatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/GroupedDatasetSuite.scala @@ -93,4 +93,16 @@ class GroupedDatasetSuite extends QueryTest with SharedSQLContext { } datasetWithUDF.unpersist(true) } + + test("SPARK-24208: analysis fails on self-join with FlatMapGroupsInPandas") { + val df = datasetWithUDF.groupBy("s").flatMapGroupsInPandas(PythonUDF( + "pyUDF", + null, + StructType(Seq(StructField("s", LongType))), + Seq.empty, + PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, + true)) + val df1 = df.alias("temp0").join(df.alias("temp1"), $"temp0.s" === $"temp1.s") + df1.queryExecution.assertAnalyzed() + } } From 9cf375f5be3d359912bde9b6ba5766425e8cf3bb Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 13 Jul 2018 08:25:00 -0700 Subject: [PATCH 524/566] [SPARK-24781][SQL] Using a reference from Dataset in Filter/Sort might not work ## What changes were proposed in this pull request? 
When we use a reference from Dataset in filter or sort, which was not used in the prior select, an AnalysisException occurs, e.g., ```scala val df = Seq(("test1", 0), ("test2", 1)).toDF("name", "id") df.select(df("name")).filter(df("id") === 0).show() ``` ```scala org.apache.spark.sql.AnalysisException: Resolved attribute(s) id#6 missing from name#5 in operator !Filter (id#6 = 0).;; !Filter (id#6 = 0) +- AnalysisBarrier +- Project [name#5] +- Project [_1#2 AS name#5, _2#3 AS id#6] +- LocalRelation [_1#2, _2#3] ``` This change updates the rule `ResolveMissingReferences` so `Filter` and `Sort` with non-empty `missingInputs` will also be transformed. ## How was this patch tested? Added tests. Author: Liang-Chi Hsieh Closes #21745 from viirya/SPARK-24781. (cherry picked from commit dfd7ac9887f89b9b51b7b143ab54d01f11cfcdb5) Signed-off-by: Xiao Li --- .../sql/catalyst/analysis/Analyzer.scala | 30 ++++++++++++++----- .../org/apache/spark/sql/DataFrameSuite.scala | 25 ++++++++++++++++ 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index a584cb8d7a7c8..2858bee890d04 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1114,7 +1114,8 @@ class Analyzer( case sa @ Sort(_, _, AnalysisBarrier(child: Aggregate)) => sa case sa @ Sort(_, _, child: Aggregate) => sa - case s @ Sort(order, _, child) if !s.resolved && child.resolved => + case s @ Sort(order, _, child) + if (!s.resolved || s.missingInput.nonEmpty) && child.resolved => val (newOrder, newChild) = resolveExprsAndAddMissingAttrs(order, child) val ordering = newOrder.map(_.asInstanceOf[SortOrder]) if (child.output == newChild.output) { @@ -1125,7 +1126,7 @@ class Analyzer( Project(child.output, newSort) } - case f @ Filter(cond, child) if !f.resolved && child.resolved => + case f @ Filter(cond, child) if (!f.resolved || f.missingInput.nonEmpty) && child.resolved => val (newCond, newChild) = resolveExprsAndAddMissingAttrs(Seq(cond), child) if (child.output == newChild.output) { f.copy(condition = newCond.head) @@ -1136,10 +1137,17 @@ class Analyzer( } } + /** + * This method tries to resolve expressions and find missing attributes recursively. Specially, + * when the expressions used in `Sort` or `Filter` contain unresolved attributes or resolved + * attributes which are missed from child output. This method tries to find the missing + * attributes out and add into the projection. + */ private def resolveExprsAndAddMissingAttrs( exprs: Seq[Expression], plan: LogicalPlan): (Seq[Expression], LogicalPlan) = { - if (exprs.forall(_.resolved)) { - // All given expressions are resolved, no need to continue anymore. + // Missing attributes can be unresolved attributes or resolved attributes which are not in + // the output attributes of the plan. + if (exprs.forall(e => e.resolved && e.references.subsetOf(plan.outputSet))) { (exprs, plan) } else { plan match { @@ -1150,15 +1158,19 @@ class Analyzer( (newExprs, AnalysisBarrier(newChild)) case p: Project => + // Resolving expressions against current plan. val maybeResolvedExprs = exprs.map(resolveExpression(_, p)) + // Recursively resolving expressions on the child of current plan. 
val (newExprs, newChild) = resolveExprsAndAddMissingAttrs(maybeResolvedExprs, p.child) - val missingAttrs = AttributeSet(newExprs) -- AttributeSet(maybeResolvedExprs) + // If some attributes used by expressions are resolvable only on the rewritten child + // plan, we need to add them into original projection. + val missingAttrs = (AttributeSet(newExprs) -- p.outputSet).intersect(newChild.outputSet) (newExprs, Project(p.projectList ++ missingAttrs, newChild)) case a @ Aggregate(groupExprs, aggExprs, child) => val maybeResolvedExprs = exprs.map(resolveExpression(_, a)) val (newExprs, newChild) = resolveExprsAndAddMissingAttrs(maybeResolvedExprs, child) - val missingAttrs = AttributeSet(newExprs) -- AttributeSet(maybeResolvedExprs) + val missingAttrs = (AttributeSet(newExprs) -- a.outputSet).intersect(newChild.outputSet) if (missingAttrs.forall(attr => groupExprs.exists(_.semanticEquals(attr)))) { // All the missing attributes are grouping expressions, valid case. (newExprs, a.copy(aggregateExpressions = aggExprs ++ missingAttrs, child = newChild)) @@ -1478,7 +1490,11 @@ class Analyzer( // Try resolving the ordering as though it is in the aggregate clause. try { - val unresolvedSortOrders = sortOrder.filter(s => !s.resolved || containsAggregate(s)) + // If a sort order is unresolved, containing references not in aggregate, or containing + // `AggregateExpression`, we need to push down it to the underlying aggregate operator. + val unresolvedSortOrders = sortOrder.filter { s => + !s.resolved || !s.references.subsetOf(aggregate.outputSet) || containsAggregate(s) + } val aliasedOrdering = unresolvedSortOrders.map(o => Alias(o.child, "aggOrder")()) val aggregatedOrdering = aggregate.copy(aggregateExpressions = aliasedOrdering) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index fd624ad8ef68d..9f8d337ead09d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -2275,4 +2275,29 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { val mapWithBinaryKey = map(lit(Array[Byte](1.toByte)), lit(1)) checkAnswer(spark.range(1).select(mapWithBinaryKey.getItem(Array[Byte](1.toByte))), Row(1)) } + + test("SPARK-24781: Using a reference from Dataset in Filter/Sort") { + val df = Seq(("test1", 0), ("test2", 1)).toDF("name", "id") + val filter1 = df.select(df("name")).filter(df("id") === 0) + val filter2 = df.select(col("name")).filter(col("id") === 0) + checkAnswer(filter1, filter2.collect()) + + val sort1 = df.select(df("name")).orderBy(df("id")) + val sort2 = df.select(col("name")).orderBy(col("id")) + checkAnswer(sort1, sort2.collect()) + } + + test("SPARK-24781: Using a reference not in aggregation in Filter/Sort") { + withSQLConf(SQLConf.DATAFRAME_RETAIN_GROUP_COLUMNS.key -> "false") { + val df = Seq(("test1", 0), ("test2", 1)).toDF("name", "id") + + val aggPlusSort1 = df.groupBy(df("name")).agg(count(df("name"))).orderBy(df("name")) + val aggPlusSort2 = df.groupBy(col("name")).agg(count(col("name"))).orderBy(col("name")) + checkAnswer(aggPlusSort1, aggPlusSort2.collect()) + + val aggPlusFilter1 = df.groupBy(df("name")).agg(count(df("name"))).filter(df("name") === 0) + val aggPlusFilter2 = df.groupBy(col("name")).agg(count(col("name"))).filter(col("name") === 0) + checkAnswer(aggPlusFilter1, aggPlusFilter2.collect()) + } + } } From b3726dadcf2997f20231873ec6e057dba433ae64 Mon Sep 17 00:00:00 2001 
From: Saisai Shao Date: Sun, 15 Jul 2018 01:56:00 +0000 Subject: [PATCH 525/566] Preparing Spark release v2.3.2-rc3 --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 6ec496620c8bd..8df26351614ec 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.3 +Version: 2.3.2 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index f8b15cc3632ab..57485fc003097 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index e412a4727b385..53e58c2f5d374 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index d8f9a3d2015c1..d05647c601807 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index a1a4f87987119..8d46761151abe 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index e650978f1a2e5..11c290d64590a 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 350e3cb8fef99..786e959a48cce 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index e7fea4125cf69..78d5a5eaf2335 100644 --- a/common/tags/pom.xml +++ 
b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 601cc5d998b59..11259d923900e 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 2a7e6442f2f6f..276cf10aa205a 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index 7629f5f8c5818..bca466d19fa36 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 2.3.3-SNAPSHOT -SPARK_VERSION_SHORT: 2.3.3 +SPARK_VERSION: 2.3.2 +SPARK_VERSION_SHORT: 2.3.2 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 7ecec708ac6c1..7324f3a780e49 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 30816bba29627..a7a743129e2c4 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 4a2cb1361db27..5b8661c350554 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 5199aba2f5f8a..5b924e4d73c5c 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 3f8098887b64f..70e0c423faa3e 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index f9874cd223938..3c99f4821d4f9 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 499f1bc4c0907..65b8a3924b172 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index c4caff853ca1a..b0e857ba7534e 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index a09d23d917379..6f299a2d7b6c7 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 
../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 6660c73a677c8..de2023837594a 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index f08fe460f9d9b..5ee942480a62a 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 3fb0def7d45e5..b8a15c5b10dbd 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 4a0aa06e621c3..535104b32328c 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 5de97fe7b7824..01d140b3c7de8 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index d1f40132f2c6f..33f18dbe59a1a 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 96f4081cc5f94..77970b2f18303 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 43ef77facca8c..be5d767e4c84a 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 01ebc1ae90aa0..a4b6f95c9254b 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/pom.xml b/pom.xml index ea6f27c367445..381b0f7c84aad 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index ca37f0e98320d..94a128ecb10af 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.3.dev0" +__version__ = "2.3.2" diff --git a/repl/pom.xml b/repl/pom.xml index 9c16fe4fb546d..a64270a800e4d 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index d7fb35f4559df..8b6232e6772dc 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 84ce020dff77b..b68f817b44f69 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index b09d0031f4cf9..2972d61b6b2d4 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 2a86f6f1d42d3..be9d087b6393f 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 64aeb7ef10d92..2638b56da1f8b 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index a9fccdd327880..43cfd529ebdd1 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 5faa52541da3b..b26f5bb942824 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index f73d618b6e41c..77e6435fe9f44 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 8d98ba7161979..24d4e1d47b744 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml From f9a2b0a878f05131d76959236243e7f5caffeb96 Mon Sep 17 00:00:00 2001 From: Saisai Shao Date: Sun, 15 Jul 2018 01:56:15 +0000 Subject: [PATCH 526/566] Preparing development version 2.3.3-SNAPSHOT --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- 
external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 8df26351614ec..6ec496620c8bd 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.2 +Version: 2.3.3 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index 57485fc003097..f8b15cc3632ab 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 53e58c2f5d374..e412a4727b385 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index d05647c601807..d8f9a3d2015c1 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 8d46761151abe..a1a4f87987119 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 11c290d64590a..e650978f1a2e5 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 786e959a48cce..350e3cb8fef99 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 78d5a5eaf2335..e7fea4125cf69 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 11259d923900e..601cc5d998b59 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 276cf10aa205a..2a7e6442f2f6f 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index bca466d19fa36..7629f5f8c5818 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.2 -SPARK_VERSION_SHORT: 2.3.2 +SPARK_VERSION: 2.3.3-SNAPSHOT +SPARK_VERSION_SHORT: 2.3.3 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 7324f3a780e49..7ecec708ac6c1 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index a7a743129e2c4..30816bba29627 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 5b8661c350554..4a2cb1361db27 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 5b924e4d73c5c..5199aba2f5f8a 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 70e0c423faa3e..3f8098887b64f 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 3c99f4821d4f9..f9874cd223938 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 65b8a3924b172..499f1bc4c0907 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index b0e857ba7534e..c4caff853ca1a 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 6f299a2d7b6c7..a09d23d917379 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index de2023837594a..6660c73a677c8 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 5ee942480a62a..f08fe460f9d9b 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index b8a15c5b10dbd..3fb0def7d45e5 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 535104b32328c..4a0aa06e621c3 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 01d140b3c7de8..5de97fe7b7824 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 33f18dbe59a1a..d1f40132f2c6f 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 77970b2f18303..96f4081cc5f94 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index be5d767e4c84a..43ef77facca8c 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index a4b6f95c9254b..01ebc1ae90aa0 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 381b0f7c84aad..ea6f27c367445 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 94a128ecb10af..ca37f0e98320d 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.2" +__version__ = "2.3.3.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index a64270a800e4d..9c16fe4fb546d 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 8b6232e6772dc..d7fb35f4559df 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index b68f817b44f69..84ce020dff77b 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 2972d61b6b2d4..b09d0031f4cf9 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index be9d087b6393f..2a86f6f1d42d3 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 2638b56da1f8b..64aeb7ef10d92 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 43cfd529ebdd1..a9fccdd327880 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index b26f5bb942824..5faa52541da3b 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 77e6435fe9f44..f73d618b6e41c 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 24d4e1d47b744..8d98ba7161979 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml From dae352a2929733279911f3d49d8483b20ae28767 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 16 Jul 2018 09:29:51 +0800 Subject: [PATCH 527/566] [SPARK-24813][TESTS][HIVE][HOTFIX] HiveExternalCatalogVersionsSuite still flaky; fall back to Apache archive ## What changes were proposed in this pull request? Try only unique ASF mirrors to download Spark release; fall back to Apache archive if no mirrors available or release is not mirrored ## How was this patch tested? Existing HiveExternalCatalogVersionsSuite Author: Sean Owen Closes #21776 from srowen/SPARK-24813. 
(cherry picked from commit bbc2ffc8ab27192384def9847c36b873efd87234) Signed-off-by: hyukjinkwon --- .../HiveExternalCatalogVersionsSuite.scala | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index 514921875f1f9..f8212684d5335 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -56,14 +56,21 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { } private def tryDownloadSpark(version: String, path: String): Unit = { - // Try mirrors a few times until one succeeds - for (i <- 0 until 3) { - // we don't retry on a failure to get mirror url. If we can't get a mirror url, - // the test fails (getStringFromUrl will throw an exception) - val preferredMirror = - getStringFromUrl("https://www.apache.org/dyn/closer.lua?preferred=true") + // Try a few mirrors first; fall back to Apache archive + val mirrors = + (0 until 2).flatMap { _ => + try { + Some(getStringFromUrl("https://www.apache.org/dyn/closer.lua?preferred=true")) + } catch { + // If we can't get a mirror URL, skip it. No retry. + case _: Exception => None + } + } + val sites = mirrors.distinct :+ "https://archive.apache.org/dist" + logInfo(s"Trying to download Spark $version from $sites") + for (site <- sites) { val filename = s"spark-$version-bin-hadoop2.7.tgz" - val url = s"$preferredMirror/spark/spark-$version/$filename" + val url = s"$site/spark/spark-$version/$filename" logInfo(s"Downloading Spark $version from $url") try { getFileFromUrl(url, path, filename) @@ -83,7 +90,8 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { Seq("rm", "-rf", targetDir).! } } catch { - case ex: Exception => logWarning(s"Failed to download Spark $version from $url", ex) + case ex: Exception => + logWarning(s"Failed to download Spark $version from $url: ${ex.getMessage}") } } fail(s"Unable to download Spark $version") From e31b4766b96dd0076d59a10b8028d9adaaa7f333 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Tue, 17 Jul 2018 11:23:34 -0500 Subject: [PATCH 528/566] [SPARK-24813][BUILD][FOLLOW-UP][HOTFIX] HiveExternalCatalogVersionsSuite still flaky; fall back to Apache archive ## What changes were proposed in this pull request? Test HiveExternalCatalogVersionsSuite vs only current Spark releases ## How was this patch tested? `HiveExternalCatalogVersionsSuite` Author: Sean Owen Closes #21793 from srowen/SPARK-24813.3. (cherry picked from commit 5215344deaa5533e593c62aba3fcdfa1a2901801) Signed-off-by: Sean Owen --- .../spark/sql/hive/HiveExternalCatalogVersionsSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index f8212684d5335..5103aa8a207db 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -203,7 +203,7 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { object PROCESS_TABLES extends QueryTest with SQLTestUtils { // Tests the latest version of every release line. 
- val testingVersions = Seq("2.0.2", "2.1.2", "2.2.1", "2.3.1") + val testingVersions = Seq("2.1.3", "2.2.2", "2.3.1") protected var spark: SparkSession = _ From 7be70e29dd92de36dbb30ce39623d588f48e4cac Mon Sep 17 00:00:00 2001 From: sychen Date: Wed, 18 Jul 2018 13:24:41 -0500 Subject: [PATCH 529/566] [SPARK-24677][CORE] Avoid NoSuchElementException from MedianHeap ## What changes were proposed in this pull request? When speculation is enabled, TaskSetManager#markPartitionCompleted should write successful task duration to MedianHeap, not just increase tasksSuccessful. Otherwise when TaskSetManager#checkSpeculatableTasks,tasksSuccessful non-zero, but MedianHeap is empty. Then throw an exception successfulTaskDurations.median java.util.NoSuchElementException: MedianHeap is empty. Finally led to stopping SparkContext. ## How was this patch tested? TaskSetManagerSuite.scala unit test:[SPARK-24677] MedianHeap should not be empty when speculation is enabled Author: sychen Closes #21656 from cxzl25/fix_MedianHeap_empty. (cherry picked from commit c8bee932cb644627c4049b5a07dd8028968572d9) Signed-off-by: Thomas Graves --- .../spark/scheduler/TaskSchedulerImpl.scala | 7 ++- .../spark/scheduler/TaskSetManager.scala | 7 ++- .../spark/scheduler/TaskSetManagerSuite.scala | 49 +++++++++++++++++++ 3 files changed, 59 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 598b62f85a1fa..56c0bf6c09351 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -697,9 +697,12 @@ private[spark] class TaskSchedulerImpl( * do not also submit those same tasks. That also means that a task completion from an earlier * attempt can lead to the entire stage getting marked as successful. */ - private[scheduler] def markPartitionCompletedInAllTaskSets(stageId: Int, partitionId: Int) = { + private[scheduler] def markPartitionCompletedInAllTaskSets( + stageId: Int, + partitionId: Int, + taskInfo: TaskInfo) = { taskSetsByStageIdAndAttempt.getOrElse(stageId, Map()).values.foreach { tsm => - tsm.markPartitionCompleted(partitionId) + tsm.markPartitionCompleted(partitionId, taskInfo) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index b52e376e7b870..c90ae50c9deeb 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -759,7 +759,7 @@ private[spark] class TaskSetManager( } // There may be multiple tasksets for this stage -- we let all of them know that the partition // was completed. This may result in some of the tasksets getting completed. - sched.markPartitionCompletedInAllTaskSets(stageId, tasks(index).partitionId) + sched.markPartitionCompletedInAllTaskSets(stageId, tasks(index).partitionId, info) // This method is called by "TaskSchedulerImpl.handleSuccessfulTask" which holds the // "TaskSchedulerImpl" lock until exiting. To avoid the SPARK-7655 issue, we should not // "deserialize" the value when holding a lock to avoid blocking other threads. 
So we call @@ -770,9 +770,12 @@ private[spark] class TaskSetManager( maybeFinishTaskSet() } - private[scheduler] def markPartitionCompleted(partitionId: Int): Unit = { + private[scheduler] def markPartitionCompleted(partitionId: Int, taskInfo: TaskInfo): Unit = { partitionToIndex.get(partitionId).foreach { index => if (!successful(index)) { + if (speculationEnabled && !isZombie) { + successfulTaskDurations.insert(taskInfo.duration) + } tasksSuccessful += 1 successful(index) = true if (tasksSuccessful == numTasks) { diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index 2ce81ae27daf6..0f4d056d8f58b 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -1362,6 +1362,55 @@ class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logg assert(taskOption4.get.addedJars === addedJarsMidTaskSet) } + test("[SPARK-24677] Avoid NoSuchElementException from MedianHeap") { + val conf = new SparkConf().set("spark.speculation", "true") + sc = new SparkContext("local", "test", conf) + // Set the speculation multiplier to be 0 so speculative tasks are launched immediately + sc.conf.set("spark.speculation.multiplier", "0.0") + sc.conf.set("spark.speculation.quantile", "0.1") + sc.conf.set("spark.speculation", "true") + + sched = new FakeTaskScheduler(sc) + sched.initialize(new FakeSchedulerBackend()) + + val dagScheduler = new FakeDAGScheduler(sc, sched) + sched.setDAGScheduler(dagScheduler) + + val taskSet1 = FakeTask.createTaskSet(10) + val accumUpdatesByTask: Array[Seq[AccumulatorV2[_, _]]] = taskSet1.tasks.map { task => + task.metrics.internalAccums + } + + sched.submitTasks(taskSet1) + sched.resourceOffers( + (0 until 10).map { idx => WorkerOffer(s"exec-$idx", s"host-$idx", 1) }) + + val taskSetManager1 = sched.taskSetManagerForAttempt(0, 0).get + + // fail fetch + taskSetManager1.handleFailedTask( + taskSetManager1.taskAttempts.head.head.taskId, TaskState.FAILED, + FetchFailed(null, 0, 0, 0, "fetch failed")) + + assert(taskSetManager1.isZombie) + assert(taskSetManager1.runningTasks === 9) + + val taskSet2 = FakeTask.createTaskSet(10, stageAttemptId = 1) + sched.submitTasks(taskSet2) + sched.resourceOffers( + (11 until 20).map { idx => WorkerOffer(s"exec-$idx", s"host-$idx", 1) }) + + // Complete the 2 tasks and leave 8 task in running + for (id <- Set(0, 1)) { + taskSetManager1.handleSuccessfulTask(id, createTaskResult(id, accumUpdatesByTask(id))) + assert(sched.endedTasks(id) === Success) + } + + val taskSetManager2 = sched.taskSetManagerForAttempt(0, 1).get + assert(!taskSetManager2.successfulTaskDurations.isEmpty()) + taskSetManager2.checkSpeculatableTasks(0) + } + private def createTaskResult( id: Int, accumUpdates: Seq[AccumulatorV2[_, _]] = Seq.empty): DirectTaskResult[Int] = { From d0280ab818391fd11662647459f1e9e683b2bc8e Mon Sep 17 00:00:00 2001 From: Hieu Huynh <“Hieu.huynh@oath.com”> Date: Thu, 19 Jul 2018 09:52:07 -0500 Subject: [PATCH 530/566] [SPARK-24755][CORE] Executor loss can cause task to not be resubmitted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Description** As described in [SPARK-24755](https://issues.apache.org/jira/browse/SPARK-24755), when speculation is enabled, there is scenario that executor loss can cause task to not be resubmitted. 
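(For context, this code path is only reachable with speculative execution turned on; a minimal PySpark sketch of the relevant settings follows, with illustrative values rather than tuning advice.)

```python
from pyspark import SparkConf, SparkContext

conf = (SparkConf()
        .setAppName("speculation-example")
        .set("spark.speculation", "true")             # launch speculative copies of slow tasks
        .set("spark.speculation.quantile", "0.75")    # fraction of tasks finished before speculating
        .set("spark.speculation.multiplier", "1.5"))  # how much slower than the median counts as slow
sc = SparkContext(conf=conf)
```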
This patch changes the variable killedByOtherAttempt to keeps track of the taskId of tasks that are killed by other attempt. By doing this, we can still prevent resubmitting task killed by other attempt while resubmit successful attempt when executor lost. **How was this patch tested?** A UT is added based on the UT written by xuanyuanking with modification to simulate the scenario described in SPARK-24755. Author: Hieu Huynh <“Hieu.huynh@oath.com”> Closes #21729 from hthuynh2/SPARK_24755. (cherry picked from commit 8d707b06003bc97d06630b22e6ae7c35f99b3cdd) Signed-off-by: Thomas Graves --- .../spark/scheduler/TaskSetManager.scala | 10 +- .../spark/scheduler/TaskSetManagerSuite.scala | 112 ++++++++++++++++++ 2 files changed, 117 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index c90ae50c9deeb..df8d9142f9164 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -85,10 +85,10 @@ private[spark] class TaskSetManager( val successful = new Array[Boolean](numTasks) private val numFailures = new Array[Int](numTasks) - // Set the coresponding index of Boolean var when the task killed by other attempt tasks, - // this happened while we set the `spark.speculation` to true. The task killed by others + // Add the tid of task into this HashSet when the task is killed by other attempt tasks. + // This happened while we set the `spark.speculation` to true. The task killed by others // should not resubmit while executor lost. - private val killedByOtherAttempt: Array[Boolean] = new Array[Boolean](numTasks) + private val killedByOtherAttempt = new HashSet[Long] val taskAttempts = Array.fill[List[TaskInfo]](numTasks)(Nil) private[scheduler] var tasksSuccessful = 0 @@ -736,7 +736,7 @@ private[spark] class TaskSetManager( logInfo(s"Killing attempt ${attemptInfo.attemptNumber} for task ${attemptInfo.id} " + s"in stage ${taskSet.id} (TID ${attemptInfo.taskId}) on ${attemptInfo.host} " + s"as the attempt ${info.attemptNumber} succeeded on ${info.host}") - killedByOtherAttempt(index) = true + killedByOtherAttempt += attemptInfo.taskId sched.backend.killTask( attemptInfo.taskId, attemptInfo.executorId, @@ -942,7 +942,7 @@ private[spark] class TaskSetManager( && !isZombie) { for ((tid, info) <- taskInfos if info.executorId == execId) { val index = taskInfos(tid).index - if (successful(index) && !killedByOtherAttempt(index)) { + if (successful(index) && !killedByOtherAttempt.contains(tid)) { successful(index) = false copiesRunning(index) -= 1 tasksSuccessful -= 1 diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index 0f4d056d8f58b..b4acccf3bc649 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -1411,6 +1411,118 @@ class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logg taskSetManager2.checkSpeculatableTasks(0) } + + test("SPARK-24755 Executor loss can cause task to not be resubmitted") { + val conf = new SparkConf().set("spark.speculation", "true") + sc = new SparkContext("local", "test", conf) + // Set the speculation multiplier to be 0 so speculative tasks are launched immediately + sc.conf.set("spark.speculation.multiplier", 
"0.0") + + sc.conf.set("spark.speculation.quantile", "0.5") + sc.conf.set("spark.speculation", "true") + + var killTaskCalled = false + sched = new FakeTaskScheduler(sc, ("exec1", "host1"), + ("exec2", "host2"), ("exec3", "host3")) + sched.initialize(new FakeSchedulerBackend() { + override def killTask( + taskId: Long, + executorId: String, + interruptThread: Boolean, + reason: String): Unit = { + // Check the only one killTask event in this case, which triggered by + // task 2.1 completed. + assert(taskId === 2) + assert(executorId === "exec3") + assert(interruptThread) + assert(reason === "another attempt succeeded") + killTaskCalled = true + } + }) + + // Keep track of the index of tasks that are resubmitted, + // so that the test can check that task is resubmitted correctly + var resubmittedTasks = new mutable.HashSet[Int] + val dagScheduler = new FakeDAGScheduler(sc, sched) { + override def taskEnded( + task: Task[_], + reason: TaskEndReason, + result: Any, + accumUpdates: Seq[AccumulatorV2[_, _]], + taskInfo: TaskInfo): Unit = { + super.taskEnded(task, reason, result, accumUpdates, taskInfo) + reason match { + case Resubmitted => resubmittedTasks += taskInfo.index + case _ => + } + } + } + sched.dagScheduler.stop() + sched.setDAGScheduler(dagScheduler) + + val taskSet = FakeTask.createShuffleMapTaskSet(4, 0, 0, + Seq(TaskLocation("host1", "exec1")), + Seq(TaskLocation("host1", "exec1")), + Seq(TaskLocation("host3", "exec3")), + Seq(TaskLocation("host2", "exec2"))) + + val clock = new ManualClock() + val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock = clock) + val accumUpdatesByTask: Array[Seq[AccumulatorV2[_, _]]] = taskSet.tasks.map { task => + task.metrics.internalAccums + } + // Offer resources for 4 tasks to start + for ((exec, host) <- Seq( + "exec1" -> "host1", + "exec1" -> "host1", + "exec3" -> "host3", + "exec2" -> "host2")) { + val taskOption = manager.resourceOffer(exec, host, NO_PREF) + assert(taskOption.isDefined) + val task = taskOption.get + assert(task.executorId === exec) + // Add an extra assert to make sure task 2.0 is running on exec3 + if (task.index == 2) { + assert(task.attemptNumber === 0) + assert(task.executorId === "exec3") + } + } + assert(sched.startedTasks.toSet === Set(0, 1, 2, 3)) + clock.advance(1) + // Complete the 2 tasks and leave 2 task in running + for (id <- Set(0, 1)) { + manager.handleSuccessfulTask(id, createTaskResult(id, accumUpdatesByTask(id))) + assert(sched.endedTasks(id) === Success) + } + + // checkSpeculatableTasks checks that the task runtime is greater than the threshold for + // speculating. Since we use a threshold of 0 for speculation, tasks need to be running for + // > 0ms, so advance the clock by 1ms here. 
+ clock.advance(1) + assert(manager.checkSpeculatableTasks(0)) + assert(sched.speculativeTasks.toSet === Set(2, 3)) + + // Offer resource to start the speculative attempt for the running task 2.0 + val taskOption = manager.resourceOffer("exec2", "host2", ANY) + assert(taskOption.isDefined) + val task4 = taskOption.get + assert(task4.index === 2) + assert(task4.taskId === 4) + assert(task4.executorId === "exec2") + assert(task4.attemptNumber === 1) + // Complete the speculative attempt for the running task + manager.handleSuccessfulTask(4, createTaskResult(2, accumUpdatesByTask(2))) + // Make sure schedBackend.killTask(2, "exec3", true, "another attempt succeeded") gets called + assert(killTaskCalled) + + assert(resubmittedTasks.isEmpty) + // Host 2 Losts, meaning we lost the map output task4 + manager.executorLost("exec2", "host2", SlaveLost()) + // Make sure that task with index 2 is re-submitted + assert(resubmittedTasks.contains(2)) + + } + private def createTaskResult( id: Int, accumUpdates: Seq[AccumulatorV2[_, _]] = Seq.empty): DirectTaskResult[Int] = { From db1f3cc76eb42de258b11d0f475a1f700e649fb6 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Fri, 20 Jul 2018 20:59:48 +0800 Subject: [PATCH 531/566] [SPARK-23731][SQL] Make FileSourceScanExec canonicalizable after being (de)serialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? ### What's problem? In some cases, sub scalar query could throw a NPE, which is caused in execution side. ``` java.lang.NullPointerException at org.apache.spark.sql.execution.FileSourceScanExec.(DataSourceScanExec.scala:169) at org.apache.spark.sql.execution.FileSourceScanExec.doCanonicalize(DataSourceScanExec.scala:526) at org.apache.spark.sql.execution.FileSourceScanExec.doCanonicalize(DataSourceScanExec.scala:159) at org.apache.spark.sql.catalyst.plans.QueryPlan.canonicalized$lzycompute(QueryPlan.scala:211) at org.apache.spark.sql.catalyst.plans.QueryPlan.canonicalized(QueryPlan.scala:210) at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$3.apply(QueryPlan.scala:225) at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$3.apply(QueryPlan.scala:225) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.immutable.List.foreach(List.scala:392) at scala.collection.TraversableLike$class.map(TraversableLike.scala:234) at scala.collection.immutable.List.map(List.scala:296) at org.apache.spark.sql.catalyst.plans.QueryPlan.doCanonicalize(QueryPlan.scala:225) at org.apache.spark.sql.catalyst.plans.QueryPlan.canonicalized$lzycompute(QueryPlan.scala:211) at org.apache.spark.sql.catalyst.plans.QueryPlan.canonicalized(QueryPlan.scala:210) at org.apache.spark.sql.catalyst.plans.QueryPlan.sameResult(QueryPlan.scala:258) at org.apache.spark.sql.execution.ScalarSubquery.semanticEquals(subquery.scala:58) at org.apache.spark.sql.catalyst.expressions.EquivalentExpressions$Expr.equals(EquivalentExpressions.scala:36) at scala.collection.mutable.HashTable$class.elemEquals(HashTable.scala:364) at scala.collection.mutable.HashMap.elemEquals(HashMap.scala:40) at scala.collection.mutable.HashTable$class.scala$collection$mutable$HashTable$$findEntry0(HashTable.scala:139) at scala.collection.mutable.HashTable$class.findEntry(HashTable.scala:135) at scala.collection.mutable.HashMap.findEntry(HashMap.scala:40) at 
scala.collection.mutable.HashMap.get(HashMap.scala:70) at org.apache.spark.sql.catalyst.expressions.EquivalentExpressions.addExpr(EquivalentExpressions.scala:56) at org.apache.spark.sql.catalyst.expressions.EquivalentExpressions.addExprTree(EquivalentExpressions.scala:97) at org.apache.spark.sql.catalyst.expressions.EquivalentExpressions$$anonfun$addExprTree$1.apply(EquivalentExpressions.scala:98) at org.apache.spark.sql.catalyst.expressions.EquivalentExpressions$$anonfun$addExprTree$1.apply(EquivalentExpressions.scala:98) at scala.collection.immutable.List.foreach(List.scala:392) at org.apache.spark.sql.catalyst.expressions.EquivalentExpressions.addExprTree(EquivalentExpressions.scala:98) at org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext$$anonfun$subexpressionElimination$1.apply(CodeGenerator.scala:1102) at org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext$$anonfun$subexpressionElimination$1.apply(CodeGenerator.scala:1102) at scala.collection.immutable.List.foreach(List.scala:392) at org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext.subexpressionElimination(CodeGenerator.scala:1102) at org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext.generateExpressions(CodeGenerator.scala:1154) at org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection$.createCode(GenerateUnsafeProjection.scala:270) at org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection$.create(GenerateUnsafeProjection.scala:319) at org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection$.generate(GenerateUnsafeProjection.scala:308) at org.apache.spark.sql.catalyst.expressions.UnsafeProjection$.create(Projection.scala:181) at org.apache.spark.sql.execution.ProjectExec$$anonfun$9.apply(basicPhysicalOperators.scala:71) at org.apache.spark.sql.execution.ProjectExec$$anonfun$9.apply(basicPhysicalOperators.scala:70) at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndexInternal$1$$anonfun$apply$24.apply(RDD.scala:818) at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndexInternal$1$$anonfun$apply$24.apply(RDD.scala:818) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) at org.apache.spark.scheduler.Task.run(Task.scala:109) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:367) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) ``` ### How does this happen? Here looks what happen now: 1. Sub scalar query was made (for instance `SELECT (SELECT id FROM foo)`). 2. Try to extract some common expressions (via `CodeGenerator.subexpressionElimination`) so that it can generates some common codes and can be reused. 3. During this, seems it extracts some expressions that can be reused (via `EquivalentExpressions.addExprTree`) https://github.com/apache/spark/blob/b2deef64f604ddd9502a31105ed47cb63470ec85/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala#L1102 4. 
During this, if the hash (`EquivalentExpressions.Expr.hashCode`) happened to be the same at `EquivalentExpressions.addExpr` anyhow, `EquivalentExpressions.Expr.equals` is called to identify object in the same hash, which eventually calls `semanticEquals` in `ScalarSubquery` https://github.com/apache/spark/blob/087879a77acb37b790c36f8da67355b90719c2dc/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressions.scala#L54 https://github.com/apache/spark/blob/087879a77acb37b790c36f8da67355b90719c2dc/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressions.scala#L36 5. `ScalarSubquery`'s `semanticEquals` needs `SubqueryExec`'s `sameResult` https://github.com/apache/spark/blob/77a2fc5b521788b406bb32bcc3c637c1d7406e58/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala#L58 6. `SubqueryExec`'s `sameResult` requires a canonicalized plan which calls `FileSourceScanExec`'s `doCanonicalize` https://github.com/apache/spark/blob/e008ad175256a3192fdcbd2c4793044d52f46d57/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala#L258 7. In `FileSourceScanExec`'s `doCanonicalize`, `FileSourceScanExec`'s `relation` is required but seems `transient` so it becomes `null`. https://github.com/apache/spark/blob/e76b0124fbe463def00b1dffcfd8fd47e04772fe/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L527 https://github.com/apache/spark/blob/e76b0124fbe463def00b1dffcfd8fd47e04772fe/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L160 8. NPE is thrown. \*1. driver side \*2., 3., 4., 5., 6., 7., 8. executor side Note that most of cases, it looks fine because we will usually call: https://github.com/apache/spark/blob/087879a77acb37b790c36f8da67355b90719c2dc/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressions.scala#L40 which make a canonicalized plan via: https://github.com/apache/spark/blob/b045315e5d87b7ea3588436053aaa4d5a7bd103f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala#L192 https://github.com/apache/spark/blob/77a2fc5b521788b406bb32bcc3c637c1d7406e58/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala#L52 ### How to reproduce? This looks what happened now. I can reproduce this by a bit of messy way: ```diff diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressions.scala index 8d06804ce1e..d25fc9a7ba9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressions.scala -37,7 +37,9 class EquivalentExpressions { case _ => false } - override def hashCode: Int = e.semanticHash() + override def hashCode: Int = { + 1 + } } ``` ```scala spark.range(1).write.mode("overwrite").parquet("/tmp/foo") spark.read.parquet("/tmp/foo").createOrReplaceTempView("foo") spark.conf.set("spark.sql.codegen.wholeStage", false) sql("SELECT (SELECT id FROM foo) == (SELECT id FROM foo)").collect() ``` ### How does this PR fix? - Make all variables that access to `FileSourceScanExec`'s `relation` as `lazy val` so that we avoid NPE. This is a temporary fix. - Allow `makeCopy` in `SparkPlan` without Spark session too. This looks still able to be accessed within executor side. 
For instance: ``` at org.apache.spark.sql.execution.SparkPlan.makeCopy(SparkPlan.scala:70) at org.apache.spark.sql.execution.SparkPlan.makeCopy(SparkPlan.scala:47) at org.apache.spark.sql.catalyst.trees.TreeNode.withNewChildren(TreeNode.scala:233) at org.apache.spark.sql.catalyst.plans.QueryPlan.doCanonicalize(QueryPlan.scala:243) at org.apache.spark.sql.catalyst.plans.QueryPlan.canonicalized$lzycompute(QueryPlan.scala:211) at org.apache.spark.sql.catalyst.plans.QueryPlan.canonicalized(QueryPlan.scala:210) at org.apache.spark.sql.catalyst.plans.QueryPlan.sameResult(QueryPlan.scala:258) at org.apache.spark.sql.execution.ScalarSubquery.semanticEquals(subquery.scala:58) at org.apache.spark.sql.catalyst.expressions.EquivalentExpressions$Expr.equals(EquivalentExpressions.scala:36) at scala.collection.mutable.HashTable$class.elemEquals(HashTable.scala:364) at scala.collection.mutable.HashMap.elemEquals(HashMap.scala:40) at scala.collection.mutable.HashTable$class.scala$collection$mutable$HashTable$$findEntry0(HashTable.scala:139) at scala.collection.mutable.HashTable$class.findEntry(HashTable.scala:135) at scala.collection.mutable.HashMap.findEntry(HashMap.scala:40) at scala.collection.mutable.HashMap.get(HashMap.scala:70) at org.apache.spark.sql.catalyst.expressions.EquivalentExpressions.addExpr(EquivalentExpressions.scala:54) at org.apache.spark.sql.catalyst.expressions.EquivalentExpressions.addExprTree(EquivalentExpressions.scala:95) at org.apache.spark.sql.catalyst.expressions.EquivalentExpressions$$anonfun$addExprTree$1.apply(EquivalentExpressions.scala:96) at org.apache.spark.sql.catalyst.expressions.EquivalentExpressions$$anonfun$addExprTree$1.apply(EquivalentExpressions.scala:96) at scala.collection.immutable.List.foreach(List.scala:392) at org.apache.spark.sql.catalyst.expressions.EquivalentExpressions.addExprTree(EquivalentExpressions.scala:96) at org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext$$anonfun$subexpressionElimination$1.apply(CodeGenerator.scala:1102) at org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext$$anonfun$subexpressionElimination$1.apply(CodeGenerator.scala:1102) at scala.collection.immutable.List.foreach(List.scala:392) at org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext.subexpressionElimination(CodeGenerator.scala:1102) at org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext.generateExpressions(CodeGenerator.scala:1154) at org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection$.createCode(GenerateUnsafeProjection.scala:270) at org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection$.create(GenerateUnsafeProjection.scala:319) at org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection$.generate(GenerateUnsafeProjection.scala:308) at org.apache.spark.sql.catalyst.expressions.UnsafeProjection$.create(Projection.scala:181) at org.apache.spark.sql.execution.ProjectExec$$anonfun$9.apply(basicPhysicalOperators.scala:71) at org.apache.spark.sql.execution.ProjectExec$$anonfun$9.apply(basicPhysicalOperators.scala:70) at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndexInternal$1$$anonfun$apply$24.apply(RDD.scala:818) at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndexInternal$1$$anonfun$apply$24.apply(RDD.scala:818) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) at 
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) at org.apache.spark.scheduler.Task.run(Task.scala:109) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:367) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) ``` This PR takes over https://github.com/apache/spark/pull/20856. ## How was this patch tested? Manually tested and unit test was added. Closes #20856 Author: hyukjinkwon Closes #21815 from HyukjinKwon/SPARK-23731. (cherry picked from commit e0b63832181464453f753649623a24cb567a73d4) Signed-off-by: Wenchen Fan Signed-off-by: Wenchen Fan --- .../sql/execution/DataSourceScanExec.scala | 10 ++++++---- .../apache/spark/sql/execution/SparkPlan.scala | 10 +++++----- .../spark/sql/execution/SparkPlanSuite.scala | 17 +++++++++++++++++ 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 1a98fe0f3fa33..0a23264a4a16b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -164,10 +164,12 @@ case class FileSourceScanExec( override val tableIdentifier: Option[TableIdentifier]) extends DataSourceScanExec with ColumnarBatchScan { - override val supportsBatch: Boolean = relation.fileFormat.supportBatch( + // Note that some vals referring the file-based relation are lazy intentionally + // so that this plan can be canonicalized on executor side too. See SPARK-23731. 
+ override lazy val supportsBatch: Boolean = relation.fileFormat.supportBatch( relation.sparkSession, StructType.fromAttributes(output)) - override val needsUnsafeRowConversion: Boolean = { + override lazy val needsUnsafeRowConversion: Boolean = { if (relation.fileFormat.isInstanceOf[ParquetSource]) { SparkSession.getActiveSession.get.sessionState.conf.parquetVectorizedReaderEnabled } else { @@ -197,7 +199,7 @@ case class FileSourceScanExec( ret } - override val (outputPartitioning, outputOrdering): (Partitioning, Seq[SortOrder]) = { + override lazy val (outputPartitioning, outputOrdering): (Partitioning, Seq[SortOrder]) = { val bucketSpec = if (relation.sparkSession.sessionState.conf.bucketingEnabled) { relation.bucketSpec } else { @@ -268,7 +270,7 @@ case class FileSourceScanExec( private val pushedDownFilters = dataFilters.flatMap(DataSourceStrategy.translateFilter) logInfo(s"Pushed Filters: ${pushedDownFilters.mkString(",")}") - override val metadata: Map[String, String] = { + override lazy val metadata: Map[String, String] = { def seqToString(seq: Seq[Any]) = seq.mkString("[", ", ", "]") val location = relation.location val locationDesc = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index 398758a3331b4..1f97993e20458 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -47,17 +47,15 @@ import org.apache.spark.util.ThreadUtils abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializable { /** - * A handle to the SQL Context that was used to create this plan. Since many operators need + * A handle to the SQL Context that was used to create this plan. Since many operators need * access to the sqlContext for RDD operations or configuration this field is automatically * populated by the query planning infrastructure. */ - @transient - final val sqlContext = SparkSession.getActiveSession.map(_.sqlContext).orNull + @transient final val sqlContext = SparkSession.getActiveSession.map(_.sqlContext).orNull protected def sparkContext = sqlContext.sparkContext // sqlContext will be null when SparkPlan nodes are created without the active sessions. - // So far, this only happens in the test cases. val subexpressionEliminationEnabled: Boolean = if (sqlContext != null) { sqlContext.conf.subexpressionEliminationEnabled } else { @@ -69,7 +67,9 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ /** Overridden make copy also propagates sqlContext to copied plan. 
*/ override def makeCopy(newArgs: Array[AnyRef]): SparkPlan = { - SparkSession.setActiveSession(sqlContext.sparkSession) + if (sqlContext != null) { + SparkSession.setActiveSession(sqlContext.sparkSession) + } super.makeCopy(newArgs) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala index 750d9e4adf8b4..34dc6f37c0e4d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution +import org.apache.spark.SparkEnv import org.apache.spark.sql.QueryTest import org.apache.spark.sql.test.SharedSQLContext @@ -33,4 +34,20 @@ class SparkPlanSuite extends QueryTest with SharedSQLContext { intercept[IllegalStateException] { plan.executeTake(1) } } + test("SPARK-23731 plans should be canonicalizable after being (de)serialized") { + withTempPath { path => + spark.range(1).write.parquet(path.getAbsolutePath) + val df = spark.read.parquet(path.getAbsolutePath) + val fileSourceScanExec = + df.queryExecution.sparkPlan.collectFirst { case p: FileSourceScanExec => p }.get + val serializer = SparkEnv.get.serializer.newInstance() + val readback = + serializer.deserialize[FileSourceScanExec](serializer.serialize(fileSourceScanExec)) + try { + readback.canonicalized + } catch { + case e: Throwable => fail("FileSourceScanExec was not canonicalizable", e) + } + } + } } From bd6bfacb2ef473b858cc91e0692591deaea26118 Mon Sep 17 00:00:00 2001 From: William Sheu Date: Fri, 20 Jul 2018 19:59:28 -0700 Subject: [PATCH 532/566] [SPARK-24879][SQL] Fix NPE in Hive partition pruning filter pushdown ## What changes were proposed in this pull request? We get a NPE when we have a filter on a partition column of the form `col in (x, null)`. This is due to the filter converter in HiveShim not handling `null`s correctly. This patch fixes this bug while still pushing down as much of the partition pruning predicates as possible, by filtering out `null`s from any `in` predicate. Since Hive only supports very simple partition pruning filters, this change should preserve correctness. ## How was this patch tested? Unit tests, manual tests Author: William Sheu Closes #21832 from PenguinToast/partition-pruning-npe. (cherry picked from commit bbd6f0c25fe19dc6c946e63cac7b98d0f78b3463) Signed-off-by: Xiao Li --- .../spark/sql/hive/client/HiveShim.scala | 19 ++++++++++++++++++- .../spark/sql/hive/client/FiltersSuite.scala | 14 ++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 60fe31f3542a5..68acea7cac6d0 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -599,6 +599,7 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { object ExtractableLiteral { def unapply(expr: Expression): Option[String] = expr match { + case Literal(null, _) => None // `null`s can be cast as other types; we want to avoid NPEs. 
case Literal(value, _: IntegralType) => Some(value.toString) case Literal(value, _: StringType) => Some(quoteStringLiteral(value.toString)) case _ => None @@ -607,7 +608,23 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { object ExtractableLiterals { def unapply(exprs: Seq[Expression]): Option[Seq[String]] = { - val extractables = exprs.map(ExtractableLiteral.unapply) + // SPARK-24879: The Hive metastore filter parser does not support "null", but we still want + // to push down as many predicates as we can while still maintaining correctness. + // In SQL, the `IN` expression evaluates as follows: + // > `1 in (2, NULL)` -> NULL + // > `1 in (1, NULL)` -> true + // > `1 in (2)` -> false + // Since Hive metastore filters are NULL-intolerant binary operations joined only by + // `AND` and `OR`, we can treat `NULL` as `false` and thus rewrite `1 in (2, NULL)` as + // `1 in (2)`. + // If the Hive metastore begins supporting NULL-tolerant predicates and Spark starts + // pushing down these predicates, then this optimization will become incorrect and need + // to be changed. + val extractables = exprs + .filter { + case Literal(null, _) => false + case _ => true + }.map(ExtractableLiteral.unapply) if (extractables.nonEmpty && extractables.forall(_.isDefined)) { Some(extractables.map(_.get)) } else { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala index 19765695fbcb4..2a4efd0cce6e0 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala @@ -72,6 +72,20 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest { (Literal("p2\" and q=\"q2") === a("stringcol", StringType)) :: Nil, """stringcol = 'p1" and q="q1' and 'p2" and q="q2' = stringcol""") + filterTest("SPARK-24879 null literals should be ignored for IN constructs", + (a("intcol", IntegerType) in (Literal(1), Literal(null))) :: Nil, + "(intcol = 1)") + + // Applying the predicate `x IN (NULL)` should return an empty set, but since this optimization + // will be applied by Catalyst, this filter converter does not need to account for this. + filterTest("SPARK-24879 IN predicates with only NULLs will not cause a NPE", + (a("intcol", IntegerType) in Literal(null)) :: Nil, + "") + + filterTest("typecast null literals should not be pushed down in simple predicates", + (a("intcol", IntegerType) === Literal(null, IntegerType)) :: Nil, + "") + private def filterTest(name: String, filters: Seq[Expression], result: String) = { test(name) { withSQLConf(SQLConf.ADVANCED_PARTITION_PREDICATE_PUSHDOWN.key -> "true") { From f5bc94861afb5d63526f6388e0d7edd49048a71a Mon Sep 17 00:00:00 2001 From: SongYadong Date: Mon, 23 Jul 2018 19:10:53 +0800 Subject: [PATCH 533/566] [SQL][HIVE] Correct an assert message in function makeRDDForTable ## What changes were proposed in this pull request? according to the context, "makeRDDForTablePartitions" in assert message should be "makeRDDForPartitionedTable", because "makeRDDForTablePartitions" does't exist in spark code. ## How was this patch tested? unit tests Please review http://spark.apache.org/contributing.html before opening a pull request. Author: SongYadong Closes #21836 from SongYadong/assert_info_modify. 
(cherry picked from commit ab18b02e66fd04bc8f1a4fb7b6a7f2773902a494) Signed-off-by: hyukjinkwon --- .../main/scala/org/apache/spark/sql/hive/TableReader.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala index b5444a4217924..7d57389947576 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala @@ -110,8 +110,9 @@ class HadoopTableReader( deserializerClass: Class[_ <: Deserializer], filterOpt: Option[PathFilter]): RDD[InternalRow] = { - assert(!hiveTable.isPartitioned, """makeRDDForTable() cannot be called on a partitioned table, - since input formats may differ across partitions. Use makeRDDForTablePartitions() instead.""") + assert(!hiveTable.isPartitioned, + "makeRDDForTable() cannot be called on a partitioned table, since input formats may " + + "differ across partitions. Use makeRDDForPartitionedTable() instead.") // Create local references to member variables, so that the entire `this` object won't be // serialized in the closure below. From 740a23d7d13a57083e190fe85636de893480dab5 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 24 Jul 2018 19:51:09 +0800 Subject: [PATCH 534/566] [SPARK-22499][FOLLOWUP][SQL] Reduce input string expressions for Least and Greatest to reduce time in its test ## What changes were proposed in this pull request? It's minor and trivial but looks 2000 input is good enough to reproduce and test in SPARK-22499. ## How was this patch tested? Manually brought the change and tested. Locally tested: Before: 3m 21s 288ms After: 1m 29s 134ms Given the latest successful build took: ``` ArithmeticExpressionSuite: - SPARK-22499: Least and greatest should not generate codes beyond 64KB (7 minutes, 49 seconds) ``` I expect it's going to save 4ish mins. Author: hyukjinkwon Closes #21855 from HyukjinKwon/minor-fix-suite. (cherry picked from commit 3d5c61e5fd24f07302e39b5d61294da79aa0c2f9) Signed-off-by: hyukjinkwon --- .../sql/catalyst/expressions/ArithmeticExpressionSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala index 6edb4348f8309..94d046a75a5a9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala @@ -337,7 +337,7 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper } test("SPARK-22499: Least and greatest should not generate codes beyond 64KB") { - val N = 3000 + val N = 2000 val strings = (1 to N).map(x => "s" * x) val inputsExpr = strings.map(Literal.create(_, StringType)) From 6a59992866a971abf6052e479ba48c6abded4d04 Mon Sep 17 00:00:00 2001 From: maryannxue Date: Tue, 24 Jul 2018 19:35:34 -0700 Subject: [PATCH 535/566] [SPARK-24891][SQL] Fix HandleNullInputsForUDF rule The HandleNullInputsForUDF would always add a new `If` node every time it is applied. That would cause a difference between the same plan being analyzed once and being analyzed twice (or more), thus raising issues like plan not matched in the cache manager. 
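For illustration, here is a minimal sketch of the problematic shape, condensed from the UDFSuite test added in this patch. It assumes a spark-shell style session (`spark` and `spark.implicits._` in scope) and access to `spark.sessionState` as the added test has; it only shows the symptom, not the exact test code:

```scala
import org.apache.spark.sql.functions.{lit, udf}
import spark.implicits._   // assumes an active SparkSession named `spark`

// A primitive-typed UDF nested inside itself: both inputs get null handling.
val udf1 = udf({ (x: Int, y: Int) => x + y })
val df = spark.range(0, 3).toDF("a")
  .withColumn("b", udf1($"a", udf1($"a", lit(10))))

// Re-analyzing an already-analyzed plan should be a no-op. Before this fix, each
// pass wrapped the primitive inputs in another IsNull/If check, so the plans kept
// changing and, for example, the cache manager could no longer match them.
val once = spark.sessionState.executePlan(df.logicalPlan).analyzed
val twice = spark.sessionState.executePlan(once).analyzed
once.sameResult(twice)   // expected to be true once the rule is idempotent
```
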
The solution is to mark the arguments as null-checked, which is to add a "KnownNotNull" node above those arguments, when adding the UDF under an `If` node, because clearly the UDF will not be called when any of those arguments is null. Add new tests under sql/UDFSuite and AnalysisSuite. Author: maryannxue Closes #21851 from maryannxue/spark-24891. --- .../sql/catalyst/analysis/Analyzer.scala | 22 ++++++++---- .../expressions/constraintExpressions.scala | 35 +++++++++++++++++++ .../sql/catalyst/analysis/AnalysisSuite.scala | 16 +++++++-- .../scala/org/apache/spark/sql/UDFSuite.scala | 31 +++++++++++++++- 4 files changed, 94 insertions(+), 10 deletions(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/constraintExpressions.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 2858bee890d04..5963c1467e254 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.encoders.OuterScopes import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.SubExprUtils._ import org.apache.spark.sql.catalyst.expressions.aggregate._ -import org.apache.spark.sql.catalyst.expressions.objects.{LambdaVariable, MapObjects, NewInstance, UnresolvedMapObjects} +import org.apache.spark.sql.catalyst.expressions.objects._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ @@ -2046,14 +2046,24 @@ class Analyzer( val parameterTypes = ScalaReflection.getParameterTypes(func) assert(parameterTypes.length == inputs.length) + // TODO: skip null handling for not-nullable primitive inputs after we can completely + // trust the `nullable` information. + // (cls, expr) => cls.isPrimitive && expr.nullable + val needsNullCheck = (cls: Class[_], expr: Expression) => + cls.isPrimitive && !expr.isInstanceOf[KnowNotNull] val inputsNullCheck = parameterTypes.zip(inputs) - // TODO: skip null handling for not-nullable primitive inputs after we can completely - // trust the `nullable` information. - // .filter { case (cls, expr) => cls.isPrimitive && expr.nullable } - .filter { case (cls, _) => cls.isPrimitive } + .filter { case (cls, expr) => needsNullCheck(cls, expr) } .map { case (_, expr) => IsNull(expr) } .reduceLeftOption[Expression]((e1, e2) => Or(e1, e2)) - inputsNullCheck.map(If(_, Literal.create(null, udf.dataType), udf)).getOrElse(udf) + // Once we add an `If` check above the udf, it is safe to mark those checked inputs + // as not nullable (i.e., wrap them with `KnownNotNull`), because the null-returning + // branch of `If` will be called if any of these checked inputs is null. Thus we can + // prevent this rule from being applied repeatedly. 
+ val newInputs = parameterTypes.zip(inputs).map{ case (cls, expr) => + if (needsNullCheck(cls, expr)) KnowNotNull(expr) else expr } + inputsNullCheck + .map(If(_, Literal.create(null, udf.dataType), udf.copy(children = newInputs))) + .getOrElse(udf) } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/constraintExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/constraintExpressions.scala new file mode 100644 index 0000000000000..53936aa914c8f --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/constraintExpressions.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode, FalseLiteral} +import org.apache.spark.sql.types.DataType + +case class KnowNotNull(child: Expression) extends UnaryExpression { + override def nullable: Boolean = false + override def dataType: DataType = child.dataType + + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + child.genCode(ctx).copy(isNull = FalseLiteral) + } + + override def eval(input: InternalRow): Any = { + child.eval(input) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index cd8579584eada..6900a6cff8350 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -315,7 +315,8 @@ class AnalysisSuite extends AnalysisTest with Matchers { // only primitive parameter needs special null handling val udf2 = ScalaUDF((s: String, d: Double) => "x", StringType, string :: double :: Nil) - val expected2 = If(IsNull(double), nullResult, udf2) + val expected2 = + If(IsNull(double), nullResult, udf2.copy(children = string :: KnowNotNull(double) :: Nil)) checkUDF(udf2, expected2) // special null handling should apply to all primitive parameters @@ -323,7 +324,7 @@ class AnalysisSuite extends AnalysisTest with Matchers { val expected3 = If( IsNull(short) || IsNull(double), nullResult, - udf3) + udf3.copy(children = KnowNotNull(short) :: KnowNotNull(double) :: Nil)) checkUDF(udf3, expected3) // we can skip special null handling for primitive parameters that are not nullable @@ -335,10 +336,19 @@ class AnalysisSuite extends AnalysisTest with Matchers { val expected4 = If( IsNull(short), nullResult, - udf4) + udf4.copy(children = KnowNotNull(short) :: double.withNullability(false) :: Nil)) // 
checkUDF(udf4, expected4) } + test("SPARK-24891 Fix HandleNullInputsForUDF rule") { + val a = testRelation.output(0) + val func = (x: Int, y: Int) => x + y + val udf1 = ScalaUDF(func, IntegerType, a :: a :: Nil) + val udf2 = ScalaUDF(func, IntegerType, a :: udf1 :: Nil) + val plan = Project(Alias(udf2, "")() :: Nil, testRelation) + comparePlans(plan.analyze, plan.analyze.analyze) + } + test("SPARK-11863 mixture of aliases and real columns in order by clause - tpcds 19,55,71") { val a = testRelation2.output(0) val c = testRelation2.output(2) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index af6a10b425b9f..57bdec30c9a23 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql import org.apache.spark.sql.api.java._ import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.command.ExplainCommand -import org.apache.spark.sql.functions.udf +import org.apache.spark.sql.functions.{lit, udf} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.SQLTestData._ import org.apache.spark.sql.types.{DataTypes, DoubleType} @@ -304,4 +304,33 @@ class UDFSuite extends QueryTest with SharedSQLContext { assert(explainStr(spark.range(1).select(udf1(udf2(functions.lit(1))))) .contains(s"UDF:$udf1Name(UDF:$udf2Name(1))")) } + + test("SPARK-24891 Fix HandleNullInputsForUDF rule") { + val udf1 = udf({(x: Int, y: Int) => x + y}) + val df = spark.range(0, 3).toDF("a") + .withColumn("b", udf1($"a", udf1($"a", lit(10)))) + .withColumn("c", udf1($"a", lit(null))) + val plan = spark.sessionState.executePlan(df.logicalPlan).analyzed + + comparePlans(df.logicalPlan, plan) + checkAnswer( + df, + Seq( + Row(0, 10, null), + Row(1, 12, null), + Row(2, 14, null))) + } + + test("SPARK-24891 Fix HandleNullInputsForUDF rule - with table") { + withTable("x") { + Seq((1, "2"), (2, "4")).toDF("a", "b").write.format("json").saveAsTable("x") + sql("insert into table x values(3, null)") + sql("insert into table x values(null, '4')") + spark.udf.register("f", (a: Int, b: String) => a + b) + val df = spark.sql("SELECT f(a, b) FROM x") + val plan = spark.sessionState.executePlan(df.logicalPlan).analyzed + comparePlans(df.logicalPlan, plan) + checkAnswer(df, Seq(Row("12"), Row("24"), Row("3null"), Row(null))) + } + } } From 740606eb8714c2536f239b454b0d0f5790886414 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Wed, 25 Jul 2018 19:10:44 +0800 Subject: [PATCH 536/566] [SPARK-24891][FOLLOWUP][HOT-FIX][2.3] Fix the Compilation Errors ## What changes were proposed in this pull request? This PR is to fix the compilation failure in 2.3 build. https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Compile/job/spark-branch-2.3-compile-maven-hadoop-2.7/764/ ## How was this patch tested? N/A Author: Xiao Li Closes #21869 from gatorsmile/testSPARK-24891. 
--- .../sql/catalyst/expressions/constraintExpressions.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/constraintExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/constraintExpressions.scala index 53936aa914c8f..1b2f020a80449 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/constraintExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/constraintExpressions.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode, FalseLiteral} +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types.DataType case class KnowNotNull(child: Expression) extends UnaryExpression { @@ -26,7 +26,7 @@ case class KnowNotNull(child: Expression) extends UnaryExpression { override def dataType: DataType = child.dataType override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - child.genCode(ctx).copy(isNull = FalseLiteral) + child.genCode(ctx).copy(isNull = "false") } override def eval(input: InternalRow): Any = { From fa552c3c1102404fe98c72a5b83cffbc5ba41df3 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Wed, 25 Jul 2018 17:22:37 -0700 Subject: [PATCH 537/566] [SPARK-24867][SQL] Add AnalysisBarrier to DataFrameWriter ```Scala val udf1 = udf({(x: Int, y: Int) => x + y}) val df = spark.range(0, 3).toDF("a") .withColumn("b", udf1($"a", udf1($"a", lit(10)))) df.cache() df.write.saveAsTable("t") ``` Cache is not being used because the plans do not match with the cached plan. This is a regression caused by the changes we made in AnalysisBarrier, since not all the Analyzer rules are idempotent. Added a test. Also found a bug in the DSV1 write path. This is not a regression. Thus, opened a separate JIRA https://issues.apache.org/jira/browse/SPARK-24869 Author: Xiao Li Closes #21821 from gatorsmile/testMaster22. 
(cherry picked from commit d2e7deb59f641e93778b763d5396f73d38f9a785) Signed-off-by: Xiao Li --- .../apache/spark/sql/DataFrameWriter.scala | 10 +++-- .../spark/sql/execution/command/ddl.scala | 7 ++-- .../scala/org/apache/spark/sql/UDFSuite.scala | 42 ++++++++++++++++++- 3 files changed, 51 insertions(+), 8 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index ed7a9100cc7f1..6c9fb52290e7b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -254,7 +254,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { val writer = ws.createWriter(jobId, df.logicalPlan.schema, mode, options) if (writer.isPresent) { runCommand(df.sparkSession, "save") { - WriteToDataSourceV2(writer.get(), df.logicalPlan) + WriteToDataSourceV2(writer.get(), df.planWithBarrier) } } @@ -275,7 +275,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { sparkSession = df.sparkSession, className = source, partitionColumns = partitioningColumns.getOrElse(Nil), - options = extraOptions.toMap).planForWriting(mode, AnalysisBarrier(df.logicalPlan)) + options = extraOptions.toMap).planForWriting(mode, df.planWithBarrier) } } @@ -323,7 +323,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { InsertIntoTable( table = UnresolvedRelation(tableIdent), partition = Map.empty[String, Option[String]], - query = df.logicalPlan, + query = df.planWithBarrier, overwrite = mode == SaveMode.Overwrite, ifPartitionNotExists = false) } @@ -455,7 +455,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { partitionColumnNames = partitioningColumns.getOrElse(Nil), bucketSpec = getBucketSpec) - runCommand(df.sparkSession, "saveAsTable")(CreateTable(tableDesc, mode, Some(df.logicalPlan))) + runCommand(df.sparkSession, "saveAsTable") { + CreateTable(tableDesc, mode, Some(df.planWithBarrier)) + } } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index 0f4831b348ce6..28313f2af874e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -29,7 +29,7 @@ import org.apache.hadoop.mapred.{FileInputFormat, JobConf} import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, Resolver} +import org.apache.spark.sql.catalyst.analysis.{EliminateBarriers, NoSuchTableException, Resolver} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} @@ -889,8 +889,9 @@ object DDLUtils { * Throws exception if outputPath tries to overwrite inputpath. 
*/ def verifyNotReadPath(query: LogicalPlan, outputPath: Path) : Unit = { - val inputPaths = query.collect { - case LogicalRelation(r: HadoopFsRelation, _, _, _) => r.location.rootPaths + val inputPaths = EliminateBarriers(query).collect { + case LogicalRelation(r: HadoopFsRelation, _, _, _) => + r.location.rootPaths }.flatten if (inputPaths.contains(outputPath)) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index 57bdec30c9a23..6f3937cfe89cc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -19,11 +19,16 @@ package org.apache.spark.sql import org.apache.spark.sql.api.java._ import org.apache.spark.sql.catalyst.plans.logical.Project -import org.apache.spark.sql.execution.command.ExplainCommand +import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.execution.columnar.InMemoryRelation +import org.apache.spark.sql.execution.command.{CreateDataSourceTableAsSelectCommand, ExplainCommand} +import org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand import org.apache.spark.sql.functions.{lit, udf} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.SQLTestData._ import org.apache.spark.sql.types.{DataTypes, DoubleType} +import org.apache.spark.sql.util.QueryExecutionListener + private case class FunctionResult(f1: String, f2: String) @@ -305,6 +310,41 @@ class UDFSuite extends QueryTest with SharedSQLContext { .contains(s"UDF:$udf1Name(UDF:$udf2Name(1))")) } + test("cached Data should be used in the write path") { + withTable("t") { + withTempPath { path => + var numTotalCachedHit = 0 + val listener = new QueryExecutionListener { + override def onFailure(f: String, qe: QueryExecution, e: Exception): Unit = {} + + override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { + qe.withCachedData match { + case c: CreateDataSourceTableAsSelectCommand + if c.query.isInstanceOf[InMemoryRelation] => + numTotalCachedHit += 1 + case i: InsertIntoHadoopFsRelationCommand + if i.query.isInstanceOf[InMemoryRelation] => + numTotalCachedHit += 1 + case _ => + } + } + } + spark.listenerManager.register(listener) + + val udf1 = udf({ (x: Int, y: Int) => x + y }) + val df = spark.range(0, 3).toDF("a") + .withColumn("b", udf1($"a", lit(10))) + df.cache() + df.write.saveAsTable("t") + assert(numTotalCachedHit == 1, "expected to be cached in saveAsTable") + df.write.insertInto("t") + assert(numTotalCachedHit == 2, "expected to be cached in insertInto") + df.write.save(path.getCanonicalPath) + assert(numTotalCachedHit == 3, "expected to be cached in save for native") + } + } + } + test("SPARK-24891 Fix HandleNullInputsForUDF rule") { val udf1 = udf({(x: Int, y: Int) => x + y}) val df = spark.range(0, 3).toDF("a") From d5f340f27706bd9767f23ac9726f904028916814 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Fri, 27 Jul 2018 08:57:48 -0700 Subject: [PATCH 538/566] [SPARK-24927][BUILD][BRANCH-2.3] The scope of snappy-java cannot be "provided" ## What changes were proposed in this pull request? Please see [SPARK-24927][1] for more details. [1]: https://issues.apache.org/jira/browse/SPARK-24927 ## How was this patch tested? Manually tested. Author: Cheng Lian Closes #21879 from liancheng/spark-24927. 
--- pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/pom.xml b/pom.xml index ea6f27c367445..179e9d2bb78a7 100644 --- a/pom.xml +++ b/pom.xml @@ -536,7 +536,6 @@ org.xerial.snappy snappy-java ${snappy.version} - ${hadoop.deps.scope} org.lz4 From 71eb7d4682a7e85e4de580ffe110da961d84817f Mon Sep 17 00:00:00 2001 From: liulijia Date: Sun, 29 Jul 2018 13:13:00 -0700 Subject: [PATCH 539/566] [SPARK-24809][SQL] Serializing LongToUnsafeRowMap in executor may result in data error When join key is long or int in broadcast join, Spark will use `LongToUnsafeRowMap` to store key-values of the table witch will be broadcasted. But, when `LongToUnsafeRowMap` is broadcasted to executors, and it is too big to hold in memory, it will be stored in disk. At that time, because `write` uses a variable `cursor` to determine how many bytes in `page` of `LongToUnsafeRowMap` will be write out and the `cursor` was not restore when deserializing, executor will write out nothing from page into disk. ## What changes were proposed in this pull request? Restore cursor value when deserializing. Author: liulijia Closes #21772 from liutang123/SPARK-24809. (cherry picked from commit 2c54aae1bc2fa3da26917c89e6201fb2108d9fab) Signed-off-by: Xiao Li --- .../sql/execution/joins/HashedRelation.scala | 2 ++ .../execution/joins/HashedRelationSuite.scala | 29 +++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index 20ce01f4ce8cc..86eb47a70f1ad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -772,6 +772,8 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap array = readLongArray(readBuffer, length) val pageLength = readLong().toInt page = readLongArray(readBuffer, pageLength) + // Restore cursor variable to make this map able to be serialized again on executors. 
+ cursor = pageLength * 8 + Platform.LONG_ARRAY_OFFSET } override def readExternal(in: ObjectInput): Unit = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala index 037cc2e3ccad7..d9b34dcd16476 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala @@ -278,6 +278,35 @@ class HashedRelationSuite extends SparkFunSuite with SharedSQLContext { map.free() } + test("SPARK-24809: Serializing LongToUnsafeRowMap in executor may result in data error") { + val unsafeProj = UnsafeProjection.create(Array[DataType](LongType)) + val originalMap = new LongToUnsafeRowMap(mm, 1) + + val key1 = 1L + val value1 = 4852306286022334418L + + val key2 = 2L + val value2 = 8813607448788216010L + + originalMap.append(key1, unsafeProj(InternalRow(value1))) + originalMap.append(key2, unsafeProj(InternalRow(value2))) + originalMap.optimize() + + val ser = sparkContext.env.serializer.newInstance() + // Simulate serialize/deserialize twice on driver and executor + val firstTimeSerialized = ser.deserialize[LongToUnsafeRowMap](ser.serialize(originalMap)) + val secondTimeSerialized = + ser.deserialize[LongToUnsafeRowMap](ser.serialize(firstTimeSerialized)) + + val resultRow = new UnsafeRow(1) + assert(secondTimeSerialized.getValue(key1, resultRow).getLong(0) === value1) + assert(secondTimeSerialized.getValue(key2, resultRow).getLong(0) === value2) + + originalMap.free() + firstTimeSerialized.free() + secondTimeSerialized.free() + } + test("Spark-14521") { val ser = new KryoSerializer( (new SparkConf).set("spark.kryo.referenceTracking", "false")).newInstance() From bad56bb7b2340d338eac8cea07e9f1bb3e08b1ac Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Mon, 30 Jul 2018 09:58:28 +0800 Subject: [PATCH 540/566] [MINOR][CORE][TEST] Fix afterEach() in TastSetManagerSuite and TaskSchedulerImplSuite ## What changes were proposed in this pull request? In the `afterEach()` method of both `TastSetManagerSuite` and `TaskSchedulerImplSuite`, `super.afterEach()` shall be called at the end, because it shall stop the SparkContext. https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/93706/testReport/org.apache.spark.scheduler/TaskSchedulerImplSuite/_It_is_not_a_test_it_is_a_sbt_testing_SuiteSelector_/ The test failure is caused by the above reason, the newly added `barrierCoordinator` required `rpcEnv` which has been stopped before `TaskSchedulerImpl` doing cleanup. ## How was this patch tested? Existing tests. Author: Xingbo Jiang Closes #21908 from jiangxb1987/afterEach. 
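As an aside, the general ScalaTest shape this enforces looks roughly like the following schematic sketch (names are illustrative, not the actual suite code): clean up suite-local objects first, and call `super.afterEach()` last, so shared fixtures such as the SparkContext are stopped only after everything that depends on them.

```scala
import org.scalatest.{BeforeAndAfterEach, FunSuite}

class ExampleSuite extends FunSuite with BeforeAndAfterEach {
  private var scheduler: AutoCloseable = _   // stands in for suite-local state

  override def afterEach(): Unit = {
    // Tear down objects that still rely on shared fixtures (e.g. the RpcEnv) first...
    if (scheduler != null) {
      scheduler.close()
      scheduler = null
    }
    // ...and only then defer to the parent; in the real suites this is where
    // LocalSparkContext stops the shared SparkContext.
    super.afterEach()
  }
}
```
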
(cherry picked from commit 3695ba57731a669ed20e7f676edee602c292fbed) Signed-off-by: hyukjinkwon --- .../org/apache/spark/scheduler/TaskSchedulerImplSuite.scala | 2 +- .../scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index 33f2ea1c94e75..a9e0aed79c6d8 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala @@ -62,7 +62,6 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B } override def afterEach(): Unit = { - super.afterEach() if (taskScheduler != null) { taskScheduler.stop() taskScheduler = null @@ -71,6 +70,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B dagScheduler.stop() dagScheduler = null } + super.afterEach() } def setupScheduler(confs: (String, String)*): TaskSchedulerImpl = { diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index b4acccf3bc649..d75c245003442 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -178,12 +178,12 @@ class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logg } override def afterEach(): Unit = { - super.afterEach() if (sched != null) { sched.dagScheduler.stop() sched.stop() sched = null } + super.afterEach() } From aa51c070f8944fd2aa94ac891b45ff51ffcc1ef2 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 30 Jul 2018 13:20:03 +0800 Subject: [PATCH 541/566] [SPARK-24934][SQL] Explicitly whitelist supported types in upper/lower bounds for in-memory partition pruning ## What changes were proposed in this pull request? Looks we intentionally set `null` for upper/lower bounds for complex types and don't use it. However, these look used in in-memory partition pruning, which ends up with incorrect results. This PR proposes to explicitly whitelist the supported types. ```scala val df = Seq(Array("a", "b"), Array("c", "d")).toDF("arrayCol") df.cache().filter("arrayCol > array('a', 'b')").show() ``` ```scala val df = sql("select cast('a' as binary) as a") df.cache().filter("a == cast('a' as binary)").show() ``` **Before:** ``` +--------+ |arrayCol| +--------+ +--------+ ``` ``` +---+ | a| +---+ +---+ ``` **After:** ``` +--------+ |arrayCol| +--------+ | [c, d]| +--------+ ``` ``` +----+ | a| +----+ |[61]| +----+ ``` ## How was this patch tested? Unit tests were added and manually tested. Author: hyukjinkwon Closes #21882 from HyukjinKwon/stats-filter. 
(cherry picked from commit bfe60fcdb49aa48534060c38e36e06119900140d) Signed-off-by: Wenchen Fan --- .../columnar/InMemoryTableScanExec.scala | 42 +++++++++++++------ .../columnar/PartitionBatchPruningSuite.scala | 30 ++++++++++++- 2 files changed, 58 insertions(+), 14 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index 08b2751ba5789..7bed7e32c7afd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -183,6 +183,18 @@ case class InMemoryTableScanExec( private val stats = relation.partitionStatistics private def statsFor(a: Attribute) = stats.forAttribute(a) + // Currently, only use statistics from atomic types except binary type only. + private object ExtractableLiteral { + def unapply(expr: Expression): Option[Literal] = expr match { + case lit: Literal => lit.dataType match { + case BinaryType => None + case _: AtomicType => Some(lit) + case _ => None + } + case _ => None + } + } + // Returned filter predicate should return false iff it is impossible for the input expression // to evaluate to `true' based on statistics collected about this partition batch. @transient lazy val buildFilter: PartialFunction[Expression, Expression] = { @@ -194,33 +206,37 @@ case class InMemoryTableScanExec( if buildFilter.isDefinedAt(lhs) && buildFilter.isDefinedAt(rhs) => buildFilter(lhs) || buildFilter(rhs) - case EqualTo(a: AttributeReference, l: Literal) => + case EqualTo(a: AttributeReference, ExtractableLiteral(l)) => statsFor(a).lowerBound <= l && l <= statsFor(a).upperBound - case EqualTo(l: Literal, a: AttributeReference) => + case EqualTo(ExtractableLiteral(l), a: AttributeReference) => statsFor(a).lowerBound <= l && l <= statsFor(a).upperBound - case EqualNullSafe(a: AttributeReference, l: Literal) => + case EqualNullSafe(a: AttributeReference, ExtractableLiteral(l)) => statsFor(a).lowerBound <= l && l <= statsFor(a).upperBound - case EqualNullSafe(l: Literal, a: AttributeReference) => + case EqualNullSafe(ExtractableLiteral(l), a: AttributeReference) => statsFor(a).lowerBound <= l && l <= statsFor(a).upperBound - case LessThan(a: AttributeReference, l: Literal) => statsFor(a).lowerBound < l - case LessThan(l: Literal, a: AttributeReference) => l < statsFor(a).upperBound + case LessThan(a: AttributeReference, ExtractableLiteral(l)) => statsFor(a).lowerBound < l + case LessThan(ExtractableLiteral(l), a: AttributeReference) => l < statsFor(a).upperBound - case LessThanOrEqual(a: AttributeReference, l: Literal) => statsFor(a).lowerBound <= l - case LessThanOrEqual(l: Literal, a: AttributeReference) => l <= statsFor(a).upperBound + case LessThanOrEqual(a: AttributeReference, ExtractableLiteral(l)) => + statsFor(a).lowerBound <= l + case LessThanOrEqual(ExtractableLiteral(l), a: AttributeReference) => + l <= statsFor(a).upperBound - case GreaterThan(a: AttributeReference, l: Literal) => l < statsFor(a).upperBound - case GreaterThan(l: Literal, a: AttributeReference) => statsFor(a).lowerBound < l + case GreaterThan(a: AttributeReference, ExtractableLiteral(l)) => l < statsFor(a).upperBound + case GreaterThan(ExtractableLiteral(l), a: AttributeReference) => statsFor(a).lowerBound < l - case GreaterThanOrEqual(a: AttributeReference, l: Literal) => l <= statsFor(a).upperBound - case 
GreaterThanOrEqual(l: Literal, a: AttributeReference) => statsFor(a).lowerBound <= l + case GreaterThanOrEqual(a: AttributeReference, ExtractableLiteral(l)) => + l <= statsFor(a).upperBound + case GreaterThanOrEqual(ExtractableLiteral(l), a: AttributeReference) => + statsFor(a).lowerBound <= l case IsNull(a: Attribute) => statsFor(a).nullCount > 0 case IsNotNull(a: Attribute) => statsFor(a).count - statsFor(a).nullCount > 0 case In(a: AttributeReference, list: Seq[Expression]) - if list.forall(_.isInstanceOf[Literal]) && list.nonEmpty => + if list.forall(ExtractableLiteral.unapply(_).isDefined) && list.nonEmpty => list.map(l => statsFor(a).lowerBound <= l.asInstanceOf[Literal] && l.asInstanceOf[Literal] <= statsFor(a).upperBound).reduce(_ || _) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala index 9d862cfdecb21..af493e93b5192 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.columnar import org.scalatest.BeforeAndAfterEach import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.DataFrame import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.SQLTestData._ @@ -35,6 +36,12 @@ class PartitionBatchPruningSuite private lazy val originalColumnBatchSize = spark.conf.get(SQLConf.COLUMN_BATCH_SIZE) private lazy val originalInMemoryPartitionPruning = spark.conf.get(SQLConf.IN_MEMORY_PARTITION_PRUNING) + private val testArrayData = (1 to 100).map { key => + Tuple1(Array.fill(key)(key)) + } + private val testBinaryData = (1 to 100).map { key => + Tuple1(Array.fill(key)(key.toByte)) + } override protected def beforeAll(): Unit = { super.beforeAll() @@ -71,12 +78,22 @@ class PartitionBatchPruningSuite }, 5).toDF() pruningStringData.createOrReplaceTempView("pruningStringData") spark.catalog.cacheTable("pruningStringData") + + val pruningArrayData = sparkContext.makeRDD(testArrayData, 5).toDF() + pruningArrayData.createOrReplaceTempView("pruningArrayData") + spark.catalog.cacheTable("pruningArrayData") + + val pruningBinaryData = sparkContext.makeRDD(testBinaryData, 5).toDF() + pruningBinaryData.createOrReplaceTempView("pruningBinaryData") + spark.catalog.cacheTable("pruningBinaryData") } override protected def afterEach(): Unit = { try { spark.catalog.uncacheTable("pruningData") spark.catalog.uncacheTable("pruningStringData") + spark.catalog.uncacheTable("pruningArrayData") + spark.catalog.uncacheTable("pruningBinaryData") } finally { super.afterEach() } @@ -95,6 +112,14 @@ class PartitionBatchPruningSuite checkBatchPruning("SELECT key FROM pruningData WHERE 11 >= key", 1, 2)(1 to 11) checkBatchPruning("SELECT key FROM pruningData WHERE 88 < key", 1, 2)(89 to 100) checkBatchPruning("SELECT key FROM pruningData WHERE 89 <= key", 1, 2)(89 to 100) + // Do not filter on array type + checkBatchPruning("SELECT _1 FROM pruningArrayData WHERE _1 = array(1)", 5, 10)(Seq(Array(1))) + checkBatchPruning("SELECT _1 FROM pruningArrayData WHERE _1 <= array(1)", 5, 10)(Seq(Array(1))) + checkBatchPruning("SELECT _1 FROM pruningArrayData WHERE _1 >= array(1)", 5, 10)( + testArrayData.map(_._1)) + // Do not filter on binary type + checkBatchPruning( + "SELECT _1 
FROM pruningBinaryData WHERE _1 == binary(chr(1))", 5, 10)(Seq(Array(1.toByte))) // IS NULL checkBatchPruning("SELECT key FROM pruningData WHERE value IS NULL", 5, 5) { @@ -131,6 +156,9 @@ class PartitionBatchPruningSuite checkBatchPruning( "SELECT CAST(s AS INT) FROM pruningStringData WHERE s IN ('99', '150', '201')", 1, 1)( Seq(150)) + // Do not filter on array type + checkBatchPruning("SELECT _1 FROM pruningArrayData WHERE _1 IN (array(1), array(2, 2))", 5, 10)( + Seq(Array(1), Array(2, 2))) // With unsupported `InSet` predicate { @@ -161,7 +189,7 @@ class PartitionBatchPruningSuite query: String, expectedReadPartitions: Int, expectedReadBatches: Int)( - expectedQueryResult: => Seq[Int]): Unit = { + expectedQueryResult: => Seq[Any]): Unit = { test(query) { val df = sql(query) From 25ea27b09147ba1e5bd3ba654bce35d7008fa607 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Mon, 30 Jul 2018 20:53:45 +0800 Subject: [PATCH 542/566] [SPARK-24957][SQL] Average with decimal followed by aggregation returns wrong result ## What changes were proposed in this pull request? When we do an average, the result is computed dividing the sum of the values by their count. In the case the result is a DecimalType, the way we are casting/managing the precision and scale is not really optimized and it is not coherent with what we do normally. In particular, a problem can happen when the `Divide` operand returns a result which contains a precision and scale different by the ones which are expected as output of the `Divide` operand. In the case reported in the JIRA, for instance, the result of the `Divide` operand is a `Decimal(38, 36)`, while the output data type for `Divide` is 38, 22. This is not an issue when the `Divide` is followed by a `CheckOverflow` or a `Cast` to the right data type, as these operations return a decimal with the defined precision and scale. Despite in the `Average` operator we do have a `Cast`, this may be bypassed if the result of `Divide` is the same type which it is casted to, hence the issue reported in the JIRA may arise. The PR proposes to use the normal rules/handling of the arithmetic operators with Decimal data type, so we both reuse the existing code (having a single logic for operations between decimals) and we fix this problem as the result is always guarded by `CheckOverflow`. ## How was this patch tested? added UT Author: Marco Gaido Closes #21910 from mgaido91/SPARK-24957. (cherry picked from commit 85505fc8a58ca229bbaf240c6bc23ea876d594db) Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/DecimalPrecision.scala | 2 +- .../catalyst/expressions/aggregate/Average.scala | 9 ++++----- .../sql/hive/execution/AggregationQuerySuite.scala | 13 +++++++++++++ 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala index ab63131b07573..e6f4d979eb264 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala @@ -89,7 +89,7 @@ object DecimalPrecision extends TypeCoercionRule { } /** Decimal precision promotion for +, -, *, /, %, pmod, and binary comparison. 
*/ - private val decimalAndDecimal: PartialFunction[Expression, Expression] = { + private[catalyst] val decimalAndDecimal: PartialFunction[Expression, Expression] = { // Skip nodes whose children have not been resolved yet case e if !e.childrenResolved => e diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala index 708bdbfc36058..f80df75ac7f72 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.expressions.aggregate -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.{DecimalPrecision, TypeCheckResult} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util.TypeUtils @@ -77,10 +77,9 @@ case class Average(child: Expression) extends DeclarativeAggregate with Implicit // If all input are nulls, count will be 0 and we will get null after the division. override lazy val evaluateExpression = child.dataType match { - case DecimalType.Fixed(p, s) => - // increase the precision and scale to prevent precision loss - val dt = DecimalType.bounded(p + 14, s + 4) - Cast(Cast(sum, dt) / Cast(count, DecimalType.bounded(DecimalType.MAX_PRECISION, 0)), + case _: DecimalType => + Cast( + DecimalPrecision.decimalAndDecimal(sum / Cast(count, DecimalType.LongDecimal)), resultType) case _ => Cast(sum, resultType) / Cast(count, resultType) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala index ae675149df5e2..c65bf7c14c7a5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala @@ -1005,6 +1005,19 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te ) ) } + + test("SPARK-24957: average with decimal followed by aggregation returning wrong result") { + val df = Seq(("a", BigDecimal("12.0")), + ("a", BigDecimal("12.0")), + ("a", BigDecimal("11.9999999988")), + ("a", BigDecimal("12.0")), + ("a", BigDecimal("12.0")), + ("a", BigDecimal("11.9999999988")), + ("a", BigDecimal("11.9999999988"))).toDF("text", "number") + val agg1 = df.groupBy($"text").agg(avg($"number").as("avg_res")) + val agg2 = agg1.groupBy($"text").agg(sum($"avg_res")) + checkAnswer(agg2, Row("a", BigDecimal("11.9999999994857142860000"))) + } } From fc3df45177d176cc0fe43049b6f8df372f7ea0e0 Mon Sep 17 00:00:00 2001 From: Mauro Palsgraaf Date: Tue, 31 Jul 2018 08:18:08 -0700 Subject: [PATCH 543/566] [SPARK-24536] Validate that an evaluated limit clause cannot be null It proposes a version in which nullable expressions are not valid in the limit clause It was tested with unit and e2e tests. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Mauro Palsgraaf Closes #21807 from mauropalsgraaf/SPARK-24536. 
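As a quick, hedged illustration of the new behaviour from spark-shell (this assumes a session built with this patch; the view name `t` is arbitrary, and the expected message is the one added to the golden output below), a check might look like:

```scala
import org.apache.spark.sql.AnalysisException

spark.range(10).createOrReplaceTempView("t")

// A foldable limit that evaluates to null is now rejected during analysis
// instead of slipping past the old `e.eval().asInstanceOf[Int] < 0` check.
try {
  spark.sql("SELECT * FROM t LIMIT CAST(NULL AS INT)").show()
} catch {
  case e: AnalysisException =>
    // Expected (as in the new limit.sql.out golden output below):
    // "The evaluated limit expression must not be null, but got CAST(NULL AS INT);"
    println(e.getMessage)
}

// Ordinary non-negative integer limits keep working unchanged.
spark.sql("SELECT * FROM t LIMIT 3").show()
```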
(cherry picked from commit 4ac2126bc64bad1b4cbe1c697b4bcafacd67c96c) Signed-off-by: Xiao Li --- .../sql/catalyst/analysis/CheckAnalysis.scala | 14 +++--- .../analysis/AnalysisErrorSuite.scala | 6 +++ .../test/resources/sql-tests/inputs/limit.sql | 5 +++ .../resources/sql-tests/results/limit.sql.out | 45 +++++++++++++------ 4 files changed, 51 insertions(+), 19 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 0d189b4fd7743..beb11d7354f0d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -66,11 +66,15 @@ trait CheckAnalysis extends PredicateHelper { limitExpr.sql) case e if e.dataType != IntegerType => failAnalysis( s"The limit expression must be integer type, but got " + - e.dataType.simpleString) - case e if e.eval().asInstanceOf[Int] < 0 => failAnalysis( - "The limit expression must be equal to or greater than 0, but got " + - e.eval().asInstanceOf[Int]) - case e => // OK + e.dataType.catalogString) + case e => + e.eval() match { + case null => failAnalysis( + s"The evaluated limit expression must not be null, but got ${limitExpr.sql}") + case v: Int if v < 0 => failAnalysis( + s"The limit expression must be equal to or greater than 0, but got $v") + case _ => // OK + } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index 5d2f8e735e3d4..70325b8b64fd3 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -392,6 +392,12 @@ class AnalysisErrorSuite extends AnalysisTest { "Generators are not supported outside the SELECT clause, but got: Sort" :: Nil ) + errorTest( + "an evaluated limit class must not be null", + testRelation.limit(Literal(null, IntegerType)), + "The evaluated limit expression must not be null, but got " :: Nil + ) + errorTest( "num_rows in limit clause must be equal to or greater than 0", listRelation.limit(-1), diff --git a/sql/core/src/test/resources/sql-tests/inputs/limit.sql b/sql/core/src/test/resources/sql-tests/inputs/limit.sql index f21912a042716..b4c73cf33e53a 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/limit.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/limit.sql @@ -13,6 +13,11 @@ SELECT * FROM testdata LIMIT CAST(1 AS int); SELECT * FROM testdata LIMIT -1; SELECT * FROM testData TABLESAMPLE (-1 ROWS); + +SELECT * FROM testdata LIMIT CAST(1 AS INT); +-- evaluated limit must not be null +SELECT * FROM testdata LIMIT CAST(NULL AS INT); + -- limit must be foldable SELECT * FROM testdata LIMIT key > 3; diff --git a/sql/core/src/test/resources/sql-tests/results/limit.sql.out b/sql/core/src/test/resources/sql-tests/results/limit.sql.out index 146abe6cbd058..02fe1de84f753 100644 --- a/sql/core/src/test/resources/sql-tests/results/limit.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/limit.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 12 +-- Number of queries: 14 -- !query 0 @@ -66,44 +66,61 @@ The limit expression must be equal to or greater than 0, but got -1; -- !query 7 -SELECT * FROM 
testdata LIMIT key > 3 +SELECT * FROM testdata LIMIT CAST(1 AS INT) -- !query 7 schema -struct<> +struct -- !query 7 output -org.apache.spark.sql.AnalysisException -The limit expression must evaluate to a constant value, but got (testdata.`key` > 3); +1 1 -- !query 8 -SELECT * FROM testdata LIMIT true +SELECT * FROM testdata LIMIT CAST(NULL AS INT) -- !query 8 schema struct<> -- !query 8 output org.apache.spark.sql.AnalysisException -The limit expression must be integer type, but got boolean; +The evaluated limit expression must not be null, but got CAST(NULL AS INT); -- !query 9 -SELECT * FROM testdata LIMIT 'a' +SELECT * FROM testdata LIMIT key > 3 -- !query 9 schema struct<> -- !query 9 output org.apache.spark.sql.AnalysisException -The limit expression must be integer type, but got string; +The limit expression must evaluate to a constant value, but got (testdata.`key` > 3); -- !query 10 -SELECT * FROM (SELECT * FROM range(10) LIMIT 5) WHERE id > 3 +SELECT * FROM testdata LIMIT true -- !query 10 schema -struct +struct<> -- !query 10 output -4 +org.apache.spark.sql.AnalysisException +The limit expression must be integer type, but got boolean; -- !query 11 -SELECT * FROM testdata WHERE key < 3 LIMIT ALL +SELECT * FROM testdata LIMIT 'a' -- !query 11 schema -struct +struct<> -- !query 11 output +org.apache.spark.sql.AnalysisException +The limit expression must be integer type, but got string; + + +-- !query 12 +SELECT * FROM (SELECT * FROM range(10) LIMIT 5) WHERE id > 3 +-- !query 12 schema +struct +-- !query 12 output +4 + + +-- !query 13 +SELECT * FROM testdata WHERE key < 3 LIMIT ALL +-- !query 13 schema +struct +-- !query 13 output 1 1 2 2 From 5b187a85a24c788e19742e03f1300662d475bab8 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 31 Jul 2018 17:24:24 -0700 Subject: [PATCH 544/566] [SPARK-24976][PYTHON] Allow None for Decimal type conversion (specific to PyArrow 0.9.0) ## What changes were proposed in this pull request? See [ARROW-2432](https://jira.apache.org/jira/browse/ARROW-2432). Seems using `from_pandas` to convert decimals fails if encounters a value of `None`: ```python import pyarrow as pa import pandas as pd from decimal import Decimal pa.Array.from_pandas(pd.Series([Decimal('3.14'), None]), type=pa.decimal128(3, 2)) ``` **Arrow 0.8.0** ``` [ Decimal('3.14'), NA ] ``` **Arrow 0.9.0** ``` Traceback (most recent call last): File "", line 1, in File "array.pxi", line 383, in pyarrow.lib.Array.from_pandas File "array.pxi", line 177, in pyarrow.lib.array File "error.pxi", line 77, in pyarrow.lib.check_status File "error.pxi", line 77, in pyarrow.lib.check_status pyarrow.lib.ArrowInvalid: Error converting from Python objects to Decimal: Got Python object of type NoneType but can only handle these types: decimal.Decimal ``` This PR propose to work around this via Decimal NaN: ```python pa.Array.from_pandas(pd.Series([Decimal('3.14'), Decimal('NaN')]), type=pa.decimal128(3, 2)) ``` ``` [ Decimal('3.14'), NA ] ``` ## How was this patch tested? 
Manually tested: ```bash SPARK_TESTING=1 ./bin/pyspark pyspark.sql.tests ScalarPandasUDFTests ``` **Before** ``` Traceback (most recent call last): File "/.../spark/python/pyspark/sql/tests.py", line 4672, in test_vectorized_udf_null_decimal self.assertEquals(df.collect(), res.collect()) File "/.../spark/python/pyspark/sql/dataframe.py", line 533, in collect sock_info = self._jdf.collectToPython() File "/.../spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__ answer, self.gateway_client, self.target_id, self.name) File "/.../spark/python/pyspark/sql/utils.py", line 63, in deco return f(*a, **kw) File "/.../spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value format(target_id, ".", name), value) Py4JJavaError: An error occurred while calling o51.collectToPython. : org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 1.0 failed 1 times, most recent failure: Lost task 3.0 in stage 1.0 (TID 7, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last): File "/.../spark/python/pyspark/worker.py", line 320, in main process() File "/.../spark/python/pyspark/worker.py", line 315, in process serializer.dump_stream(func(split_index, iterator), outfile) File "/.../spark/python/pyspark/serializers.py", line 274, in dump_stream batch = _create_batch(series, self._timezone) File "/.../spark/python/pyspark/serializers.py", line 243, in _create_batch arrs = [create_array(s, t) for s, t in series] File "/.../spark/python/pyspark/serializers.py", line 241, in create_array return pa.Array.from_pandas(s, mask=mask, type=t) File "array.pxi", line 383, in pyarrow.lib.Array.from_pandas File "array.pxi", line 177, in pyarrow.lib.array File "error.pxi", line 77, in pyarrow.lib.check_status File "error.pxi", line 77, in pyarrow.lib.check_status ArrowInvalid: Error converting from Python objects to Decimal: Got Python object of type NoneType but can only handle these types: decimal.Decimal ``` **After** ``` Running tests... ---------------------------------------------------------------------- Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). .......S............................. ---------------------------------------------------------------------- Ran 37 tests in 21.980s ``` Author: hyukjinkwon Closes #21928 from HyukjinKwon/SPARK-24976. (cherry picked from commit f4772fd26f32b11ae54e7721924b5cf6eb27298a) Signed-off-by: Bryan Cutler --- python/pyspark/serializers.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 6d107f3069dc5..52a7afe2e89c8 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -215,9 +215,10 @@ def _create_batch(series, timezone): :param timezone: A timezone to respect when handling timestamp values :return: Arrow RecordBatch """ - - from pyspark.sql.types import _check_series_convert_timestamps_internal + import decimal + from distutils.version import LooseVersion import pyarrow as pa + from pyspark.sql.types import _check_series_convert_timestamps_internal # Make input conform to [(series1, type1), (series2, type2), ...] 
if not isinstance(series, (list, tuple)) or \ (len(series) == 2 and isinstance(series[1], pa.DataType)): @@ -235,6 +236,11 @@ def create_array(s, t): # TODO: need decode before converting to Arrow in Python 2 return pa.Array.from_pandas(s.apply( lambda v: v.decode("utf-8") if isinstance(v, str) else v), mask=mask, type=t) + elif t is not None and pa.types.is_decimal(t) and \ + LooseVersion("0.9.0") <= LooseVersion(pa.__version__) < LooseVersion("0.10.0"): + # TODO: see ARROW-2432. Remove when the minimum PyArrow version becomes 0.10.0. + return pa.Array.from_pandas(s.apply( + lambda v: decimal.Decimal('NaN') if v is None else v), mask=mask, type=t) return pa.Array.from_pandas(s, mask=mask, type=t) arrs = [create_array(s, t) for s, t in series] From 8080c937d3752aee2fd36f0045a057f7130f6fe4 Mon Sep 17 00:00:00 2001 From: LucaCanali Date: Wed, 18 Jul 2018 23:19:02 +0200 Subject: [PATCH 545/566] [PYSPARK] Updates to Accumulators (cherry picked from commit 15fc2372269159ea2556b028d4eb8860c4108650) --- .../apache/spark/api/python/PythonRDD.scala | 12 +++-- python/pyspark/accumulators.py | 53 ++++++++++++++----- python/pyspark/context.py | 5 +- 3 files changed, 53 insertions(+), 17 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index a1ee2f7d1b119..8bc0ff7936daf 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -586,8 +586,9 @@ class BytesToString extends org.apache.spark.api.java.function.Function[Array[By */ private[spark] class PythonAccumulatorV2( @transient private val serverHost: String, - private val serverPort: Int) - extends CollectionAccumulator[Array[Byte]] { + private val serverPort: Int, + private val secretToken: String) + extends CollectionAccumulator[Array[Byte]] with Logging{ Utils.checkHost(serverHost) @@ -602,12 +603,17 @@ private[spark] class PythonAccumulatorV2( private def openSocket(): Socket = synchronized { if (socket == null || socket.isClosed) { socket = new Socket(serverHost, serverPort) + logInfo(s"Connected to AccumulatorServer at host: $serverHost port: $serverPort") + // send the secret just for the initial authentication when opening a new connection + socket.getOutputStream.write(secretToken.getBytes(StandardCharsets.UTF_8)) } socket } // Need to override so the types match with PythonFunction - override def copyAndReset(): PythonAccumulatorV2 = new PythonAccumulatorV2(serverHost, serverPort) + override def copyAndReset(): PythonAccumulatorV2 = { + new PythonAccumulatorV2(serverHost, serverPort, secretToken) + } override def merge(other: AccumulatorV2[Array[Byte], JList[Array[Byte]]]): Unit = synchronized { val otherPythonAccumulator = other.asInstanceOf[PythonAccumulatorV2] diff --git a/python/pyspark/accumulators.py b/python/pyspark/accumulators.py index 6ef8cf53cc747..bc0be07bfb36e 100644 --- a/python/pyspark/accumulators.py +++ b/python/pyspark/accumulators.py @@ -228,20 +228,49 @@ class _UpdateRequestHandler(SocketServer.StreamRequestHandler): def handle(self): from pyspark.accumulators import _accumulatorRegistry - while not self.server.server_shutdown: - # Poll every 1 second for new data -- don't block in case of shutdown. 
- r, _, _ = select.select([self.rfile], [], [], 1) - if self.rfile in r: - num_updates = read_int(self.rfile) - for _ in range(num_updates): - (aid, update) = pickleSer._read_with_length(self.rfile) - _accumulatorRegistry[aid] += update - # Write a byte in acknowledgement - self.wfile.write(struct.pack("!b", 1)) + auth_token = self.server.auth_token + + def poll(func): + while not self.server.server_shutdown: + # Poll every 1 second for new data -- don't block in case of shutdown. + r, _, _ = select.select([self.rfile], [], [], 1) + if self.rfile in r: + if func(): + break + + def accum_updates(): + num_updates = read_int(self.rfile) + for _ in range(num_updates): + (aid, update) = pickleSer._read_with_length(self.rfile) + _accumulatorRegistry[aid] += update + # Write a byte in acknowledgement + self.wfile.write(struct.pack("!b", 1)) + return False + + def authenticate_and_accum_updates(): + received_token = self.rfile.read(len(auth_token)) + if isinstance(received_token, bytes): + received_token = received_token.decode("utf-8") + if (received_token == auth_token): + accum_updates() + # we've authenticated, we can break out of the first loop now + return True + else: + raise Exception( + "The value of the provided token to the AccumulatorServer is not correct.") + + # first we keep polling till we've received the authentication token + poll(authenticate_and_accum_updates) + # now we've authenticated, don't need to check for the token anymore + poll(accum_updates) class AccumulatorServer(SocketServer.TCPServer): + def __init__(self, server_address, RequestHandlerClass, auth_token): + SocketServer.TCPServer.__init__(self, server_address, RequestHandlerClass) + self.auth_token = auth_token + """ A simple TCP server that intercepts shutdown() in order to interrupt our continuous polling on the handler. @@ -254,9 +283,9 @@ def shutdown(self): self.server_close() -def _start_update_server(): +def _start_update_server(auth_token): """Start a TCP server to receive accumulator updates in a daemon thread, and returns it""" - server = AccumulatorServer(("localhost", 0), _UpdateRequestHandler) + server = AccumulatorServer(("localhost", 0), _UpdateRequestHandler, auth_token) thread = threading.Thread(target=server.serve_forever) thread.daemon = True thread.start() diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 880559719d3a5..667d0a310161d 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -183,9 +183,10 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, # Create a single Accumulator in Java that we'll send all our updates through; # they will be passed back to us through a TCP server - self._accumulatorServer = accumulators._start_update_server() + auth_token = self._gateway.gateway_parameters.auth_token + self._accumulatorServer = accumulators._start_update_server(auth_token) (host, port) = self._accumulatorServer.server_address - self._javaAccumulator = self._jvm.PythonAccumulatorV2(host, port) + self._javaAccumulator = self._jvm.PythonAccumulatorV2(host, port, auth_token) self._jsc.sc().register(self._javaAccumulator) self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python') From 14b50d7fee58d56cb8843994b1a423a6b475dcb5 Mon Sep 17 00:00:00 2001 From: Yuval Itzchakov Date: Sat, 4 Aug 2018 14:44:10 -0500 Subject: [PATCH 546/566] [SPARK-24987][SS] - Fix Kafka consumer leak when no new offsets for TopicPartition ## What changes were proposed in this pull request? 
This small fix adds a `consumer.release()` call to `KafkaSourceRDD` in the case where we've retrieved offsets from Kafka, but the `fromOffset` is equal to the `lastOffset`, meaning there is no new data to read for a particular topic partition. Up until now, we'd just return an empty iterator without closing the consumer which would cause a FD leak. If accepted, this pull request should be merged into master as well. ## How was this patch tested? Haven't ran any specific tests, would love help on how to test methods running inside `RDD.compute`. Author: Yuval Itzchakov Closes #21997 from YuvalItzchakov/master. (cherry picked from commit b7fdf8eb2011ae76f0161caa9da91e29f52f05e4) Signed-off-by: cody koeninger --- .../scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala index 498e344ea39f4..ca05b9e8d39b9 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala @@ -124,8 +124,6 @@ private[kafka010] class KafkaSourceRDD( thePart: Partition, context: TaskContext): Iterator[ConsumerRecord[Array[Byte], Array[Byte]]] = { val sourcePartition = thePart.asInstanceOf[KafkaSourceRDDPartition] - val topic = sourcePartition.offsetRange.topic - val kafkaPartition = sourcePartition.offsetRange.partition val consumer = KafkaDataConsumer.acquire( sourcePartition.offsetRange.topicPartition, executorKafkaParams, reuseKafkaConsumer) @@ -138,6 +136,7 @@ private[kafka010] class KafkaSourceRDD( if (range.fromOffset == range.untilOffset) { logInfo(s"Beginning offset ${range.fromOffset} is the same as ending offset " + s"skipping ${range.topic} ${range.partition}") + consumer.release() Iterator.empty } else { val underlying = new NextIterator[ConsumerRecord[Array[Byte], Array[Byte]]]() { From 136588e95f69923a04458abe4862d336e5244c84 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sat, 4 Aug 2018 14:59:13 -0500 Subject: [PATCH 547/566] [SPARK-25015][BUILD] Update Hadoop 2.7 to 2.7.7 ## What changes were proposed in this pull request? Update Hadoop 2.7 to 2.7.7 to pull in bug and security fixes. ## How was this patch tested? Existing tests. Author: Sean Owen Closes #21987 from srowen/SPARK-25015. (cherry picked from commit 5f9633dc97ad5f78dd17cad39945ea32f3441f06) Signed-off-by: Sean Owen --- assembly/README | 2 +- dev/deps/spark-deps-hadoop-2.7 | 31 ++++++++++++++++--------------- docs/building-spark.md | 2 +- pom.xml | 2 +- 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/assembly/README b/assembly/README index d5dafab477410..affd281a1385c 100644 --- a/assembly/README +++ b/assembly/README @@ -9,4 +9,4 @@ This module is off by default. 
To activate it specify the profile in the command If you need to build an assembly for a different version of Hadoop the hadoop-version system property needs to be set as in this example: - -Dhadoop.version=2.7.3 + -Dhadoop.version=2.7.7 diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index df2be777ff5ac..6a3b38b91981e 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -66,21 +66,21 @@ gson-2.2.4.jar guava-14.0.1.jar guice-3.0.jar guice-servlet-3.0.jar -hadoop-annotations-2.7.3.jar -hadoop-auth-2.7.3.jar -hadoop-client-2.7.3.jar -hadoop-common-2.7.3.jar -hadoop-hdfs-2.7.3.jar -hadoop-mapreduce-client-app-2.7.3.jar -hadoop-mapreduce-client-common-2.7.3.jar -hadoop-mapreduce-client-core-2.7.3.jar -hadoop-mapreduce-client-jobclient-2.7.3.jar -hadoop-mapreduce-client-shuffle-2.7.3.jar -hadoop-yarn-api-2.7.3.jar -hadoop-yarn-client-2.7.3.jar -hadoop-yarn-common-2.7.3.jar -hadoop-yarn-server-common-2.7.3.jar -hadoop-yarn-server-web-proxy-2.7.3.jar +hadoop-annotations-2.7.7.jar +hadoop-auth-2.7.7.jar +hadoop-client-2.7.7.jar +hadoop-common-2.7.7.jar +hadoop-hdfs-2.7.7.jar +hadoop-mapreduce-client-app-2.7.7.jar +hadoop-mapreduce-client-common-2.7.7.jar +hadoop-mapreduce-client-core-2.7.7.jar +hadoop-mapreduce-client-jobclient-2.7.7.jar +hadoop-mapreduce-client-shuffle-2.7.7.jar +hadoop-yarn-api-2.7.7.jar +hadoop-yarn-client-2.7.7.jar +hadoop-yarn-common-2.7.7.jar +hadoop-yarn-server-common-2.7.7.jar +hadoop-yarn-server-web-proxy-2.7.7.jar hk2-api-2.4.0-b34.jar hk2-locator-2.4.0-b34.jar hk2-utils-2.4.0-b34.jar @@ -121,6 +121,7 @@ jersey-media-jaxb-2.22.2.jar jersey-server-2.22.2.jar jets3t-0.9.4.jar jetty-6.1.26.jar +jetty-sslengine-6.1.26.jar jetty-util-6.1.26.jar jline-2.12.1.jar joda-time-2.9.3.jar diff --git a/docs/building-spark.md b/docs/building-spark.md index c391255a91596..9f78c04a5c8e7 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -67,7 +67,7 @@ Examples: ./build/mvn -Pyarn -DskipTests clean package # Apache Hadoop 2.7.X and later - ./build/mvn -Pyarn -Phadoop-2.7 -Dhadoop.version=2.7.3 -DskipTests clean package + ./build/mvn -Pyarn -Phadoop-2.7 -Dhadoop.version=2.7.7 -DskipTests clean package ## Building With Hive and JDBC Support diff --git a/pom.xml b/pom.xml index 179e9d2bb78a7..76e83630d9a7c 100644 --- a/pom.xml +++ b/pom.xml @@ -2664,7 +2664,7 @@ hadoop-2.7 - 2.7.3 + 2.7.7 2.7.1 From 9fb70f4586c1b5460ea261f22eb335be902753dc Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Wed, 8 Aug 2018 10:07:02 +0800 Subject: [PATCH 548/566] [SPARK-24948][SHS][BACKPORT-2.3] Delegate check access permissions to the file system ## What changes were proposed in this pull request? In `SparkHadoopUtil. checkAccessPermission`, we consider only basic permissions in order to check whether a user can access a file or not. This is not a complete check, as it ignores ACLs and other policies a file system may apply in its internal. So this can result in returning wrongly that a user cannot access a file (despite he actually can). The PR proposes to delegate to the filesystem the check whether a file is accessible or not, in order to return the right result. A caching layer is added for performance reasons. ## How was this patch tested? added UT Author: Marco Gaido Closes #22021 from mgaido91/SPARK-24948_2.3. 
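The heart of the change is a "just attempt the read, let the file system enforce its own policy, and remember denials for a while" pattern. The sketch below is a standalone, plain-Scala illustration of that pattern, not the FsHistoryProvider code itself: the class and method names are invented, and `SecurityException` stands in for Hadoop's `AccessControlException` so the snippet runs without any Hadoop jars on the classpath.

```scala
import java.util.concurrent.ConcurrentHashMap
import scala.collection.JavaConverters._

// Cache of paths the file system has refused to let us read, with expiry.
final class AccessDeniedCache(expireMs: Long, now: () => Long = () => System.currentTimeMillis()) {
  private val denied = new ConcurrentHashMap[String, Long]()

  def isDenied(path: String): Boolean = denied.containsKey(path)

  /** Runs `readOp`; if access is denied, the path is remembered and skipped next time. */
  def tryRead[T](path: String)(readOp: => T): Option[T] = {
    if (isDenied(path)) {
      None                                        // previously denied, skip without retrying
    } else {
      try {
        Some(readOp)
      } catch {
        case _: SecurityException =>              // stand-in for Hadoop's AccessControlException
          denied.put(path, now())
          None
      }
    }
  }

  /** Drops entries older than `expireMs`, mirroring the clearBlacklist() idea. */
  def expire(): Unit = {
    val threshold = now() - expireMs
    denied.asScala.retain((_, recordedAt) => recordedAt >= threshold)
  }
}

object AccessDeniedCacheDemo extends App {
  val cache = new AccessDeniedCache(expireMs = 24L * 60 * 60 * 1000)
  val result = cache.tryRead("/logs/app-1") {
    throw new SecurityException("Permission denied")  // simulate the FS rejecting the read
  }
  println(result)                                     // None
  println(cache.isDenied("/logs/app-1"))              // true: skipped on the next scan
}
```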
--- .../apache/spark/deploy/SparkHadoopUtil.scala | 23 ----- .../deploy/history/FsHistoryProvider.scala | 67 +++++++++---- .../spark/deploy/SparkHadoopUtilSuite.scala | 97 ------------------- .../history/FsHistoryProviderSuite.scala | 42 +++++++- 4 files changed, 89 insertions(+), 140 deletions(-) delete mode 100644 core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index 177295fb7af0f..dec1b5fafbaaa 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -31,7 +31,6 @@ import scala.util.control.NonFatal import com.google.common.primitives.Longs import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter} -import org.apache.hadoop.fs.permission.FsAction import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.hadoop.security.token.{Token, TokenIdentifier} @@ -378,28 +377,6 @@ class SparkHadoopUtil extends Logging { buffer.toString } - private[spark] def checkAccessPermission(status: FileStatus, mode: FsAction): Boolean = { - val perm = status.getPermission - val ugi = UserGroupInformation.getCurrentUser - - if (ugi.getShortUserName == status.getOwner) { - if (perm.getUserAction.implies(mode)) { - return true - } - } else if (ugi.getGroupNames.contains(status.getGroup)) { - if (perm.getGroupAction.implies(mode)) { - return true - } - } else if (perm.getOtherAction.implies(mode)) { - return true - } - - logDebug(s"Permission denied: user=${ugi.getShortUserName}, " + - s"path=${status.getPath}:${status.getOwner}:${status.getGroup}" + - s"${if (status.isDirectory) "d" else "-"}$perm") - false - } - def serialize(creds: Credentials): Array[Byte] = { val byteStream = new ByteArrayOutputStream val dataStream = new DataOutputStream(byteStream) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index ace6d9e00c838..23572ebf58f09 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -19,19 +19,19 @@ package org.apache.spark.deploy.history import java.io.{File, FileNotFoundException, IOException} import java.util.{Date, ServiceLoader, UUID} -import java.util.concurrent.{ExecutorService, TimeUnit} +import java.util.concurrent.{ConcurrentHashMap, ExecutorService, Future, TimeUnit} import java.util.zip.{ZipEntry, ZipOutputStream} import scala.collection.JavaConverters._ import scala.collection.mutable +import scala.concurrent.ExecutionException import scala.util.Try import scala.xml.Node import com.fasterxml.jackson.annotation.JsonIgnore import com.google.common.io.ByteStreams import com.google.common.util.concurrent.MoreExecutors -import org.apache.hadoop.fs.{FileStatus, Path} -import org.apache.hadoop.fs.permission.FsAction +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.hdfs.DistributedFileSystem import org.apache.hadoop.hdfs.protocol.HdfsConstants import org.apache.hadoop.security.AccessControlException @@ -111,7 +111,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) "; groups with admin permissions" + 
HISTORY_UI_ADMIN_ACLS_GROUPS.toString) private val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) - private val fs = new Path(logDir).getFileSystem(hadoopConf) + // Visible for testing + private[history] val fs: FileSystem = new Path(logDir).getFileSystem(hadoopConf) // Used by check event thread and clean log thread. // Scheduled thread pool size must be one, otherwise it will have concurrent issues about fs @@ -155,6 +156,25 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) new HistoryServerDiskManager(conf, path, listing, clock) } + private val blacklist = new ConcurrentHashMap[String, Long] + + // Visible for testing + private[history] def isBlacklisted(path: Path): Boolean = { + blacklist.containsKey(path.getName) + } + + private def blacklist(path: Path): Unit = { + blacklist.put(path.getName, clock.getTimeMillis()) + } + + /** + * Removes expired entries in the blacklist, according to the provided `expireTimeInSeconds`. + */ + private def clearBlacklist(expireTimeInSeconds: Long): Unit = { + val expiredThreshold = clock.getTimeMillis() - expireTimeInSeconds * 1000 + blacklist.asScala.retain((_, creationTime) => creationTime >= expiredThreshold) + } + private val activeUIs = new mutable.HashMap[(String, Option[String]), LoadedAppUI]() /** @@ -412,7 +432,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // reading a garbage file is safe, but we would log an error which can be scary to // the end-user. !entry.getPath().getName().startsWith(".") && - SparkHadoopUtil.get.checkAccessPermission(entry, FsAction.READ) + !isBlacklisted(entry.getPath) } .filter { entry => try { @@ -446,32 +466,37 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) logDebug(s"New/updated attempts found: ${updated.size} ${updated.map(_.getPath)}") } - val tasks = updated.map { entry => + val tasks = updated.flatMap { entry => try { - replayExecutor.submit(new Runnable { + val task: Future[Unit] = replayExecutor.submit(new Runnable { override def run(): Unit = mergeApplicationListing(entry, newLastScanTime) - }) + }, Unit) + Some(task -> entry.getPath) } catch { // let the iteration over the updated entries break, since an exception on // replayExecutor.submit (..) indicates the ExecutorService is unable // to take any more submissions at this time case e: Exception => logError(s"Exception while submitting event log for replay", e) - null + None } - }.filter(_ != null) + } pendingReplayTasksCount.addAndGet(tasks.size) // Wait for all tasks to finish. This makes sure that checkForLogs // is not scheduled again while some tasks are already running in // the replayExecutor. - tasks.foreach { task => + tasks.foreach { case (task, path) => try { task.get() } catch { case e: InterruptedException => throw e + case e: ExecutionException if e.getCause.isInstanceOf[AccessControlException] => + // We don't have read permissions on the log file + logWarning(s"Unable to read log $path", e.getCause) + blacklist(path) case e: Exception => logError("Exception while merging application listings", e) } finally { @@ -694,6 +719,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) listing.delete(classOf[LogInfo], log.logPath) } } + // Clean the blacklist from the expired entries. 
+ clearBlacklist(CLEAN_INTERVAL_S) } /** @@ -871,13 +898,17 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) } private def deleteLog(log: Path): Unit = { - try { - fs.delete(log, true) - } catch { - case _: AccessControlException => - logInfo(s"No permission to delete $log, ignoring.") - case ioe: IOException => - logError(s"IOException in cleaning $log", ioe) + if (isBlacklisted(log)) { + logDebug(s"Skipping deleting $log as we don't have permissions on it.") + } else { + try { + fs.delete(log, true) + } catch { + case _: AccessControlException => + logInfo(s"No permission to delete $log, ignoring.") + case ioe: IOException => + logError(s"IOException in cleaning $log", ioe) + } } } diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala deleted file mode 100644 index ab24a76e20a30..0000000000000 --- a/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.deploy - -import java.security.PrivilegedExceptionAction - -import scala.util.Random - -import org.apache.hadoop.fs.FileStatus -import org.apache.hadoop.fs.permission.{FsAction, FsPermission} -import org.apache.hadoop.security.UserGroupInformation -import org.scalatest.Matchers - -import org.apache.spark.SparkFunSuite - -class SparkHadoopUtilSuite extends SparkFunSuite with Matchers { - test("check file permission") { - import FsAction._ - val testUser = s"user-${Random.nextInt(100)}" - val testGroups = Array(s"group-${Random.nextInt(100)}") - val testUgi = UserGroupInformation.createUserForTesting(testUser, testGroups) - - testUgi.doAs(new PrivilegedExceptionAction[Void] { - override def run(): Void = { - val sparkHadoopUtil = new SparkHadoopUtil - - // If file is owned by user and user has access permission - var status = fileStatus(testUser, testGroups.head, READ_WRITE, READ_WRITE, NONE) - sparkHadoopUtil.checkAccessPermission(status, READ) should be(true) - sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(true) - - // If file is owned by user but user has no access permission - status = fileStatus(testUser, testGroups.head, NONE, READ_WRITE, NONE) - sparkHadoopUtil.checkAccessPermission(status, READ) should be(false) - sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(false) - - val otherUser = s"test-${Random.nextInt(100)}" - val otherGroup = s"test-${Random.nextInt(100)}" - - // If file is owned by user's group and user's group has access permission - status = fileStatus(otherUser, testGroups.head, NONE, READ_WRITE, NONE) - sparkHadoopUtil.checkAccessPermission(status, READ) should be(true) - sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(true) - - // If file is owned by user's group but user's group has no access permission - status = fileStatus(otherUser, testGroups.head, READ_WRITE, NONE, NONE) - sparkHadoopUtil.checkAccessPermission(status, READ) should be(false) - sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(false) - - // If file is owned by other user and this user has access permission - status = fileStatus(otherUser, otherGroup, READ_WRITE, READ_WRITE, READ_WRITE) - sparkHadoopUtil.checkAccessPermission(status, READ) should be(true) - sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(true) - - // If file is owned by other user but this user has no access permission - status = fileStatus(otherUser, otherGroup, READ_WRITE, READ_WRITE, NONE) - sparkHadoopUtil.checkAccessPermission(status, READ) should be(false) - sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(false) - - null - } - }) - } - - private def fileStatus( - owner: String, - group: String, - userAction: FsAction, - groupAction: FsAction, - otherAction: FsAction): FileStatus = { - new FileStatus(0L, - false, - 0, - 0L, - 0L, - 0L, - new FsPermission(userAction, groupAction, otherAction), - owner, - group, - null) - } -} diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala index 787de59edf465..bf2b04484a42e 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala @@ -29,9 +29,11 @@ import scala.language.postfixOps import com.google.common.io.{ByteStreams, Files} import org.apache.hadoop.fs.{FileStatus, Path} import 
org.apache.hadoop.hdfs.DistributedFileSystem +import org.apache.hadoop.security.AccessControlException import org.json4s.jackson.JsonMethods._ -import org.mockito.Matchers.any -import org.mockito.Mockito.{doReturn, mock, spy, verify} +import org.mockito.ArgumentMatcher +import org.mockito.Matchers.{any, argThat} +import org.mockito.Mockito.{doReturn, doThrow, mock, spy, verify, when} import org.scalatest.BeforeAndAfter import org.scalatest.Matchers import org.scalatest.concurrent.Eventually._ @@ -774,6 +776,42 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc assert(new File(testDir.toURI).listFiles().size === validLogCount) } + test("SPARK-24948: blacklist files we don't have read permission on") { + val clock = new ManualClock(1533132471) + val provider = new FsHistoryProvider(createTestConf(), clock) + val accessDenied = newLogFile("accessDenied", None, inProgress = false) + writeFile(accessDenied, true, None, + SparkListenerApplicationStart("accessDenied", Some("accessDenied"), 1L, "test", None)) + val accessGranted = newLogFile("accessGranted", None, inProgress = false) + writeFile(accessGranted, true, None, + SparkListenerApplicationStart("accessGranted", Some("accessGranted"), 1L, "test", None), + SparkListenerApplicationEnd(5L)) + val mockedFs = spy(provider.fs) + doThrow(new AccessControlException("Cannot read accessDenied file")).when(mockedFs).open( + argThat(new ArgumentMatcher[Path]() { + override def matches(path: Any): Boolean = { + path.asInstanceOf[Path].getName.toLowerCase == "accessdenied" + } + })) + val mockedProvider = spy(provider) + when(mockedProvider.fs).thenReturn(mockedFs) + updateAndCheck(mockedProvider) { list => + list.size should be(1) + } + writeFile(accessDenied, true, None, + SparkListenerApplicationStart("accessDenied", Some("accessDenied"), 1L, "test", None), + SparkListenerApplicationEnd(5L)) + // Doing 2 times in order to check the blacklist filter too + updateAndCheck(mockedProvider) { list => + list.size should be(1) + } + val accessDeniedPath = new Path(accessDenied.getPath) + assert(mockedProvider.isBlacklisted(accessDeniedPath)) + clock.advance(24 * 60 * 60 * 1000 + 1) // add a bit more than 1d + mockedProvider.cleanLogs() + assert(!mockedProvider.isBlacklisted(accessDeniedPath)) + } + /** * Asks the provider to check for logs and calls a function to perform checks on the updated * app list. Example: From 7d465d8f4ad982fbdcfc0129ff9a4952a384bb17 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Thu, 9 Aug 2018 13:04:03 -0500 Subject: [PATCH 549/566] [MINOR][BUILD] Update Jetty to 9.3.24.v20180605 Update Jetty to 9.3.24.v20180605 to pick up security fix Existing tests. Closes #22055 from srowen/Jetty9324. Authored-by: Sean Owen Signed-off-by: Sean Owen (cherry picked from commit eb9a696dd6f138225708d15bb2383854ed8a6dab) Signed-off-by: Sean Owen --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 76e83630d9a7c..3ff0408e22d61 100644 --- a/pom.xml +++ b/pom.xml @@ -133,7 +133,7 @@ 1.4.4 nohive 1.6.0 - 9.3.20.v20170531 + 9.3.24.v20180605 3.1.0 0.8.4 2.4.0 From 9bfc55b1b0aae269320bb978027a800fd1878149 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 9 Aug 2018 14:38:58 -0700 Subject: [PATCH 550/566] [SPARK-25076][SQL] SQLConf should not be retrieved from a stopped SparkSession ## What changes were proposed in this pull request? 
When a `SparkSession` is stopped, `SQLConf.get` should use the fallback conf to avoid weird issues like ``` sbt.ForkMain$ForkError: java.lang.IllegalStateException: LiveListenerBus is stopped. at org.apache.spark.scheduler.LiveListenerBus.addToQueue(LiveListenerBus.scala:97) at org.apache.spark.scheduler.LiveListenerBus.addToStatusQueue(LiveListenerBus.scala:80) at org.apache.spark.sql.internal.SharedState.(SharedState.scala:93) at org.apache.spark.sql.SparkSession$$anonfun$sharedState$1.apply(SparkSession.scala:120) at org.apache.spark.sql.SparkSession$$anonfun$sharedState$1.apply(SparkSession.scala:120) at scala.Option.getOrElse(Option.scala:121) ... ``` ## How was this patch tested? a new test suite Closes #22056 from cloud-fan/session. Authored-by: Wenchen Fan Signed-off-by: Xiao Li (cherry picked from commit fec67ed7e95483c5ea97a7b263ad4bea7d3d42b5) Signed-off-by: Xiao Li --- .../org/apache/spark/sql/SparkSession.scala | 3 +- .../apache/spark/sql/LocalSparkSession.scala | 9 ++--- .../sql/internal/SQLConfGetterSuite.scala | 33 +++++++++++++++++++ 3 files changed, 37 insertions(+), 8 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfGetterSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index b699ccd08ff40..adc714368a30c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -92,7 +92,8 @@ class SparkSession private( // If there is no active SparkSession, uses the default SQL conf. Otherwise, use the session's. SQLConf.setSQLConfGetter(() => { - SparkSession.getActiveSession.map(_.sessionState.conf).getOrElse(SQLConf.getFallbackConf) + SparkSession.getActiveSession.filterNot(_.sparkContext.isStopped).map(_.sessionState.conf) + .getOrElse(SQLConf.getFallbackConf) }) /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/LocalSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/LocalSparkSession.scala index cbef1c7828319..6b90f20a94fa4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/LocalSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/LocalSparkSession.scala @@ -36,19 +36,14 @@ trait LocalSparkSession extends BeforeAndAfterEach with BeforeAndAfterAll { self override def afterEach() { try { - resetSparkContext() + LocalSparkSession.stop(spark) SparkSession.clearActiveSession() SparkSession.clearDefaultSession() + spark = null } finally { super.afterEach() } } - - def resetSparkContext(): Unit = { - LocalSparkSession.stop(spark) - spark = null - } - } object LocalSparkSession { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfGetterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfGetterSuite.scala new file mode 100644 index 0000000000000..bb79d3a84e5a3 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfGetterSuite.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.internal + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.{LocalSparkSession, SparkSession} + +class SQLConfGetterSuite extends SparkFunSuite with LocalSparkSession { + + test("SPARK-25076: SQLConf should not be retrieved from a stopped SparkSession") { + spark = SparkSession.builder().master("local").getOrCreate() + assert(SQLConf.get eq spark.sessionState.conf, + "SQLConf.get should get the conf from the active spark session.") + spark.stop() + assert(SQLConf.get eq SQLConf.getFallbackConf, + "SQLConf.get should not get conf from a stopped spark session.") + } +} From b426ec583fb5176461c5b0c7112d2194af66d93d Mon Sep 17 00:00:00 2001 From: Chris Martin Date: Sat, 28 Jul 2018 10:40:10 -0500 Subject: [PATCH 551/566] [SPARK-24950][SQL] DateTimeUtilsSuite daysToMillis and millisToDays fails w/java 8 181-b13 ## What changes were proposed in this pull request? - Update DateTimeUtilsSuite so that when testing roundtripping in daysToMillis and millisToDays multiple skipdates can be specified. - Updated test so that both new years eve 2014 and new years day 2015 are skipped for kiribati time zones. This is necessary as java versions pre 181-b13 considered new years day 2015 to be skipped while susequent versions corrected this to new years eve. ## How was this patch tested? Unit tests Author: Chris Martin Closes #21901 from d80tb7/SPARK-24950_datetimeUtilsSuite_failures. (cherry picked from commit c5b8d54c61780af6e9e157e6c855718df972efad) Signed-off-by: Sean Owen --- .../catalyst/util/DateTimeUtilsSuite.scala | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index 625ff38943fa3..b025b85d9f608 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -650,18 +650,18 @@ class DateTimeUtilsSuite extends SparkFunSuite { assert(daysToMillis(16800, TimeZoneGMT) === c.getTimeInMillis) // There are some days are skipped entirely in some timezone, skip them here. 
- val skipped_days = Map[String, Int]( - "Kwajalein" -> 8632, - "Pacific/Apia" -> 15338, - "Pacific/Enderbury" -> 9131, - "Pacific/Fakaofo" -> 15338, - "Pacific/Kiritimati" -> 9131, - "Pacific/Kwajalein" -> 8632, - "MIT" -> 15338) + val skipped_days = Map[String, Set[Int]]( + "Kwajalein" -> Set(8632), + "Pacific/Apia" -> Set(15338), + "Pacific/Enderbury" -> Set(9130, 9131), + "Pacific/Fakaofo" -> Set(15338), + "Pacific/Kiritimati" -> Set(9130, 9131), + "Pacific/Kwajalein" -> Set(8632), + "MIT" -> Set(15338)) for (tz <- DateTimeTestUtils.ALL_TIMEZONES) { - val skipped = skipped_days.getOrElse(tz.getID, Int.MinValue) + val skipped = skipped_days.getOrElse(tz.getID, Set.empty) (-20000 to 20000).foreach { d => - if (d != skipped) { + if (!skipped.contains(d)) { assert(millisToDays(daysToMillis(d, tz), tz) === d, s"Round trip of ${d} did not work in tz ${tz}") } From 6930f4885356eaec2c1e85896be3c93a80ce779c Mon Sep 17 00:00:00 2001 From: Saisai Shao Date: Fri, 10 Aug 2018 02:06:28 +0000 Subject: [PATCH 552/566] Preparing Spark release v2.3.2-rc4 --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 6ec496620c8bd..8df26351614ec 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.3 +Version: 2.3.2 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. 
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index f8b15cc3632ab..57485fc003097 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index e412a4727b385..53e58c2f5d374 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index d8f9a3d2015c1..d05647c601807 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index a1a4f87987119..8d46761151abe 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index e650978f1a2e5..11c290d64590a 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 350e3cb8fef99..786e959a48cce 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index e7fea4125cf69..78d5a5eaf2335 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 601cc5d998b59..11259d923900e 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 2a7e6442f2f6f..276cf10aa205a 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index 7629f5f8c5818..bca466d19fa36 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.3-SNAPSHOT -SPARK_VERSION_SHORT: 2.3.3 +SPARK_VERSION: 2.3.2 +SPARK_VERSION_SHORT: 2.3.2 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 7ecec708ac6c1..7324f3a780e49 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 30816bba29627..a7a743129e2c4 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 4a2cb1361db27..5b8661c350554 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 5199aba2f5f8a..5b924e4d73c5c 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 3f8098887b64f..70e0c423faa3e 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index f9874cd223938..3c99f4821d4f9 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 499f1bc4c0907..65b8a3924b172 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index c4caff853ca1a..b0e857ba7534e 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index a09d23d917379..6f299a2d7b6c7 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 6660c73a677c8..de2023837594a 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index f08fe460f9d9b..5ee942480a62a 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 3fb0def7d45e5..b8a15c5b10dbd 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 4a0aa06e621c3..535104b32328c 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 5de97fe7b7824..01d140b3c7de8 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index d1f40132f2c6f..33f18dbe59a1a 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 96f4081cc5f94..77970b2f18303 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 43ef77facca8c..be5d767e4c84a 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 01ebc1ae90aa0..a4b6f95c9254b 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/pom.xml b/pom.xml index 3ff0408e22d61..957ff9fe80bee 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index ca37f0e98320d..94a128ecb10af 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.3.dev0" +__version__ = "2.3.2" diff --git a/repl/pom.xml b/repl/pom.xml index 9c16fe4fb546d..a64270a800e4d 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index d7fb35f4559df..8b6232e6772dc 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 84ce020dff77b..b68f817b44f69 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index b09d0031f4cf9..2972d61b6b2d4 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 2a86f6f1d42d3..be9d087b6393f 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 64aeb7ef10d92..2638b56da1f8b 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index a9fccdd327880..43cfd529ebdd1 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 5faa52541da3b..b26f5bb942824 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index f73d618b6e41c..77e6435fe9f44 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 8d98ba7161979..24d4e1d47b744 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml From e66f3f9b11f37261ec6cbcb6bb2ebeb34e56a968 Mon Sep 17 00:00:00 2001 From: Saisai Shao Date: Fri, 10 Aug 2018 02:06:37 +0000 Subject: [PATCH 553/566] Preparing development version 2.3.3-SNAPSHOT --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- 
external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 8df26351614ec..6ec496620c8bd 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.2 +Version: 2.3.3 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index 57485fc003097..f8b15cc3632ab 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 53e58c2f5d374..e412a4727b385 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index d05647c601807..d8f9a3d2015c1 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 8d46761151abe..a1a4f87987119 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 11c290d64590a..e650978f1a2e5 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 786e959a48cce..350e3cb8fef99 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 78d5a5eaf2335..e7fea4125cf69 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 11259d923900e..601cc5d998b59 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 276cf10aa205a..2a7e6442f2f6f 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index bca466d19fa36..7629f5f8c5818 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.2 -SPARK_VERSION_SHORT: 2.3.2 +SPARK_VERSION: 2.3.3-SNAPSHOT +SPARK_VERSION_SHORT: 2.3.3 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 7324f3a780e49..7ecec708ac6c1 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index a7a743129e2c4..30816bba29627 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 5b8661c350554..4a2cb1361db27 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 5b924e4d73c5c..5199aba2f5f8a 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 70e0c423faa3e..3f8098887b64f 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 3c99f4821d4f9..f9874cd223938 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 65b8a3924b172..499f1bc4c0907 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index b0e857ba7534e..c4caff853ca1a 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 6f299a2d7b6c7..a09d23d917379 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index de2023837594a..6660c73a677c8 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 5ee942480a62a..f08fe460f9d9b 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index b8a15c5b10dbd..3fb0def7d45e5 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 535104b32328c..4a0aa06e621c3 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 01d140b3c7de8..5de97fe7b7824 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 33f18dbe59a1a..d1f40132f2c6f 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 77970b2f18303..96f4081cc5f94 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index be5d767e4c84a..43ef77facca8c 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index a4b6f95c9254b..01ebc1ae90aa0 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 957ff9fe80bee..3ff0408e22d61 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 94a128ecb10af..ca37f0e98320d 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.2" +__version__ = "2.3.3.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index a64270a800e4d..9c16fe4fb546d 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 8b6232e6772dc..d7fb35f4559df 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index b68f817b44f69..84ce020dff77b 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 2972d61b6b2d4..b09d0031f4cf9 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index be9d087b6393f..2a86f6f1d42d3 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 2638b56da1f8b..64aeb7ef10d92 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 43cfd529ebdd1..a9fccdd327880 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index b26f5bb942824..5faa52541da3b 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 77e6435fe9f44..f73d618b6e41c 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 24d4e1d47b744..8d98ba7161979 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml From 7306ac71da0e31fa9655c5838dc7fcb6e4c0b7a0 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Fri, 10 Aug 2018 11:15:36 -0500 Subject: [PATCH 554/566] [MINOR][BUILD] Add ECCN notice required by http://www.apache.org/dev/crypto.html Add ECCN notice required by http://www.apache.org/dev/crypto.html See https://issues.apache.org/jira/browse/LEGAL-398 This should probably be backported to 2.3, 2.2, as that's when the key dep (commons crypto) turned up. BC is actually unused, but still there. N/A Closes #22064 from srowen/ECCN. Authored-by: Sean Owen Signed-off-by: Sean Owen (cherry picked from commit 91cdab51ccb3a4e3b6d76132d00f3da30598735b) Signed-off-by: Sean Owen --- NOTICE | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/NOTICE b/NOTICE index 6ec240efbf12e..876d6068c6c7d 100644 --- a/NOTICE +++ b/NOTICE @@ -5,6 +5,31 @@ This product includes software developed at The Apache Software Foundation (http://www.apache.org/). 
+Export Control Notice +--------------------- + +This distribution includes cryptographic software. The country in which you currently reside may have +restrictions on the import, possession, use, and/or re-export to another country, of encryption software. +BEFORE using any encryption software, please check your country's laws, regulations and policies concerning +the import, possession, or use, and re-export of encryption software, to see if this is permitted. See + for more information. + +The U.S. Government Department of Commerce, Bureau of Industry and Security (BIS), has classified this +software as Export Commodity Control Number (ECCN) 5D002.C.1, which includes information security software +using or performing cryptographic functions with asymmetric algorithms. The form and manner of this Apache +Software Foundation distribution makes it eligible for export under the License Exception ENC Technology +Software Unrestricted (TSU) exception (see the BIS Export Administration Regulations, Section 740.13) for +both object code and source code. + +The following provides more details on the included cryptographic software: + +This software uses Apache Commons Crypto (https://commons.apache.org/proper/commons-crypto/) to +support authentication, and encryption and decryption of data sent across the network between +services. + +This software includes Bouncy Castle (http://bouncycastle.org/) to support the jets3t library. + + ======================================================================== Common Development and Distribution License 1.0 ======================================================================== From 04c652064861720d991675b7f5b53f2bbca9d14d Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Fri, 10 Aug 2018 10:53:44 -0700 Subject: [PATCH 555/566] [SPARK-25081][CORE] Nested spill in ShuffleExternalSorter should not access released memory page ## What changes were proposed in this pull request? This issue is pretty similar to [SPARK-21907](https://issues.apache.org/jira/browse/SPARK-21907). "allocateArray" in [ShuffleInMemorySorter.reset](https://github.com/apache/spark/blob/9b8521e53e56a53b44c02366a99f8a8ee1307bbf/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorter.java#L99) may trigger a spill and cause ShuffleInMemorySorter to access the released `array`. Another task may get the same memory page from the pool. This will cause two tasks to access the same memory page. When a task reads memory written by another task, many types of failures may happen. Here are some examples I have seen: - JVM crash. (This is easy to reproduce in a unit test as we fill newly allocated and deallocated memory with 0xa5 and 0x5a bytes which usually points to an invalid memory address) - java.lang.IllegalArgumentException: Comparison method violates its general contract! - java.lang.NullPointerException at org.apache.spark.memory.TaskMemoryManager.getPage(TaskMemoryManager.java:384) - java.lang.UnsupportedOperationException: Cannot grow BufferHolder by size -536870912 because the size after growing exceeds size limitation 2147483632 This PR resets states in `ShuffleInMemorySorter.reset` before calling `allocateArray` to fix the issue. ## How was this patch tested? The new unit test will make JVM crash without the fix. Closes #22062 from zsxwing/SPARK-25081.
Authored-by: Shixiong Zhu Signed-off-by: Shixiong Zhu (cherry picked from commit f5aba657396bd4e2e03dd06491a2d169a99592a7) Signed-off-by: Shixiong Zhu --- .../shuffle/sort/ShuffleInMemorySorter.java | 12 +- .../sort/ShuffleExternalSorterSuite.scala | 111 ++++++++++++++++++ 2 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/shuffle/sort/ShuffleExternalSorterSuite.scala diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorter.java b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorter.java index dc36809d8911f..0d069125dc60e 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorter.java @@ -66,7 +66,7 @@ public int compare(PackedRecordPointer left, PackedRecordPointer right) { */ private int usableCapacity = 0; - private int initialSize; + private final int initialSize; ShuffleInMemorySorter(MemoryConsumer consumer, int initialSize, boolean useRadixSort) { this.consumer = consumer; @@ -95,12 +95,20 @@ public int numRecords() { } public void reset() { + // Reset `pos` here so that `spill` triggered by the below `allocateArray` will be no-op. + pos = 0; if (consumer != null) { consumer.freeArray(array); + // As `array` has been released, we should set it to `null` to avoid accessing it before + // `allocateArray` returns. `usableCapacity` is also set to `0` to avoid any codes writing + // data to `ShuffleInMemorySorter` when `array` is `null` (e.g., in + // ShuffleExternalSorter.growPointerArrayIfNecessary, we may try to access + // `ShuffleInMemorySorter` when `allocateArray` throws SparkOutOfMemoryError). + array = null; + usableCapacity = 0; array = consumer.allocateArray(initialSize); usableCapacity = getUsableCapacity(); } - pos = 0; } public void expandPointerArray(LongArray newArray) { diff --git a/core/src/test/scala/org/apache/spark/shuffle/sort/ShuffleExternalSorterSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/sort/ShuffleExternalSorterSuite.scala new file mode 100644 index 0000000000000..b9f0e873375b0 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/shuffle/sort/ShuffleExternalSorterSuite.scala @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.shuffle.sort + +import java.lang.{Long => JLong} + +import org.mockito.Mockito.when +import org.scalatest.mockito.MockitoSugar + +import org.apache.spark._ +import org.apache.spark.executor.{ShuffleWriteMetrics, TaskMetrics} +import org.apache.spark.memory._ +import org.apache.spark.unsafe.Platform + +class ShuffleExternalSorterSuite extends SparkFunSuite with LocalSparkContext with MockitoSugar { + + test("nested spill should be no-op") { + val conf = new SparkConf() + .setMaster("local[1]") + .setAppName("ShuffleExternalSorterSuite") + .set("spark.testing", "true") + .set("spark.testing.memory", "1600") + .set("spark.memory.fraction", "1") + sc = new SparkContext(conf) + + val memoryManager = UnifiedMemoryManager(conf, 1) + + var shouldAllocate = false + + // Mock `TaskMemoryManager` to allocate free memory when `shouldAllocate` is true. + // This will trigger a nested spill and expose issues if we don't handle this case properly. + val taskMemoryManager = new TaskMemoryManager(memoryManager, 0) { + override def acquireExecutionMemory(required: Long, consumer: MemoryConsumer): Long = { + // ExecutionMemoryPool.acquireMemory will wait until there are 400 bytes for a task to use. + // So we leave 400 bytes for the task. + if (shouldAllocate && + memoryManager.maxHeapMemory - memoryManager.executionMemoryUsed > 400) { + val acquireExecutionMemoryMethod = + memoryManager.getClass.getMethods.filter(_.getName == "acquireExecutionMemory").head + acquireExecutionMemoryMethod.invoke( + memoryManager, + JLong.valueOf( + memoryManager.maxHeapMemory - memoryManager.executionMemoryUsed - 400), + JLong.valueOf(1L), // taskAttemptId + MemoryMode.ON_HEAP + ).asInstanceOf[java.lang.Long] + } + super.acquireExecutionMemory(required, consumer) + } + } + val taskContext = mock[TaskContext] + val taskMetrics = new TaskMetrics + when(taskContext.taskMetrics()).thenReturn(taskMetrics) + val sorter = new ShuffleExternalSorter( + taskMemoryManager, + sc.env.blockManager, + taskContext, + 100, // initialSize - This will require ShuffleInMemorySorter to acquire at least 800 bytes + 1, // numPartitions + conf, + new ShuffleWriteMetrics) + val inMemSorter = { + val field = sorter.getClass.getDeclaredField("inMemSorter") + field.setAccessible(true) + field.get(sorter).asInstanceOf[ShuffleInMemorySorter] + } + // Allocate memory to make the next "insertRecord" call triggers a spill. + val bytes = new Array[Byte](1) + while (inMemSorter.hasSpaceForAnotherRecord) { + sorter.insertRecord(bytes, Platform.BYTE_ARRAY_OFFSET, 1, 0) + } + + // This flag will make the mocked TaskMemoryManager acquire free memory released by spill to + // trigger a nested spill. + shouldAllocate = true + + // Should throw `SparkOutOfMemoryError` as there is no enough memory: `ShuffleInMemorySorter` + // will try to acquire 800 bytes but there are only 400 bytes available. + // + // Before the fix, a nested spill may use a released page and this causes two tasks access the + // same memory page. When a task reads memory written by another task, many types of failures + // may happen. Here are some examples we have seen: + // + // - JVM crash. (This is easy to reproduce in the unit test as we fill newly allocated and + // deallocated memory with 0xa5 and 0x5a bytes which usually points to an invalid memory + // address) + // - java.lang.IllegalArgumentException: Comparison method violates its general contract! 
+ // - java.lang.NullPointerException + // at org.apache.spark.memory.TaskMemoryManager.getPage(TaskMemoryManager.java:384) + // - java.lang.UnsupportedOperationException: Cannot grow BufferHolder by size -536870912 + // because the size after growing exceeds size limitation 2147483632 + intercept[SparkOutOfMemoryError] { + sorter.insertRecord(bytes, Platform.BYTE_ARRAY_OFFSET, 1, 0) + } + } +} From a0a7e41cfbcebf1eb0228b4acfdb0381c8eeb79f Mon Sep 17 00:00:00 2001 From: shane knapp Date: Tue, 24 Jul 2018 16:13:57 -0700 Subject: [PATCH 556/566] [SPARK-24908][R][STYLE] removing spaces to make lintr happy ## What changes were proposed in this pull request? during my travails in porting spark builds to run on our centos worker, i managed to recreate (as best i could) the centos environment on our new ubuntu-testing machine. while running my initial builds, lintr was crashing on some extraneous spaces in test_basic.R (see: https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.6-ubuntu-test/862/console) after removing those spaces, the ubuntu build happily passed the lintr tests. ## How was this patch tested? i then tested this against a modified spark-master-test-sbt-hadoop-2.6 build (see https://amplab.cs.berkeley.edu/jenkins/view/RISELab%20Infra/job/testing-spark-master-test-with-updated-R-crap/4/), which scp'ed a copy of test_basic.R in to the repo after the git clone. everything seems to be working happily. Author: shane knapp Closes #21864 from shaneknapp/fixing-R-lint-spacing. (cherry picked from commit 3efdf35327be38115b04b08e9c8d0aa282a904ab) Signed-off-by: Sean Owen --- R/pkg/inst/tests/testthat/test_basic.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_basic.R b/R/pkg/inst/tests/testthat/test_basic.R index 243f5f0298284..80df3d8ce6e59 100644 --- a/R/pkg/inst/tests/testthat/test_basic.R +++ b/R/pkg/inst/tests/testthat/test_basic.R @@ -18,9 +18,9 @@ context("basic tests for CRAN") test_that("create DataFrame from list or data.frame", { - tryCatch( checkJavaVersion(), + tryCatch(checkJavaVersion(), error = function(e) { skip("error on Java check") }, - warning = function(e) { skip("warning on Java check") } ) + warning = function(e) { skip("warning on Java check") }) sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE, sparkConfig = sparkRTestConfig) @@ -54,9 +54,9 @@ test_that("create DataFrame from list or data.frame", { }) test_that("spark.glm and predict", { - tryCatch( checkJavaVersion(), + tryCatch(checkJavaVersion(), error = function(e) { skip("error on Java check") }, - warning = function(e) { skip("warning on Java check") } ) + warning = function(e) { skip("warning on Java check") }) sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE, sparkConfig = sparkRTestConfig) From b9b35b9598dde5d8b2e47b40a0ca310527f31763 Mon Sep 17 00:00:00 2001 From: LantaoJin Date: Mon, 13 Aug 2018 08:51:12 +0800 Subject: [PATCH 557/566] [SPARK-25084][SQL][BACKPORT-2.3] distribute by" on multiple columns (wrap in brackets) may lead to codegen issue ## What changes were proposed in this pull request? Backport #22066 to branch-2.3 Use different API in 2.3 here ```scala |${ctx.JAVA_INT} $childResult = 0; ``` "distribute by" on multiple columns (wrap in brackets) may lead to codegen issue. 
Simple way to reproduce: ```scala val df = spark.range(1000) val columns = (0 until 400).map{ i => s"id as id$i" } val distributeExprs = (0 until 100).map(c => s"id$c").mkString(",") df.selectExpr(columns : _*).createTempView("test") spark.sql(s"select * from test distribute by ($distributeExprs)").count() ``` ## How was this patch tested? UT in Jenkins Closes #22077 from LantaoJin/SPARK-25084_2.3. Authored-by: LantaoJin Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/expressions/hash.scala | 23 ++++++++++++++----- .../org/apache/spark/sql/SQLQuerySuite.scala | 12 ++++++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala index 055ebf6c0da54..5aa0325678d7f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala @@ -404,14 +404,15 @@ abstract class HashExpression[E] extends Expression { input: String, result: String, fields: Array[StructField]): String = { + val tmpInput = ctx.freshName("input") val fieldsHash = fields.zipWithIndex.map { case (field, index) => - nullSafeElementHash(input, index.toString, field.nullable, field.dataType, result, ctx) + nullSafeElementHash(tmpInput, index.toString, field.nullable, field.dataType, result, ctx) } val hashResultType = ctx.javaType(dataType) - ctx.splitExpressions( + val code = ctx.splitExpressions( expressions = fieldsHash, funcName = "computeHashForStruct", - arguments = Seq("InternalRow" -> input, hashResultType -> result), + arguments = Seq("InternalRow" -> tmpInput, hashResultType -> result), returnType = hashResultType, makeSplitFunction = body => s""" @@ -419,6 +420,10 @@ abstract class HashExpression[E] extends Expression { |return $result; """.stripMargin, foldFunctions = _.map(funcCall => s"$result = $funcCall;").mkString("\n")) + s""" + |final InternalRow $tmpInput = $input; + |$code + """.stripMargin } @tailrec @@ -769,10 +774,11 @@ case class HiveHash(children: Seq[Expression]) extends HashExpression[Int] { input: String, result: String, fields: Array[StructField]): String = { + val tmpInput = ctx.freshName("input") val childResult = ctx.freshName("childResult") val fieldsHash = fields.zipWithIndex.map { case (field, index) => val computeFieldHash = nullSafeElementHash( - input, index.toString, field.nullable, field.dataType, childResult, ctx) + tmpInput, index.toString, field.nullable, field.dataType, childResult, ctx) s""" |$childResult = 0; |$computeFieldHash @@ -780,10 +786,10 @@ case class HiveHash(children: Seq[Expression]) extends HashExpression[Int] { """.stripMargin } - s"${ctx.JAVA_INT} $childResult = 0;\n" + ctx.splitExpressions( + val code = ctx.splitExpressions( expressions = fieldsHash, funcName = "computeHashForStruct", - arguments = Seq("InternalRow" -> input, ctx.JAVA_INT -> result), + arguments = Seq("InternalRow" -> tmpInput, ctx.JAVA_INT -> result), returnType = ctx.JAVA_INT, makeSplitFunction = body => s""" @@ -792,6 +798,11 @@ case class HiveHash(children: Seq[Expression]) extends HashExpression[Int] { |return $result; """.stripMargin, foldFunctions = _.map(funcCall => s"$result = $funcCall;").mkString("\n")) + s""" + |final InternalRow $tmpInput = $input; + |${ctx.JAVA_INT} $childResult = 0; + |$code + """.stripMargin } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 5aabc98b714cb..80113485c79e7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2806,4 +2806,16 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { checkAnswer(df, Seq(Row(3, 99, 1))) } } + + test("SPARK-25084: 'distribute by' on multiple columns may lead to codegen issue") { + withView("spark_25084") { + val count = 1000 + val df = spark.range(count) + val columns = (0 until 400).map{ i => s"id as id$i" } + val distributeExprs = (0 until 100).map(c => s"id$c").mkString(",") + df.selectExpr(columns : _*).createTempView("spark_25084") + assert( + spark.sql(s"select * from spark_25084 distribute by ($distributeExprs)").count === count) + } + } } From 787790b3c733085b8b5e95cf832dedd481ab3b9a Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 14 Aug 2018 00:59:18 +0800 Subject: [PATCH 558/566] [SPARK-25028][SQL] Avoid NPE when analyzing partition with NULL values ## What changes were proposed in this pull request? `ANALYZE TABLE ... PARTITION(...) COMPUTE STATISTICS` can fail with a NPE if a partition column contains a NULL value. The PR avoids the NPE, replacing the `NULL` values with the default partition placeholder. ## How was this patch tested? added UT Closes #22036 from mgaido91/SPARK-25028. Authored-by: Marco Gaido Signed-off-by: Wenchen Fan (cherry picked from commit c220cc42abebbc98a6110b50f787eb6d338c2d97) Signed-off-by: Wenchen Fan --- .../command/AnalyzePartitionCommand.scala | 10 ++++++++-- .../spark/sql/StatisticsCollectionSuite.scala | 18 ++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzePartitionCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzePartitionCommand.scala index 5b54b2270b5ec..18fefa0a6f19f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzePartitionCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzePartitionCommand.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.command import org.apache.spark.sql.{AnalysisException, Column, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, UnresolvedAttribute} -import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType} +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, ExternalCatalogUtils} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{And, EqualTo, Literal} import org.apache.spark.sql.execution.datasources.PartitioningUtils @@ -140,7 +140,13 @@ case class AnalyzePartitionCommand( val df = tableDf.filter(Column(filter)).groupBy(partitionColumns: _*).count() df.collect().map { r => - val partitionColumnValues = partitionColumns.indices.map(r.get(_).toString) + val partitionColumnValues = partitionColumns.indices.map { i => + if (r.isNullAt(i)) { + ExternalCatalogUtils.DEFAULT_PARTITION_NAME + } else { + r.get(i).toString + } + } val spec = tableMeta.partitionColumnNames.zip(partitionColumnValues).toMap val count = BigInt(r.getLong(partitionColumns.size)) (spec, count) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala index b11e798532056..0e7209a78b25e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala @@ -198,6 +198,24 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared } } + test("SPARK-25028: column stats collection for null partitioning columns") { + val table = "analyze_partition_with_null" + withTempDir { dir => + withTable(table) { + sql(s""" + |CREATE TABLE $table (value string, name string) + |USING PARQUET + |PARTITIONED BY (name) + |LOCATION '${dir.toURI}'""".stripMargin) + val df = Seq(("a", null), ("b", null)).toDF("value", "name") + df.write.mode("overwrite").insertInto(table) + sql(s"ANALYZE TABLE $table PARTITION (name) COMPUTE STATISTICS") + val partitions = spark.sessionState.catalog.listPartitions(TableIdentifier(table)) + assert(partitions.head.stats.get.rowCount.get == 2) + } + } + } + test("number format in statistics") { val numbers = Seq( BigInt(0) -> (("0.0 B", "0")), From 4dc82259d81102e0cb48f4cb2e8075f80d899ac4 Mon Sep 17 00:00:00 2001 From: Saisai Shao Date: Tue, 14 Aug 2018 02:55:09 +0000 Subject: [PATCH 559/566] Preparing Spark release v2.3.2-rc5 --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 6ec496620c8bd..8df26351614ec 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.3 +Version: 2.3.2 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. 
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index f8b15cc3632ab..57485fc003097 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index e412a4727b385..53e58c2f5d374 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index d8f9a3d2015c1..d05647c601807 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index a1a4f87987119..8d46761151abe 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index e650978f1a2e5..11c290d64590a 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 350e3cb8fef99..786e959a48cce 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index e7fea4125cf69..78d5a5eaf2335 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 601cc5d998b59..11259d923900e 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 2a7e6442f2f6f..276cf10aa205a 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index 7629f5f8c5818..bca466d19fa36 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.3-SNAPSHOT -SPARK_VERSION_SHORT: 2.3.3 +SPARK_VERSION: 2.3.2 +SPARK_VERSION_SHORT: 2.3.2 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 7ecec708ac6c1..7324f3a780e49 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 30816bba29627..a7a743129e2c4 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 4a2cb1361db27..5b8661c350554 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 5199aba2f5f8a..5b924e4d73c5c 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 3f8098887b64f..70e0c423faa3e 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index f9874cd223938..3c99f4821d4f9 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 499f1bc4c0907..65b8a3924b172 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index c4caff853ca1a..b0e857ba7534e 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index a09d23d917379..6f299a2d7b6c7 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 6660c73a677c8..de2023837594a 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index f08fe460f9d9b..5ee942480a62a 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 3fb0def7d45e5..b8a15c5b10dbd 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 4a0aa06e621c3..535104b32328c 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 5de97fe7b7824..01d140b3c7de8 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index d1f40132f2c6f..33f18dbe59a1a 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 96f4081cc5f94..77970b2f18303 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 43ef77facca8c..be5d767e4c84a 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 01ebc1ae90aa0..a4b6f95c9254b 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/pom.xml b/pom.xml index 3ff0408e22d61..957ff9fe80bee 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index ca37f0e98320d..94a128ecb10af 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.3.dev0" +__version__ = "2.3.2" diff --git a/repl/pom.xml b/repl/pom.xml index 9c16fe4fb546d..a64270a800e4d 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index d7fb35f4559df..8b6232e6772dc 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 84ce020dff77b..b68f817b44f69 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index b09d0031f4cf9..2972d61b6b2d4 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 2a86f6f1d42d3..be9d087b6393f 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 64aeb7ef10d92..2638b56da1f8b 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index a9fccdd327880..43cfd529ebdd1 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 5faa52541da3b..b26f5bb942824 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index f73d618b6e41c..77e6435fe9f44 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 8d98ba7161979..24d4e1d47b744 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.3-SNAPSHOT + 2.3.2 ../pom.xml From 29a040361c4de5c6438c909ded9959ccd53e1a7c Mon Sep 17 00:00:00 2001 From: Saisai Shao Date: Tue, 14 Aug 2018 02:55:19 +0000 Subject: [PATCH 560/566] Preparing development version 2.3.3-SNAPSHOT --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- 
external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 41 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 8df26351614ec..6ec496620c8bd 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.2 +Version: 2.3.3 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index 57485fc003097..f8b15cc3632ab 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 53e58c2f5d374..e412a4727b385 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index d05647c601807..d8f9a3d2015c1 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 8d46761151abe..a1a4f87987119 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 11c290d64590a..e650978f1a2e5 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 786e959a48cce..350e3cb8fef99 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 78d5a5eaf2335..e7fea4125cf69 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 11259d923900e..601cc5d998b59 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 276cf10aa205a..2a7e6442f2f6f 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index bca466d19fa36..7629f5f8c5818 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.2 -SPARK_VERSION_SHORT: 2.3.2 +SPARK_VERSION: 2.3.3-SNAPSHOT +SPARK_VERSION_SHORT: 2.3.3 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 7324f3a780e49..7ecec708ac6c1 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index a7a743129e2c4..30816bba29627 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 5b8661c350554..4a2cb1361db27 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 5b924e4d73c5c..5199aba2f5f8a 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 70e0c423faa3e..3f8098887b64f 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index 3c99f4821d4f9..f9874cd223938 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 65b8a3924b172..499f1bc4c0907 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index b0e857ba7534e..c4caff853ca1a 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 6f299a2d7b6c7..a09d23d917379 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index de2023837594a..6660c73a677c8 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 5ee942480a62a..f08fe460f9d9b 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index b8a15c5b10dbd..3fb0def7d45e5 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 535104b32328c..4a0aa06e621c3 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 01d140b3c7de8..5de97fe7b7824 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 33f18dbe59a1a..d1f40132f2c6f 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 77970b2f18303..96f4081cc5f94 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index be5d767e4c84a..43ef77facca8c 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index a4b6f95c9254b..01ebc1ae90aa0 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 957ff9fe80bee..3ff0408e22d61 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 94a128ecb10af..ca37f0e98320d 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.2" +__version__ = "2.3.3.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index a64270a800e4d..9c16fe4fb546d 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 8b6232e6772dc..d7fb35f4559df 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index b68f817b44f69..84ce020dff77b 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 2972d61b6b2d4..b09d0031f4cf9 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index be9d087b6393f..2a86f6f1d42d3 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 2638b56da1f8b..64aeb7ef10d92 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 43cfd529ebdd1..a9fccdd327880 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index b26f5bb942824..5faa52541da3b 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 77e6435fe9f44..f73d618b6e41c 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 24d4e1d47b744..8d98ba7161979 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.2 + 2.3.3-SNAPSHOT ../pom.xml From 0856b82b37ab4cda4c34155a6d0aa40391d6fc72 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 14 Aug 2018 19:59:39 +0800 Subject: [PATCH 561/566] [MINOR][SQL][DOC] Fix `to_json` example in function description and doc ## What changes were proposed in this pull request? This PR fixes the an example for `to_json` in doc and function description. - http://spark.apache.org/docs/2.3.0/api/sql/#to_json - `describe function extended` ## How was this patch tested? Pass the Jenkins with the updated test. Closes #22096 from dongjoon-hyun/minor_json. 
Authored-by: Dongjoon Hyun Signed-off-by: hyukjinkwon (cherry picked from commit e2ab7deae76d3b6f41b9ad4d0ece14ea28db40ce) Signed-off-by: hyukjinkwon --- .../apache/spark/sql/catalyst/expressions/jsonExpressions.scala | 2 +- .../src/test/resources/sql-tests/results/json-functions.sql.out | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 34161f0f03f4a..e4766e4b02c62 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -628,7 +628,7 @@ case class JsonToStructs( {"a":1,"b":2} > SELECT _FUNC_(named_struct('time', to_timestamp('2015-08-26', 'yyyy-MM-dd')), map('timestampFormat', 'dd/MM/yyyy')); {"time":"26/08/2015"} - > SELECT _FUNC_(array(named_struct('a', 1, 'b', 2)); + > SELECT _FUNC_(array(named_struct('a', 1, 'b', 2))); [{"a":1,"b":2}] > SELECT _FUNC_(map('a', named_struct('b', 1))); {"a":{"b":1}} diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out index 14a69128ffb41..1a1d5a3efe296 100644 --- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out @@ -24,7 +24,7 @@ Extended Usage: {"a":1,"b":2} > SELECT to_json(named_struct('time', to_timestamp('2015-08-26', 'yyyy-MM-dd')), map('timestampFormat', 'dd/MM/yyyy')); {"time":"26/08/2015"} - > SELECT to_json(array(named_struct('a', 1, 'b', 2)); + > SELECT to_json(array(named_struct('a', 1, 'b', 2))); [{"a":1,"b":2}] > SELECT to_json(map('a', named_struct('b', 1))); {"a":{"b":1}} From 34191e66360b987d5415397bd0b8a0c4c1c3dfef Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 14 Aug 2018 10:25:29 -0700 Subject: [PATCH 562/566] [SPARK-25051][SQL] FixNullability should not stop on AnalysisBarrier ## What changes were proposed in this pull request? The introduction of `AnalysisBarrier` prevented `FixNullability` to go through all the nodes. This introduced a bug, which can lead to wrong results, as the nullability of the output attributes of an outer join can be wrong. The PR makes `FixNullability` going through the `AnalysisBarrier`s. ## How was this patch tested? added UT Author: Marco Gaido Closes #22102 from mgaido91/SPARK-25051. --- .../org/apache/spark/sql/catalyst/analysis/Analyzer.scala | 1 + .../test/scala/org/apache/spark/sql/DataFrameSuite.scala | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 5963c1467e254..531f3d468185d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1704,6 +1704,7 @@ class Analyzer( def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case p if !p.resolved => p // Skip unresolved nodes. 
+ case ab: AnalysisBarrier => apply(ab.child) case p: LogicalPlan if p.resolved => val childrenOutput = p.children.flatMap(c => c.output).groupBy(_.exprId).flatMap { case (exprId, attributes) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 9f8d337ead09d..3640f6a4d0c62 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -2300,4 +2300,10 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { checkAnswer(aggPlusFilter1, aggPlusFilter2.collect()) } } + + test("SPARK-25051: fix nullabilities of outer join attributes doesn't stop on AnalysisBarrier") { + val df1 = spark.range(4).selectExpr("id", "cast(id as string) as name") + val df2 = spark.range(3).selectExpr("id") + assert(df1.join(df2, Seq("id"), "left_outer").where(df2("id").isNull).collect().length == 1) + } } From 032f6d9badb4b10c6ec6591b3f82a1ca8d9d226a Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Sat, 18 Aug 2018 17:20:34 +0800 Subject: [PATCH 563/566] [MINOR][DOC][SQL] use one line for annotation arg value ## What changes were proposed in this pull request? Put annotation args in one line, or API doc generation will fail. ~~~ [error] /Users/meng/src/spark/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:1559: annotation argument needs to be a constant; found: "_FUNC_(expr) - Returns the character length of string data or number of bytes of ".+("binary data. The length of string data includes the trailing spaces. The length of binary ").+("data includes binary zeros.") [error] "binary data. The length of string data includes the trailing spaces. The length of binary " + [error] ^ [info] No documentation generated with unsuccessful compiler run [error] one error found [error] (catalyst/compile:doc) Scaladoc generation failed [error] Total time: 27 s, completed Aug 17, 2018 3:20:08 PM ~~~ ## How was this patch tested? sbt catalyst/compile:doc passed Closes #22137 from mengxr/minor-doc-fix. Authored-by: Xiangrui Meng Signed-off-by: hyukjinkwon (cherry picked from commit f454d5287f3f90696c8068c424e333a71e1e7b1b) Signed-off-by: hyukjinkwon --- .../spark/sql/catalyst/expressions/stringExpressions.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index d7612e30b4c57..791dd7156d7c8 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1653,10 +1653,9 @@ case class Left(str: Expression, len: Expression, child: Expression) extends Run * A function that returns the char length of the given string expression or * number of bytes of the given binary expression. */ +// scalastyle:off line.size.limit @ExpressionDescription( - usage = "_FUNC_(expr) - Returns the character length of string data or number of bytes of " + - "binary data. The length of string data includes the trailing spaces. The length of binary " + - "data includes binary zeros.", + usage = "_FUNC_(expr) - Returns the character length of string data or number of bytes of binary data. The length of string data includes the trailing spaces. 
The length of binary data includes binary zeros.", examples = """ Examples: > SELECT _FUNC_('Spark SQL '); @@ -1666,6 +1665,7 @@ case class Left(str: Expression, len: Expression, child: Expression) extends Run > SELECT CHARACTER_LENGTH('Spark SQL '); 10 """) +// scalastyle:on line.size.limit case class Length(child: Expression) extends UnaryExpression with ImplicitCastInputTypes { override def dataType: DataType = IntegerType override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType)) From ea01e362f7427c6f16445db95982923f19c07171 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Mon, 20 Aug 2018 20:44:22 +0800 Subject: [PATCH 564/566] [SPARK-25144][SQL][TEST][BRANCH-2.3] Free aggregate map when task ends ## What changes were proposed in this pull request? [SPARK-25144](https://issues.apache.org/jira/browse/SPARK-25144) reports memory leaks on Apache Spark 2.0.2 ~ 2.3.2-RC5. ```scala scala> case class Foo(bar: Option[String]) scala> val ds = List(Foo(Some("bar"))).toDS scala> val result = ds.flatMap(_.bar).distinct scala> result.rdd.isEmpty 18/08/19 23:01:54 WARN Executor: Managed memory leak detected; size = 8650752 bytes, TID = 125 res0: Boolean = false ``` This is a backport of cloud-fan 's https://github.com/apache/spark/pull/21738 which is a single commit among 3 commits of SPARK-21743. In addition, I added a test case to prevent regressions in branch-2.3 and branch-2.2. Although SPARK-21743 is reverted due to regression, this subpatch can go to branch-2.3 and branch-2.2. This will be merged as cloud-fan 's commit. ## How was this patch tested? Pass the jenkins with a newly added test case. Closes #22150 from dongjoon-hyun/SPARK-25144. Lead-authored-by: Wenchen Fan Co-authored-by: Dongjoon Hyun Signed-off-by: hyukjinkwon --- .../UnsafeFixedWidthAggregationMap.java | 17 ++++++++++----- .../aggregate/HashAggregateExec.scala | 2 +- .../TungstenAggregationIterator.scala | 2 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 8 +++++++ .../UnsafeFixedWidthAggregationMapSuite.scala | 21 ++++++++++++------- 5 files changed, 35 insertions(+), 15 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java index c7c4c7b3e7715..c8cf44b51df77 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java @@ -20,8 +20,8 @@ import java.io.IOException; import org.apache.spark.SparkEnv; +import org.apache.spark.TaskContext; import org.apache.spark.internal.config.package$; -import org.apache.spark.memory.TaskMemoryManager; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.UnsafeProjection; import org.apache.spark.sql.catalyst.expressions.UnsafeRow; @@ -82,7 +82,7 @@ public static boolean supportsAggregationBufferSchema(StructType schema) { * @param emptyAggregationBuffer the default value for new keys (a "zero" of the agg. function) * @param aggregationBufferSchema the schema of the aggregation buffer, used for row conversion. * @param groupingKeySchema the schema of the grouping key, used for row conversion. - * @param taskMemoryManager the memory manager used to allocate our Unsafe memory structures. + * @param taskContext the current task context. * @param initialCapacity the initial capacity of the map (a sizing hint to avoid re-hashing). 
* @param pageSizeBytes the data page size, in bytes; limits the maximum record size. */ @@ -90,19 +90,26 @@ public UnsafeFixedWidthAggregationMap( InternalRow emptyAggregationBuffer, StructType aggregationBufferSchema, StructType groupingKeySchema, - TaskMemoryManager taskMemoryManager, + TaskContext taskContext, int initialCapacity, long pageSizeBytes) { this.aggregationBufferSchema = aggregationBufferSchema; this.currentAggregationBuffer = new UnsafeRow(aggregationBufferSchema.length()); this.groupingKeyProjection = UnsafeProjection.create(groupingKeySchema); this.groupingKeySchema = groupingKeySchema; - this.map = - new BytesToBytesMap(taskMemoryManager, initialCapacity, pageSizeBytes, true); + this.map = new BytesToBytesMap( + taskContext.taskMemoryManager(), initialCapacity, pageSizeBytes, true); // Initialize the buffer for aggregation value final UnsafeProjection valueProjection = UnsafeProjection.create(aggregationBufferSchema); this.emptyAggregationBuffer = valueProjection.apply(emptyAggregationBuffer).getBytes(); + + // Register a cleanup task with TaskContext to ensure that memory is guaranteed to be freed at + // the end of the task. This is necessary to avoid memory leaks in when the downstream operator + // does not fully consume the aggregation map's output (e.g. aggregate followed by limit). + taskContext.addTaskCompletionListener(context -> { + free(); + }); } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index ce3c68810f3b6..c0fbaf60c416d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -324,7 +324,7 @@ case class HashAggregateExec( initialBuffer, bufferSchema, groupingKeySchema, - TaskContext.get().taskMemoryManager(), + TaskContext.get(), 1024 * 16, // initial capacity TaskContext.get().taskMemoryManager().pageSizeBytes ) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala index 9dc334c1ead3c..c1911235f8df3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala @@ -166,7 +166,7 @@ class TungstenAggregationIterator( initialAggregationBuffer, StructType.fromAttributes(aggregateFunctions.flatMap(_.aggBufferAttributes)), StructType.fromAttributes(groupingExpressions.map(_.toAttribute)), - TaskContext.get().taskMemoryManager(), + TaskContext.get(), 1024 * 16, // initial capacity TaskContext.get().taskMemoryManager().pageSizeBytes ) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 80113485c79e7..2ff47f9e2a12c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2818,4 +2818,12 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { spark.sql(s"select * from spark_25084 distribute by ($distributeExprs)").count === count) } } + + test("SPARK-25144 'distinct' causes memory leak") { + val ds = List(Foo(Some("bar"))).toDS + val result = 
ds.flatMap(_.bar).distinct + result.rdd.isEmpty + } } + +case class Foo(bar: Option[String]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala index 3e31d22e15c0e..5c15ecd42fa0c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala @@ -23,6 +23,7 @@ import scala.collection.mutable import scala.util.{Random, Try} import scala.util.control.NonFatal +import org.mockito.Mockito._ import org.scalatest.Matchers import org.apache.spark.{SparkConf, SparkFunSuite, TaskContext, TaskContextImpl} @@ -54,6 +55,8 @@ class UnsafeFixedWidthAggregationMapSuite private var memoryManager: TestMemoryManager = null private var taskMemoryManager: TaskMemoryManager = null + private var taskContext: TaskContext = null + def testWithMemoryLeakDetection(name: String)(f: => Unit) { def cleanup(): Unit = { if (taskMemoryManager != null) { @@ -67,6 +70,8 @@ class UnsafeFixedWidthAggregationMapSuite val conf = new SparkConf().set(MEMORY_OFFHEAP_ENABLED.key, "false") memoryManager = new TestMemoryManager(conf) taskMemoryManager = new TaskMemoryManager(memoryManager, 0) + taskContext = mock(classOf[TaskContext]) + when(taskContext.taskMemoryManager()).thenReturn(taskMemoryManager) TaskContext.setTaskContext(new TaskContextImpl( stageId = 0, @@ -111,7 +116,7 @@ class UnsafeFixedWidthAggregationMapSuite emptyAggregationBuffer, aggBufferSchema, groupKeySchema, - taskMemoryManager, + taskContext, 1024, // initial capacity, PAGE_SIZE_BYTES ) @@ -124,7 +129,7 @@ class UnsafeFixedWidthAggregationMapSuite emptyAggregationBuffer, aggBufferSchema, groupKeySchema, - taskMemoryManager, + taskContext, 1024, // initial capacity PAGE_SIZE_BYTES ) @@ -151,7 +156,7 @@ class UnsafeFixedWidthAggregationMapSuite emptyAggregationBuffer, aggBufferSchema, groupKeySchema, - taskMemoryManager, + taskContext, 128, // initial capacity PAGE_SIZE_BYTES ) @@ -176,7 +181,7 @@ class UnsafeFixedWidthAggregationMapSuite emptyAggregationBuffer, aggBufferSchema, groupKeySchema, - taskMemoryManager, + taskContext, 128, // initial capacity PAGE_SIZE_BYTES ) @@ -223,7 +228,7 @@ class UnsafeFixedWidthAggregationMapSuite emptyAggregationBuffer, aggBufferSchema, groupKeySchema, - taskMemoryManager, + taskContext, 128, // initial capacity PAGE_SIZE_BYTES ) @@ -263,7 +268,7 @@ class UnsafeFixedWidthAggregationMapSuite emptyAggregationBuffer, StructType(Nil), StructType(Nil), - taskMemoryManager, + taskContext, 128, // initial capacity PAGE_SIZE_BYTES ) @@ -307,7 +312,7 @@ class UnsafeFixedWidthAggregationMapSuite emptyAggregationBuffer, aggBufferSchema, groupKeySchema, - taskMemoryManager, + taskContext, 128, // initial capacity pageSize ) @@ -344,7 +349,7 @@ class UnsafeFixedWidthAggregationMapSuite emptyAggregationBuffer, aggBufferSchema, groupKeySchema, - taskMemoryManager, + taskContext, 128, // initial capacity pageSize ) From 9702bb637d5ac665fefaa96cc69c5f92553f613a Mon Sep 17 00:00:00 2001 From: Zhang Le Date: Mon, 20 Aug 2018 14:59:03 -0500 Subject: [PATCH 565/566] [DOCS] Fixed NDCG formula issues When j is 0, log(j+1) will be 0, and this leads to division by 0 issue. ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) ## How was this patch tested? (Please explain how this patch was tested. E.g. 
unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Please review http://spark.apache.org/contributing.html before opening a pull request. Closes #22090 from yueguoguo/patch-1. Authored-by: Zhang Le Signed-off-by: Sean Owen (cherry picked from commit 219ed7b487c2dfb5007247f77ebf1b3cc73cecb5) Signed-off-by: Sean Owen --- docs/mllib-evaluation-metrics.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md index 7f277543d2e9a..ac398fb7fce7d 100644 --- a/docs/mllib-evaluation-metrics.md +++ b/docs/mllib-evaluation-metrics.md @@ -462,13 +462,13 @@ $$rel_D(r) = \begin{cases}1 & \text{if $r \in D$}, \\ 0 & \text{otherwise}.\end{
    Normalized Discounted Cumulative Gain $NDCG(k)=\frac{1}{M} \sum_{i=0}^{M-1} {\frac{1}{IDCG(D_i, k)}\sum_{j=0}^{n-1} - \frac{rel_{D_i}(R_i(j))}{\text{ln}(j+1)}} \\ + \frac{rel_{D_i}(R_i(j))}{\text{ln}(j+2)}} \\ \text{Where} \\ \hspace{5 mm} n = \text{min}\left(\text{max}\left(|R_i|,|D_i|\right),k\right) \\ - \hspace{5 mm} IDCG(D, k) = \sum_{j=0}^{\text{min}(\left|D\right|, k) - 1} \frac{1}{\text{ln}(j+1)}$ + \hspace{5 mm} IDCG(D, k) = \sum_{j=0}^{\text{min}(\left|D\right|, k) - 1} \frac{1}{\text{ln}(j+2)}$ - NDCG at k is a + NDCG at k is a measure of how many of the first k recommended documents are in the set of true relevant documents averaged across all users. In contrast to precision at k, this metric takes into account the order of the recommendations (documents are assumed to be in order of decreasing relevance). From 8bde4678166f5f01837919d4f8d742b89f5e76b8 Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Mon, 20 Aug 2018 23:13:31 -0700 Subject: [PATCH 566/566] [SPARK-25114][CORE] Fix RecordBinaryComparator when subtraction between two words is divisible by Integer.MAX_VALUE. https://github.com/apache/spark/pull/22079#discussion_r209705612 It is possible for two objects to be unequal and yet we consider them as equal with this code, if the long values are separated by Int.MaxValue. This PR fixes the issue. Add new test cases in `RecordBinaryComparatorSuite`. Closes #22101 from jiangxb1987/fix-rbc. Authored-by: Xingbo Jiang Signed-off-by: Xiao Li (cherry picked from commit 4fb96e5105cec4a3eb19a2b7997600b086bac32f) Signed-off-by: Xiao Li --- .../sql/execution/RecordBinaryComparator.java | 26 +- .../sort/RecordBinaryComparatorSuite.java | 322 ++++++++++++++++++ 2 files changed, 337 insertions(+), 11 deletions(-) create mode 100644 sql/core/src/test/java/test/org/apache/spark/sql/execution/sort/RecordBinaryComparatorSuite.java diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/RecordBinaryComparator.java b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/RecordBinaryComparator.java index bb77b5bf6de2a..40c2cc806e87a 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/RecordBinaryComparator.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/RecordBinaryComparator.java @@ -22,12 +22,10 @@ public final class RecordBinaryComparator extends RecordComparator { - // TODO(jiangxb) Add test suite for this. @Override public int compare( Object leftObj, long leftOff, int leftLen, Object rightObj, long rightOff, int rightLen) { int i = 0; - int res = 0; // If the arrays have different length, the longer one is larger. if (leftLen != rightLen) { @@ -40,27 +38,33 @@ public int compare( // check if stars align and we can get both offsets to be aligned if ((leftOff % 8) == (rightOff % 8)) { while ((leftOff + i) % 8 != 0 && i < leftLen) { - res = (Platform.getByte(leftObj, leftOff + i) & 0xff) - - (Platform.getByte(rightObj, rightOff + i) & 0xff); - if (res != 0) return res; + final int v1 = Platform.getByte(leftObj, leftOff + i) & 0xff; + final int v2 = Platform.getByte(rightObj, rightOff + i) & 0xff; + if (v1 != v2) { + return v1 > v2 ? 
1 : -1; + } i += 1; } } // for architectures that support unaligned accesses, chew it up 8 bytes at a time if (Platform.unaligned() || (((leftOff + i) % 8 == 0) && ((rightOff + i) % 8 == 0))) { while (i <= leftLen - 8) { - res = (int) ((Platform.getLong(leftObj, leftOff + i) - - Platform.getLong(rightObj, rightOff + i)) % Integer.MAX_VALUE); - if (res != 0) return res; + final long v1 = Platform.getLong(leftObj, leftOff + i); + final long v2 = Platform.getLong(rightObj, rightOff + i); + if (v1 != v2) { + return v1 > v2 ? 1 : -1; + } i += 8; } } // this will finish off the unaligned comparisons, or do the entire aligned comparison // whichever is needed. while (i < leftLen) { - res = (Platform.getByte(leftObj, leftOff + i) & 0xff) - - (Platform.getByte(rightObj, rightOff + i) & 0xff); - if (res != 0) return res; + final int v1 = Platform.getByte(leftObj, leftOff + i) & 0xff; + final int v2 = Platform.getByte(rightObj, rightOff + i) & 0xff; + if (v1 != v2) { + return v1 > v2 ? 1 : -1; + } i += 1; } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/execution/sort/RecordBinaryComparatorSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/execution/sort/RecordBinaryComparatorSuite.java new file mode 100644 index 0000000000000..97f3dc588ecc5 --- /dev/null +++ b/sql/core/src/test/java/test/org/apache/spark/sql/execution/sort/RecordBinaryComparatorSuite.java @@ -0,0 +1,322 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package test.org.apache.spark.sql.execution.sort; + +import org.apache.spark.SparkConf; +import org.apache.spark.memory.TaskMemoryManager; +import org.apache.spark.memory.TestMemoryConsumer; +import org.apache.spark.memory.TestMemoryManager; +import org.apache.spark.sql.catalyst.expressions.UnsafeArrayData; +import org.apache.spark.sql.catalyst.expressions.UnsafeRow; +import org.apache.spark.sql.execution.RecordBinaryComparator; +import org.apache.spark.unsafe.Platform; +import org.apache.spark.unsafe.UnsafeAlignedOffset; +import org.apache.spark.unsafe.array.LongArray; +import org.apache.spark.unsafe.memory.MemoryBlock; +import org.apache.spark.unsafe.types.UTF8String; +import org.apache.spark.util.collection.unsafe.sort.*; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Test the RecordBinaryComparator, which compares two UnsafeRows by their binary form. 
+ */ +public class RecordBinaryComparatorSuite { + + private final TaskMemoryManager memoryManager = new TaskMemoryManager( + new TestMemoryManager(new SparkConf().set("spark.memory.offHeap.enabled", "false")), 0); + private final TestMemoryConsumer consumer = new TestMemoryConsumer(memoryManager); + + private final int uaoSize = UnsafeAlignedOffset.getUaoSize(); + + private MemoryBlock dataPage; + private long pageCursor; + + private LongArray array; + private int pos; + + @Before + public void beforeEach() { + // Only compare between two input rows. + array = consumer.allocateArray(2); + pos = 0; + + dataPage = memoryManager.allocatePage(4096, consumer); + pageCursor = dataPage.getBaseOffset(); + } + + @After + public void afterEach() { + consumer.freePage(dataPage); + dataPage = null; + pageCursor = 0; + + consumer.freeArray(array); + array = null; + pos = 0; + } + + private void insertRow(UnsafeRow row) { + Object recordBase = row.getBaseObject(); + long recordOffset = row.getBaseOffset(); + int recordLength = row.getSizeInBytes(); + + Object baseObject = dataPage.getBaseObject(); + assert(pageCursor + recordLength <= dataPage.getBaseOffset() + dataPage.size()); + long recordAddress = memoryManager.encodePageNumberAndOffset(dataPage, pageCursor); + UnsafeAlignedOffset.putSize(baseObject, pageCursor, recordLength); + pageCursor += uaoSize; + Platform.copyMemory(recordBase, recordOffset, baseObject, pageCursor, recordLength); + pageCursor += recordLength; + + assert(pos < 2); + array.set(pos, recordAddress); + pos++; + } + + private int compare(int index1, int index2) { + Object baseObject = dataPage.getBaseObject(); + + long recordAddress1 = array.get(index1); + long baseOffset1 = memoryManager.getOffsetInPage(recordAddress1) + uaoSize; + int recordLength1 = UnsafeAlignedOffset.getSize(baseObject, baseOffset1 - uaoSize); + + long recordAddress2 = array.get(index2); + long baseOffset2 = memoryManager.getOffsetInPage(recordAddress2) + uaoSize; + int recordLength2 = UnsafeAlignedOffset.getSize(baseObject, baseOffset2 - uaoSize); + + return binaryComparator.compare(baseObject, baseOffset1, recordLength1, baseObject, + baseOffset2, recordLength2); + } + + private final RecordComparator binaryComparator = new RecordBinaryComparator(); + + // Compute the most compact size for UnsafeRow's backing data. + private int computeSizeInBytes(int originalSize) { + // All the UnsafeRows in this suite contains less than 64 columns, so the bitSetSize shall + // always be 8. + return 8 + (originalSize + 7) / 8 * 8; + } + + // Compute the relative offset of variable-length values. + private long relativeOffset(int numFields) { + // All the UnsafeRows in this suite contains less than 64 columns, so the bitSetSize shall + // always be 8. 
+ return 8 + numFields * 8L; + } + + @Test + public void testBinaryComparatorForSingleColumnRow() throws Exception { + int numFields = 1; + + UnsafeRow row1 = new UnsafeRow(numFields); + byte[] data1 = new byte[100]; + row1.pointTo(data1, computeSizeInBytes(numFields * 8)); + row1.setInt(0, 11); + + UnsafeRow row2 = new UnsafeRow(numFields); + byte[] data2 = new byte[100]; + row2.pointTo(data2, computeSizeInBytes(numFields * 8)); + row2.setInt(0, 42); + + insertRow(row1); + insertRow(row2); + + assert(compare(0, 0) == 0); + assert(compare(0, 1) < 0); + } + + @Test + public void testBinaryComparatorForMultipleColumnRow() throws Exception { + int numFields = 5; + + UnsafeRow row1 = new UnsafeRow(numFields); + byte[] data1 = new byte[100]; + row1.pointTo(data1, computeSizeInBytes(numFields * 8)); + for (int i = 0; i < numFields; i++) { + row1.setDouble(i, i * 3.14); + } + + UnsafeRow row2 = new UnsafeRow(numFields); + byte[] data2 = new byte[100]; + row2.pointTo(data2, computeSizeInBytes(numFields * 8)); + for (int i = 0; i < numFields; i++) { + row2.setDouble(i, 198.7 / (i + 1)); + } + + insertRow(row1); + insertRow(row2); + + assert(compare(0, 0) == 0); + assert(compare(0, 1) < 0); + } + + @Test + public void testBinaryComparatorForArrayColumn() throws Exception { + int numFields = 1; + + UnsafeRow row1 = new UnsafeRow(numFields); + byte[] data1 = new byte[100]; + UnsafeArrayData arrayData1 = UnsafeArrayData.fromPrimitiveArray(new int[]{11, 42, -1}); + row1.pointTo(data1, computeSizeInBytes(numFields * 8 + arrayData1.getSizeInBytes())); + row1.setLong(0, (relativeOffset(numFields) << 32) | (long) arrayData1.getSizeInBytes()); + Platform.copyMemory(arrayData1.getBaseObject(), arrayData1.getBaseOffset(), data1, + row1.getBaseOffset() + relativeOffset(numFields), arrayData1.getSizeInBytes()); + + UnsafeRow row2 = new UnsafeRow(numFields); + byte[] data2 = new byte[100]; + UnsafeArrayData arrayData2 = UnsafeArrayData.fromPrimitiveArray(new int[]{22}); + row2.pointTo(data2, computeSizeInBytes(numFields * 8 + arrayData2.getSizeInBytes())); + row2.setLong(0, (relativeOffset(numFields) << 32) | (long) arrayData2.getSizeInBytes()); + Platform.copyMemory(arrayData2.getBaseObject(), arrayData2.getBaseOffset(), data2, + row2.getBaseOffset() + relativeOffset(numFields), arrayData2.getSizeInBytes()); + + insertRow(row1); + insertRow(row2); + + assert(compare(0, 0) == 0); + assert(compare(0, 1) > 0); + } + + @Test + public void testBinaryComparatorForMixedColumns() throws Exception { + int numFields = 4; + + UnsafeRow row1 = new UnsafeRow(numFields); + byte[] data1 = new byte[100]; + UTF8String str1 = UTF8String.fromString("Milk tea"); + row1.pointTo(data1, computeSizeInBytes(numFields * 8 + str1.numBytes())); + row1.setInt(0, 11); + row1.setDouble(1, 3.14); + row1.setInt(2, -1); + row1.setLong(3, (relativeOffset(numFields) << 32) | (long) str1.numBytes()); + Platform.copyMemory(str1.getBaseObject(), str1.getBaseOffset(), data1, + row1.getBaseOffset() + relativeOffset(numFields), str1.numBytes()); + + UnsafeRow row2 = new UnsafeRow(numFields); + byte[] data2 = new byte[100]; + UTF8String str2 = UTF8String.fromString("Java"); + row2.pointTo(data2, computeSizeInBytes(numFields * 8 + str2.numBytes())); + row2.setInt(0, 11); + row2.setDouble(1, 3.14); + row2.setInt(2, -1); + row2.setLong(3, (relativeOffset(numFields) << 32) | (long) str2.numBytes()); + Platform.copyMemory(str2.getBaseObject(), str2.getBaseOffset(), data2, + row2.getBaseOffset() + relativeOffset(numFields), str2.numBytes()); + + 
insertRow(row1); + insertRow(row2); + + assert(compare(0, 0) == 0); + assert(compare(0, 1) > 0); + } + + @Test + public void testBinaryComparatorForNullColumns() throws Exception { + int numFields = 3; + + UnsafeRow row1 = new UnsafeRow(numFields); + byte[] data1 = new byte[100]; + row1.pointTo(data1, computeSizeInBytes(numFields * 8)); + for (int i = 0; i < numFields; i++) { + row1.setNullAt(i); + } + + UnsafeRow row2 = new UnsafeRow(numFields); + byte[] data2 = new byte[100]; + row2.pointTo(data2, computeSizeInBytes(numFields * 8)); + for (int i = 0; i < numFields - 1; i++) { + row2.setNullAt(i); + } + row2.setDouble(numFields - 1, 3.14); + + insertRow(row1); + insertRow(row2); + + assert(compare(0, 0) == 0); + assert(compare(0, 1) > 0); + } + + @Test + public void testBinaryComparatorWhenSubtractionIsDivisibleByMaxIntValue() throws Exception { + int numFields = 1; + + UnsafeRow row1 = new UnsafeRow(numFields); + byte[] data1 = new byte[100]; + row1.pointTo(data1, computeSizeInBytes(numFields * 8)); + row1.setLong(0, 11); + + UnsafeRow row2 = new UnsafeRow(numFields); + byte[] data2 = new byte[100]; + row2.pointTo(data2, computeSizeInBytes(numFields * 8)); + row2.setLong(0, 11L + Integer.MAX_VALUE); + + insertRow(row1); + insertRow(row2); + + assert(compare(0, 1) < 0); + } + + @Test + public void testBinaryComparatorWhenSubtractionCanOverflowLongValue() throws Exception { + int numFields = 1; + + UnsafeRow row1 = new UnsafeRow(numFields); + byte[] data1 = new byte[100]; + row1.pointTo(data1, computeSizeInBytes(numFields * 8)); + row1.setLong(0, Long.MIN_VALUE); + + UnsafeRow row2 = new UnsafeRow(numFields); + byte[] data2 = new byte[100]; + row2.pointTo(data2, computeSizeInBytes(numFields * 8)); + row2.setLong(0, 1); + + insertRow(row1); + insertRow(row2); + + assert(compare(0, 1) < 0); + } + + @Test + public void testBinaryComparatorWhenOnlyTheLastColumnDiffers() throws Exception { + int numFields = 4; + + UnsafeRow row1 = new UnsafeRow(numFields); + byte[] data1 = new byte[100]; + row1.pointTo(data1, computeSizeInBytes(numFields * 8)); + row1.setInt(0, 11); + row1.setDouble(1, 3.14); + row1.setInt(2, -1); + row1.setLong(3, 0); + + UnsafeRow row2 = new UnsafeRow(numFields); + byte[] data2 = new byte[100]; + row2.pointTo(data2, computeSizeInBytes(numFields * 8)); + row2.setInt(0, 11); + row2.setDouble(1, 3.14); + row2.setInt(2, -1); + row2.setLong(3, 1); + + insertRow(row1); + insertRow(row2); + + assert(compare(0, 1) < 0); + } +}
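The faulty pattern removed above is easy to reproduce outside of Spark: reducing the 64-bit difference modulo Integer.MAX_VALUE and truncating it to an int returns 0 whenever the two words differ by exactly Integer.MAX_VALUE, and the subtraction itself can overflow and flip the sign of the result. The following is a minimal standalone sketch of both failure modes, not part of the patch; the helper names are made up for illustration, with `oldCompare` mirroring the removed subtraction-based logic and `newCompare` mirroring the fixed direct comparison.

```scala
// Old-style comparison: reduce the difference modulo Integer.MAX_VALUE,
// as the removed code in RecordBinaryComparator did.
def oldCompare(v1: Long, v2: Long): Int = ((v1 - v2) % Int.MaxValue).toInt

// Fixed-style comparison: compare the values directly, never subtract.
def newCompare(v1: Long, v2: Long): Int =
  if (v1 == v2) 0 else if (v1 > v2) 1 else -1

val a = 11L
val b = 11L + Int.MaxValue
// Difference is exactly Integer.MAX_VALUE, so the old logic reports equality.
assert(oldCompare(a, b) == 0)   // wrong: distinct values compare as equal
assert(newCompare(a, b) == -1)  // correct ordering

// Overflow case: Long.MinValue - 1L wraps around to Long.MaxValue, so the
// old logic claims Long.MinValue is greater than 1.
assert(oldCompare(Long.MinValue, 1L) > 0)  // wrong sign
assert(newCompare(Long.MinValue, 1L) < 0)  // correct
```

These are the same two cases exercised by the new `testBinaryComparatorWhenSubtractionIsDivisibleByMaxIntValue` and `testBinaryComparatorWhenSubtractionCanOverflowLongValue` tests added in this commit.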