From 594ac4f7b816488091202918c409487058e6d8ac Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Mon, 25 Jun 2018 23:44:20 +0800 Subject: [PATCH 01/79] [SPARK-24633][SQL] Fix codegen when split is required for arrays_zip ## What changes were proposed in this pull request? In function array_zip, when split is required by the high number of arguments, a codegen error can happen. The PR fixes codegen for cases when splitting the code is required. ## How was this patch tested? added UT Author: Marco Gaido Closes #21621 from mgaido91/SPARK-24633. --- .../catalyst/expressions/collectionOperations.scala | 4 ++-- .../apache/spark/sql/DataFrameFunctionsSuite.scala | 11 +++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 3afabe14606e4..b6137b07555f4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -200,7 +200,7 @@ case class ArraysZip(children: Seq[Expression]) extends Expression with ExpectsI """.stripMargin } - val splittedGetValuesAndCardinalities = ctx.splitExpressions( + val splittedGetValuesAndCardinalities = ctx.splitExpressionsWithCurrentInputs( expressions = getValuesAndCardinalities, funcName = "getValuesAndCardinalities", returnType = "int", @@ -210,7 +210,7 @@ case class ArraysZip(children: Seq[Expression]) extends Expression with ExpectsI |return $biggestCardinality; """.stripMargin, foldFunctions = _.map(funcCall => s"$biggestCardinality = $funcCall;").mkString("\n"), - arguments = + extraArguments = ("ArrayData[]", arrVals) :: ("int", biggestCardinality) :: Nil) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 25fdbab745128..47fe67d8daea3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -556,6 +556,17 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { checkAnswer(df8.selectExpr("arrays_zip(v1, v2)"), expectedValue8) } + test("SPARK-24633: arrays_zip splits input processing correctly") { + Seq("true", "false").foreach { wholestageCodegenEnabled => + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> wholestageCodegenEnabled) { + val df = spark.range(1) + val exprs = (0 to 5).map(x => array($"id" + lit(x))) + checkAnswer(df.select(arrays_zip(exprs: _*)), + Row(Seq(Row(0, 1, 2, 3, 4, 5)))) + } + } + } + test("map size function") { val df = Seq( (Map[Int, Int](1 -> 1, 2 -> 2), "x"), From 5264164a67df498b73facae207eda12ee133be7d Mon Sep 17 00:00:00 2001 From: Stacy Kerkela Date: Mon, 25 Jun 2018 23:41:39 +0200 Subject: [PATCH 02/79] [SPARK-24648][SQL] SqlMetrics should be threadsafe Use LongAdder to make SQLMetrics thread safe. ## What changes were proposed in this pull request? Replace += with LongAdder.add() for concurrent counting ## How was this patch tested? Unit tests with local threads Author: Stacy Kerkela Closes #21634 from dbkerkela/sqlmetrics-concurrency-stacy. 
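For background, here is a minimal, hedged sketch (not the SQLMetric code in the diff below; the object and thread counts are invented for illustration) of why `LongAdder` is a safe replacement for a plain `var` incremented with `+=` when several tasks update the same metric concurrently:

```scala
// Illustrative sketch only: LongAdder keeps internal per-thread cells, so
// concurrent add() calls never lose updates the way an unsynchronized
// `var total += v` can; sum() folds the cells into a single Long.
import java.util.concurrent.atomic.LongAdder

object LongAdderSketch {
  def main(args: Array[String]): Unit = {
    val counter = new LongAdder
    val threads = (1 to 4).map { _ =>
      new Thread(new Runnable {
        override def run(): Unit = (1 to 100000).foreach(_ => counter.add(1L))
      })
    }
    threads.foreach(_.start())
    threads.foreach(_.join())
    println(counter.sum()) // prints 400000 on every run
  }
}
```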
--- .../sql/execution/metric/SQLMetrics.scala | 33 ++++++++++------- .../execution/metric/SQLMetricsSuite.scala | 36 ++++++++++++++++++- 2 files changed, 55 insertions(+), 14 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala index 77b907870d678..b4f0ae1eb1a18 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.metric import java.text.NumberFormat import java.util.Locale +import java.util.concurrent.atomic.LongAdder import org.apache.spark.SparkContext import org.apache.spark.scheduler.AccumulableInfo @@ -32,40 +33,45 @@ import org.apache.spark.util.{AccumulatorContext, AccumulatorV2, Utils} * on the driver side must be explicitly posted using [[SQLMetrics.postDriverMetricUpdates()]]. */ class SQLMetric(val metricType: String, initValue: Long = 0L) extends AccumulatorV2[Long, Long] { + // This is a workaround for SPARK-11013. // We may use -1 as initial value of the accumulator, if the accumulator is valid, we will // update it at the end of task and the value will be at least 0. Then we can filter out the -1 // values before calculate max, min, etc. - private[this] var _value = initValue - private var _zeroValue = initValue + private[this] val _value = new LongAdder + private val _zeroValue = initValue + _value.add(initValue) override def copy(): SQLMetric = { - val newAcc = new SQLMetric(metricType, _value) - newAcc._zeroValue = initValue + val newAcc = new SQLMetric(metricType, initValue) + newAcc.add(_value.sum()) newAcc } - override def reset(): Unit = _value = _zeroValue + override def reset(): Unit = this.set(_zeroValue) override def merge(other: AccumulatorV2[Long, Long]): Unit = other match { - case o: SQLMetric => _value += o.value + case o: SQLMetric => _value.add(o.value) case _ => throw new UnsupportedOperationException( s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}") } - override def isZero(): Boolean = _value == _zeroValue + override def isZero(): Boolean = _value.sum() == _zeroValue - override def add(v: Long): Unit = _value += v + override def add(v: Long): Unit = _value.add(v) // We can set a double value to `SQLMetric` which stores only long value, if it is // average metrics. 
def set(v: Double): Unit = SQLMetrics.setDoubleForAverageMetrics(this, v) - def set(v: Long): Unit = _value = v + def set(v: Long): Unit = { + _value.reset() + _value.add(v) + } - def +=(v: Long): Unit = _value += v + def +=(v: Long): Unit = _value.add(v) - override def value: Long = _value + override def value: Long = _value.sum() // Provide special identifier as metadata so we can tell that this is a `SQLMetric` later override def toInfo(update: Option[Any], value: Option[Any]): AccumulableInfo = { @@ -153,7 +159,7 @@ object SQLMetrics { Seq.fill(3)(0L) } else { val sorted = validValues.sorted - Seq(sorted(0), sorted(validValues.length / 2), sorted(validValues.length - 1)) + Seq(sorted.head, sorted(validValues.length / 2), sorted(validValues.length - 1)) } metric.map(v => numberFormat.format(v.toDouble / baseForAvgMetric)) } @@ -173,7 +179,8 @@ object SQLMetrics { Seq.fill(4)(0L) } else { val sorted = validValues.sorted - Seq(sorted.sum, sorted(0), sorted(validValues.length / 2), sorted(validValues.length - 1)) + Seq(sorted.sum, sorted.head, sorted(validValues.length / 2), + sorted(validValues.length - 1)) } metric.map(strFormat) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index a3a3f3851e21c..8263c9c81c49e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -19,12 +19,12 @@ package org.apache.spark.sql.execution.metric import java.io.File +import scala.concurrent.{ExecutionContext, ExecutionContextExecutor, Future} import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.plans.logical.LocalRelation -import org.apache.spark.sql.execution.ui.SQLAppStatusStore import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext @@ -504,4 +504,38 @@ class SQLMetricsSuite extends SparkFunSuite with SQLMetricsTestUtils with Shared test("writing data out metrics with dynamic partition: parquet") { testMetricsDynamicPartition("parquet", "parquet", "t1") } + + test("writing metrics from single thread") { + val nAdds = 10 + val acc = new SQLMetric("test", -10) + assert(acc.isZero()) + acc.set(0) + for (i <- 1 to nAdds) acc.add(1) + assert(!acc.isZero()) + assert(nAdds === acc.value) + acc.reset() + assert(acc.isZero()) + } + + test("writing metrics from multiple threads") { + implicit val ec: ExecutionContextExecutor = ExecutionContext.global + val nFutures = 1000 + val nAdds = 100 + val acc = new SQLMetric("test", -10) + assert(acc.isZero() === true) + acc.set(0) + val l = for ( i <- 1 to nFutures ) yield { + Future { + for (j <- 1 to nAdds) acc.add(1) + i + } + } + for (futures <- Future.sequence(l)) { + assert(nFutures === futures.length) + assert(!acc.isZero()) + assert(nFutures * nAdds === acc.value) + acc.reset() + assert(acc.isZero()) + } + } } From baa01c8ca9e8ea456f986fbb223c61ad541b52b0 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 25 Jun 2018 15:12:33 -0700 Subject: [PATCH 03/79] [INFRA] Close stale PR. Closes #21614 From 6d16b9885d6ad01e1cc56d5241b7ebad99487a0c Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 25 Jun 2018 16:54:57 -0700 Subject: [PATCH 04/79] [SPARK-24552][CORE][SQL] Use task ID instead of attempt number for writes. 
This passes the unique task attempt id instead of attempt number to v2 data sources because attempt number is reused when stages are retried. When attempt numbers are reused, sources that track data by partition id and attempt number may incorrectly clean up data because the same attempt number can be both committed and aborted. For v1 / Hadoop writes, generate a unique ID based on available attempt numbers to avoid a similar problem. Closes #21558 Author: Marcelo Vanzin Author: Ryan Blue Closes #21606 from vanzin/SPARK-24552.2. --- .../spark/internal/io/SparkHadoopWriter.scala | 6 +++- .../sql/kafka010/KafkaStreamWriter.scala | 2 +- .../sources/v2/writer/DataSourceWriter.java | 8 +++--- .../sql/sources/v2/writer/DataWriter.java | 8 +++--- .../sources/v2/writer/DataWriterFactory.java | 11 +++----- .../datasources/v2/WriteToDataSourceV2.scala | 28 ++++++++++--------- .../continuous/ContinuousWriteRDD.scala | 2 +- .../sources/ForeachWriterProvider.scala | 2 +- .../sources/PackedRowWriterFactory.scala | 2 +- .../streaming/sources/memoryV2.scala | 2 +- .../sources/v2/SimpleWritableDataSource.scala | 8 +++--- 11 files changed, 41 insertions(+), 38 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala b/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala index abf39213fa0d2..9ebd0aa301592 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala @@ -76,13 +76,17 @@ object SparkHadoopWriter extends Logging { // Try to write all RDD partitions as a Hadoop OutputFormat. try { val ret = sparkContext.runJob(rdd, (context: TaskContext, iter: Iterator[(K, V)]) => { + // SPARK-24552: Generate a unique "attempt ID" based on the stage and task attempt numbers. + // Assumes that there won't be more than Short.MaxValue attempts, at least not concurrently. 
+ val attemptId = (context.stageAttemptNumber << 16) | context.attemptNumber + executeTask( context = context, config = config, jobTrackerId = jobTrackerId, commitJobId = commitJobId, sparkPartitionId = context.partitionId, - sparkAttemptNumber = context.attemptNumber, + sparkAttemptNumber = attemptId, committer = committer, iterator = iter) }) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamWriter.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamWriter.scala index ae5b5c52d514e..32923dc9f5a6b 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamWriter.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamWriter.scala @@ -67,7 +67,7 @@ case class KafkaStreamWriterFactory( override def createDataWriter( partitionId: Int, - attemptNumber: Int, + taskId: Long, epochId: Long): DataWriter[InternalRow] = { new KafkaStreamDataWriter(topic, producerParams, schema.toAttributes) } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceWriter.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceWriter.java index 0030a9f05dba7..7eedc85a5d6f3 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceWriter.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceWriter.java @@ -64,8 +64,8 @@ public interface DataSourceWriter { DataWriterFactory createWriterFactory(); /** - * Returns whether Spark should use the commit coordinator to ensure that at most one attempt for - * each task commits. + * Returns whether Spark should use the commit coordinator to ensure that at most one task for + * each partition commits. * * @return true if commit coordinator should be used, false otherwise. */ @@ -90,9 +90,9 @@ default void onDataWriterCommit(WriterCommitMessage message) {} * is undefined and @{@link #abort(WriterCommitMessage[])} may not be able to deal with it. * * Note that speculative execution may cause multiple tasks to run for a partition. By default, - * Spark uses the commit coordinator to allow at most one attempt to commit. Implementations can + * Spark uses the commit coordinator to allow at most one task to commit. Implementations can * disable this behavior by overriding {@link #useCommitCoordinator()}. If disabled, multiple - * attempts may have committed successfully and one successful commit message per task will be + * tasks may have committed successfully and one successful commit message per task will be * passed to this commit method. The remaining commit messages are ignored by Spark. */ void commit(WriterCommitMessage[] messages); diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriter.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriter.java index 39bf458298862..1626c0013e4e7 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriter.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriter.java @@ -22,7 +22,7 @@ import org.apache.spark.annotation.InterfaceStability; /** - * A data writer returned by {@link DataWriterFactory#createDataWriter(int, int, long)} and is + * A data writer returned by {@link DataWriterFactory#createDataWriter(int, long, long)} and is * responsible for writing data for an input RDD partition. * * One Spark task has one exclusive data writer, so there is no thread-safe concern. 
@@ -39,14 +39,14 @@ * {@link DataSourceWriter#commit(WriterCommitMessage[])} with commit messages from other data * writers. If this data writer fails(one record fails to write or {@link #commit()} fails), an * exception will be sent to the driver side, and Spark may retry this writing task a few times. - * In each retry, {@link DataWriterFactory#createDataWriter(int, int, long)} will receive a - * different `attemptNumber`. Spark will call {@link DataSourceWriter#abort(WriterCommitMessage[])} + * In each retry, {@link DataWriterFactory#createDataWriter(int, long, long)} will receive a + * different `taskId`. Spark will call {@link DataSourceWriter#abort(WriterCommitMessage[])} * when the configured number of retries is exhausted. * * Besides the retry mechanism, Spark may launch speculative tasks if the existing writing task * takes too long to finish. Different from retried tasks, which are launched one by one after the * previous one fails, speculative tasks are running simultaneously. It's possible that one input - * RDD partition has multiple data writers with different `attemptNumber` running at the same time, + * RDD partition has multiple data writers with different `taskId` running at the same time, * and data sources should guarantee that these data writers don't conflict and can work together. * Implementations can coordinate with driver during {@link #commit()} to make sure only one of * these data writers can commit successfully. Or implementations can allow all of them to commit diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriterFactory.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriterFactory.java index 7527bcc0c4027..0932ff8f8f8a7 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriterFactory.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriterFactory.java @@ -42,15 +42,12 @@ public interface DataWriterFactory extends Serializable { * Usually Spark processes many RDD partitions at the same time, * implementations should use the partition id to distinguish writers for * different partitions. - * @param attemptNumber Spark may launch multiple tasks with the same task id. For example, a task - * failed, Spark launches a new task wth the same task id but different - * attempt number. Or a task is too slow, Spark launches new tasks wth the - * same task id but different attempt number, which means there are multiple - * tasks with the same task id running at the same time. Implementations can - * use this attempt number to distinguish writers of different task attempts. + * @param taskId A unique identifier for a task that is performing the write of the partition + * data. Spark may run multiple tasks for the same partition (due to speculation + * or task failures, for example). * @param epochId A monotonically increasing id for streaming queries that are split in to * discrete periods of execution. For non-streaming queries, * this ID will always be 0. 
*/ - DataWriter createDataWriter(int partitionId, int attemptNumber, long epochId); + DataWriter createDataWriter(int partitionId, long taskId, long epochId); } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala index 11ed7131e7e3d..b1148c0f62f7c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala @@ -29,10 +29,8 @@ import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.streaming.{MicroBatchExecution, StreamExecution} -import org.apache.spark.sql.execution.streaming.continuous.{CommitPartitionEpoch, ContinuousExecution, EpochCoordinatorRef, SetWriterPartitions} +import org.apache.spark.sql.execution.streaming.MicroBatchExecution import org.apache.spark.sql.sources.v2.writer._ -import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -111,9 +109,10 @@ object DataWritingSparkTask extends Logging { val stageId = context.stageId() val stageAttempt = context.stageAttemptNumber() val partId = context.partitionId() + val taskId = context.taskAttemptId() val attemptId = context.attemptNumber() val epochId = Option(context.getLocalProperty(MicroBatchExecution.BATCH_ID_KEY)).getOrElse("0") - val dataWriter = writeTask.createDataWriter(partId, attemptId, epochId.toLong) + val dataWriter = writeTask.createDataWriter(partId, taskId, epochId.toLong) // write the data and commit this writer. 
Utils.tryWithSafeFinallyAndFailureCallbacks(block = { @@ -125,12 +124,12 @@ object DataWritingSparkTask extends Logging { val coordinator = SparkEnv.get.outputCommitCoordinator val commitAuthorized = coordinator.canCommit(stageId, stageAttempt, partId, attemptId) if (commitAuthorized) { - logInfo(s"Writer for stage $stageId / $stageAttempt, " + - s"task $partId.$attemptId is authorized to commit.") + logInfo(s"Commit authorized for partition $partId (task $taskId, attempt $attemptId" + + s"stage $stageId.$stageAttempt)") dataWriter.commit() } else { - val message = s"Stage $stageId / $stageAttempt, " + - s"task $partId.$attemptId: driver did not authorize commit" + val message = s"Commit denied for partition $partId (task $taskId, attempt $attemptId" + + s"stage $stageId.$stageAttempt)" logInfo(message) // throwing CommitDeniedException will trigger the catch block for abort throw new CommitDeniedException(message, stageId, partId, attemptId) @@ -141,15 +140,18 @@ object DataWritingSparkTask extends Logging { dataWriter.commit() } - logInfo(s"Writer for stage $stageId, task $partId.$attemptId committed.") + logInfo(s"Committed partition $partId (task $taskId, attempt $attemptId" + + s"stage $stageId.$stageAttempt)") msg })(catchBlock = { // If there is an error, abort this writer - logError(s"Writer for stage $stageId, task $partId.$attemptId is aborting.") + logError(s"Aborting commit for partition $partId (task $taskId, attempt $attemptId" + + s"stage $stageId.$stageAttempt)") dataWriter.abort() - logError(s"Writer for stage $stageId, task $partId.$attemptId aborted.") + logError(s"Aborted commit for partition $partId (task $taskId, attempt $attemptId" + + s"stage $stageId.$stageAttempt)") }) } } @@ -160,10 +162,10 @@ class InternalRowDataWriterFactory( override def createDataWriter( partitionId: Int, - attemptNumber: Int, + taskId: Long, epochId: Long): DataWriter[InternalRow] = { new InternalRowDataWriter( - rowWriterFactory.createDataWriter(partitionId, attemptNumber, epochId), + rowWriterFactory.createDataWriter(partitionId, taskId, epochId), RowEncoder.apply(schema).resolveAndBind()) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousWriteRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousWriteRDD.scala index ef5f0da1e7cc2..76f3f5baa8d56 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousWriteRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousWriteRDD.scala @@ -56,7 +56,7 @@ class ContinuousWriteRDD(var prev: RDD[InternalRow], writeTask: DataWriterFactor val dataIterator = prev.compute(split, context) dataWriter = writeTask.createDataWriter( context.partitionId(), - context.attemptNumber(), + context.taskAttemptId(), EpochTracker.getCurrentEpoch.get) while (dataIterator.hasNext) { dataWriter.write(dataIterator.next()) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachWriterProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachWriterProvider.scala index f677f25f116a2..bc9b6d93ce7d9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachWriterProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachWriterProvider.scala @@ -88,7 +88,7 @@ case class ForeachWriterFactory[T]( extends DataWriterFactory[InternalRow] { override def 
createDataWriter( partitionId: Int, - attemptNumber: Int, + taskId: Long, epochId: Long): ForeachDataWriter[T] = { new ForeachDataWriter(writer, rowConverter, partitionId, epochId) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala index e07355aa37dba..b501d90c81f06 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriter, Dat case object PackedRowWriterFactory extends DataWriterFactory[Row] { override def createDataWriter( partitionId: Int, - attemptNumber: Int, + taskId: Long, epochId: Long): DataWriter[Row] = { new PackedRowDataWriter() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala index 47b482007822d..29f8cca476722 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala @@ -180,7 +180,7 @@ class MemoryStreamWriter( case class MemoryWriterFactory(outputMode: OutputMode) extends DataWriterFactory[Row] { override def createDataWriter( partitionId: Int, - attemptNumber: Int, + taskId: Long, epochId: Long): DataWriter[Row] = { new MemoryDataWriter(partitionId, outputMode) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala index 694bb3b95b0f0..1334cf71ae988 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala @@ -209,10 +209,10 @@ class SimpleCSVDataWriterFactory(path: String, jobId: String, conf: Serializable override def createDataWriter( partitionId: Int, - attemptNumber: Int, + taskId: Long, epochId: Long): DataWriter[Row] = { val jobPath = new Path(new Path(path, "_temporary"), jobId) - val filePath = new Path(jobPath, s"$jobId-$partitionId-$attemptNumber") + val filePath = new Path(jobPath, s"$jobId-$partitionId-$taskId") val fs = filePath.getFileSystem(conf.value) new SimpleCSVDataWriter(fs, filePath) } @@ -245,10 +245,10 @@ class InternalRowCSVDataWriterFactory(path: String, jobId: String, conf: Seriali override def createDataWriter( partitionId: Int, - attemptNumber: Int, + taskId: Long, epochId: Long): DataWriter[InternalRow] = { val jobPath = new Path(new Path(path, "_temporary"), jobId) - val filePath = new Path(jobPath, s"$jobId-$partitionId-$attemptNumber") + val filePath = new Path(jobPath, s"$jobId-$partitionId-$taskId") val fs = filePath.getFileSystem(conf.value) new InternalRowCSVDataWriter(fs, filePath) } From d48803bf64dc0fccd6f560738b4682f0c05e767a Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 25 Jun 2018 17:08:23 -0700 Subject: [PATCH 05/79] [SPARK-24324][PYTHON][FOLLOWUP] Grouped Map positional conf should have deprecation note ## What changes were proposed in this pull request? 
Followup to the discussion of the added conf in SPARK-24324 which allows assignment by column position only. This conf is to preserve old behavior and will be removed in future releases, so it should have a note to indicate that. ## How was this patch tested? NA Author: Bryan Cutler Closes #21637 from BryanCutler/arrow-groupedMap-conf-deprecate-followup-SPARK-24324. --- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index d5fb524a1396f..e768416f257c9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1167,7 +1167,7 @@ object SQLConf { .doc("When true, a grouped map Pandas UDF will assign columns from the returned " + "Pandas DataFrame based on position, regardless of column label type. When false, " + "columns will be looked up by name if labeled with a string and fallback to use " + - "position if not.") + "position if not. This configuration will be deprecated in future releases.") .booleanConf .createWithDefault(false) From 4c059ebc6008b4e78cbebc87a421cb87d1b800ed Mon Sep 17 00:00:00 2001 From: Bruce Robbins Date: Tue, 26 Jun 2018 09:48:15 +0800 Subject: [PATCH 06/79] [SPARK-23776][DOC] Update instructions for running PySpark after building with SBT ## What changes were proposed in this pull request? This update tells the reader how to build Spark with SBT such that pyspark-sql tests will succeed. If you follow the current instructions for building Spark with SBT, pyspark/sql/udf.py fails with:
AnalysisException: u'Can not load class test.org.apache.spark.sql.JavaStringLength, please make sure it is on the classpath;'
## How was this patch tested? I ran the doc build command (SKIP_API=1 jekyll build) and eyeballed the result. Author: Bruce Robbins Closes #21628 from bersprockets/SPARK-23776_doc. --- docs/building-spark.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/building-spark.md b/docs/building-spark.md index 0236bb05849ad..c3bcd90ccc78f 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -215,19 +215,23 @@ If you are building Spark for use in a Python environment and you wish to pip in Alternatively, you can also run make-distribution with the --pip option. -## PySpark Tests with Maven +## PySpark Tests with Maven or SBT If you are building PySpark and wish to run the PySpark tests you will need to build Spark with Hive support. ./build/mvn -DskipTests clean package -Phive ./python/run-tests +If you are building PySpark with SBT and wish to run the PySpark tests, you will need to build Spark with Hive support and also build the test components: + + ./build/sbt -Phive clean package + ./build/sbt test:compile + ./python/run-tests + The run-tests script also can be limited to a specific Python version or a specific module ./python/run-tests --python-executables=python --modules=pyspark-sql -**Note:** You can also run Python tests with an sbt build, provided you build Spark with Hive support. - ## Running R Tests To run the SparkR tests you will need to install the [knitr](https://cran.r-project.org/package=knitr), [rmarkdown](https://cran.r-project.org/package=rmarkdown), [testthat](https://cran.r-project.org/package=testthat), [e1071](https://cran.r-project.org/package=e1071) and [survival](https://cran.r-project.org/package=survival) packages first: From c7967c6049327a03b63ea7a3b0001a97d31e309d Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Tue, 26 Jun 2018 09:48:52 +0800 Subject: [PATCH 07/79] [SPARK-24418][BUILD] Upgrade Scala to 2.11.12 and 2.12.6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? Scala is upgraded to `2.11.12` and `2.12.6`. We used `loadFIles()` in `ILoop` as a hook to initialize the Spark before REPL sees any files in Scala `2.11.8`. However, it was a hack, and it was not intended to be a public API, so it was removed in Scala `2.11.12`. From the discussion in Scala community, https://github.com/scala/bug/issues/10913 , we can use `initializeSynchronous` to initialize Spark instead. This PR implements the Spark initialization there. However, in Scala `2.11.12`'s `ILoop.scala`, in function `def startup()`, the first thing it calls is `printWelcome()`. As a result, Scala will call `printWelcome()` and `splash` before calling `initializeSynchronous`. Thus, the Spark shell will allow users to type commends first, and then show the Spark UI URL. It's working, but it will change the Spark Shell interface as the following. ```scala ➜ apache-spark git:(scala-2.11.12) ✗ ./bin/spark-shell Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). Welcome to ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ /___/ .__/\_,_/_/ /_/\_\ version 2.4.0-SNAPSHOT /_/ Using Scala version 2.11.12 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_161) Type in expressions to have them evaluated. Type :help for more information. scala> Spark context Web UI available at http://192.168.1.169:4040 Spark context available as 'sc' (master = local[*], app id = local-1528180279528). 
Spark session available as 'spark'. scala> ``` It seems there is no easy way to inject the Spark initialization code in the proper place as Scala doesn't provide a hook. Maybe som-snytt can comment on this. The following command is used to update the dep files. ```scala ./dev/test-dependencies.sh --replace-manifest ``` ## How was this patch tested? Existing tests Author: DB Tsai Closes #21495 from dbtsai/scala-2.11.12. --- LICENSE | 12 +++++----- dev/deps/spark-deps-hadoop-2.6 | 10 ++++---- dev/deps/spark-deps-hadoop-2.7 | 10 ++++---- dev/deps/spark-deps-hadoop-3.1 | 10 ++++---- pom.xml | 8 +++---- .../org/apache/spark/repl/SparkILoop.scala | 24 +++++++------------ .../spark/repl/SparkILoopInterpreter.scala | 18 ++++++++++++-- 7 files changed, 50 insertions(+), 42 deletions(-) diff --git a/LICENSE b/LICENSE index cc1f580207a75..6f5d9452e800d 100644 --- a/LICENSE +++ b/LICENSE @@ -243,18 +243,18 @@ The text of each license is also included at licenses/LICENSE-[project].txt. (BSD licence) ANTLR ST4 4.0.4 (org.antlr:ST4:4.0.4 - http://www.stringtemplate.org) (BSD licence) ANTLR StringTemplate (org.antlr:stringtemplate:3.2.1 - http://www.stringtemplate.org) (BSD License) Javolution (javolution:javolution:5.5.1 - http://javolution.org) - (BSD) JLine (jline:jline:0.9.94 - http://jline.sourceforge.net) + (BSD) JLine (jline:jline:2.14.3 - https://github.com/jline/jline2) (BSD) ParaNamer Core (com.thoughtworks.paranamer:paranamer:2.3 - http://paranamer.codehaus.org/paranamer) (BSD) ParaNamer Core (com.thoughtworks.paranamer:paranamer:2.6 - http://paranamer.codehaus.org/paranamer) (BSD 3 Clause) Scala (http://www.scala-lang.org/download/#License) (Interpreter classes (all .scala files in repl/src/main/scala except for Main.Scala, SparkHelper.scala and ExecutorClassLoader.scala), and for SerializableMapWrapper in JavaUtils.scala) - (BSD-like) Scala Actors library (org.scala-lang:scala-actors:2.11.8 - http://www.scala-lang.org/) - (BSD-like) Scala Compiler (org.scala-lang:scala-compiler:2.11.8 - http://www.scala-lang.org/) - (BSD-like) Scala Compiler (org.scala-lang:scala-reflect:2.11.8 - http://www.scala-lang.org/) - (BSD-like) Scala Library (org.scala-lang:scala-library:2.11.8 - http://www.scala-lang.org/) - (BSD-like) Scalap (org.scala-lang:scalap:2.11.8 - http://www.scala-lang.org/) + (BSD-like) Scala Actors library (org.scala-lang:scala-actors:2.11.12 - http://www.scala-lang.org/) + (BSD-like) Scala Compiler (org.scala-lang:scala-compiler:2.11.12 - http://www.scala-lang.org/) + (BSD-like) Scala Compiler (org.scala-lang:scala-reflect:2.11.12 - http://www.scala-lang.org/) + (BSD-like) Scala Library (org.scala-lang:scala-library:2.11.12 - http://www.scala-lang.org/) + (BSD-like) Scalap (org.scala-lang:scalap:2.11.12 - http://www.scala-lang.org/) (BSD-style) scalacheck (org.scalacheck:scalacheck_2.11:1.10.0 - http://www.scalacheck.org) (BSD-style) spire (org.spire-math:spire_2.11:0.7.1 - http://spire-math.org) (BSD-style) spire-macros (org.spire-math:spire-macros_2.11:0.7.1 - http://spire-math.org) diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 723180a14febb..96e9c27210d05 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -122,7 +122,7 @@ jersey-server-2.22.2.jar jets3t-0.9.4.jar jetty-6.1.26.jar jetty-util-6.1.26.jar -jline-2.12.1.jar +jline-2.14.3.jar joda-time-2.9.3.jar jodd-core-3.5.2.jar jpam-1.1.jar @@ -172,10 +172,10 @@ parquet-jackson-1.10.0.jar protobuf-java-2.5.0.jar py4j-0.10.7.jar pyrolite-4.13.jar 
-scala-compiler-2.11.8.jar -scala-library-2.11.8.jar -scala-parser-combinators_2.11-1.0.4.jar -scala-reflect-2.11.8.jar +scala-compiler-2.11.12.jar +scala-library-2.11.12.jar +scala-parser-combinators_2.11-1.1.0.jar +scala-reflect-2.11.12.jar scala-xml_2.11-1.0.5.jar shapeless_2.11-2.3.2.jar slf4j-api-1.7.16.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index ea08a001a1c9b..4a6ee027ec355 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -122,7 +122,7 @@ jersey-server-2.22.2.jar jets3t-0.9.4.jar jetty-6.1.26.jar jetty-util-6.1.26.jar -jline-2.12.1.jar +jline-2.14.3.jar joda-time-2.9.3.jar jodd-core-3.5.2.jar jpam-1.1.jar @@ -173,10 +173,10 @@ parquet-jackson-1.10.0.jar protobuf-java-2.5.0.jar py4j-0.10.7.jar pyrolite-4.13.jar -scala-compiler-2.11.8.jar -scala-library-2.11.8.jar -scala-parser-combinators_2.11-1.0.4.jar -scala-reflect-2.11.8.jar +scala-compiler-2.11.12.jar +scala-library-2.11.12.jar +scala-parser-combinators_2.11-1.1.0.jar +scala-reflect-2.11.12.jar scala-xml_2.11-1.0.5.jar shapeless_2.11-2.3.2.jar slf4j-api-1.7.16.jar diff --git a/dev/deps/spark-deps-hadoop-3.1 b/dev/deps/spark-deps-hadoop-3.1 index da874026d7d10..e0b560c8ec71f 100644 --- a/dev/deps/spark-deps-hadoop-3.1 +++ b/dev/deps/spark-deps-hadoop-3.1 @@ -122,7 +122,7 @@ jersey-server-2.22.2.jar jets3t-0.9.4.jar jetty-webapp-9.3.20.v20170531.jar jetty-xml-9.3.20.v20170531.jar -jline-2.12.1.jar +jline-2.14.3.jar joda-time-2.9.3.jar jodd-core-3.5.2.jar jpam-1.1.jar @@ -192,10 +192,10 @@ protobuf-java-2.5.0.jar py4j-0.10.7.jar pyrolite-4.13.jar re2j-1.1.jar -scala-compiler-2.11.8.jar -scala-library-2.11.8.jar -scala-parser-combinators_2.11-1.0.4.jar -scala-reflect-2.11.8.jar +scala-compiler-2.11.12.jar +scala-library-2.11.12.jar +scala-parser-combinators_2.11-1.1.0.jar +scala-reflect-2.11.12.jar scala-xml_2.11-1.0.5.jar shapeless_2.11-2.3.2.jar slf4j-api-1.7.16.jar diff --git a/pom.xml b/pom.xml index 4b4e6c13ea8fd..90e64ff71d229 100644 --- a/pom.xml +++ b/pom.xml @@ -155,7 +155,7 @@ 3.4.1 3.2.2 - 2.11.8 + 2.11.12 2.11 1.9.13 2.6.7 @@ -740,13 +740,13 @@ org.scala-lang.modules scala-parser-combinators_${scala.binary.version} - 1.0.4 + 1.1.0 jline jline - 2.12.1 + 2.14.3 org.scalatest @@ -2755,7 +2755,7 @@ scala-2.12 - 2.12.4 + 2.12.6 2.12 diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala index e69441a475e9a..a44051b351e19 100644 --- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala +++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala @@ -36,7 +36,7 @@ class SparkILoop(in0: Option[BufferedReader], out: JPrintWriter) def this() = this(None, new JPrintWriter(Console.out, true)) override def createInterpreter(): Unit = { - intp = new SparkILoopInterpreter(settings, out) + intp = new SparkILoopInterpreter(settings, out, initializeSpark) } val initializationCommands: Seq[String] = Seq( @@ -73,11 +73,15 @@ class SparkILoop(in0: Option[BufferedReader], out: JPrintWriter) "import org.apache.spark.sql.functions._" ) - def initializeSpark() { - intp.beQuietDuring { - savingReplayStack { // remove the commands from session history. - initializationCommands.foreach(processLine) + def initializeSpark(): Unit = { + if (!intp.reporter.hasErrors) { + // `savingReplayStack` removes the commands from session history. 
+ savingReplayStack { + initializationCommands.foreach(intp quietRun _) } + } else { + throw new RuntimeException(s"Scala $versionString interpreter encountered " + + "errors during initialization") } } @@ -101,16 +105,6 @@ class SparkILoop(in0: Option[BufferedReader], out: JPrintWriter) /** Available commands */ override def commands: List[LoopCommand] = standardCommands - /** - * We override `loadFiles` because we need to initialize Spark *before* the REPL - * sees any files, so that the Spark context is visible in those files. This is a bit of a - * hack, but there isn't another hook available to us at this point. - */ - override def loadFiles(settings: Settings): Unit = { - initializeSpark() - super.loadFiles(settings) - } - override def resetCommand(line: String): Unit = { super.resetCommand(line) initializeSpark() diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoopInterpreter.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoopInterpreter.scala index e736607a9a6b9..4e63816402a10 100644 --- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoopInterpreter.scala +++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoopInterpreter.scala @@ -21,8 +21,22 @@ import scala.collection.mutable import scala.tools.nsc.Settings import scala.tools.nsc.interpreter._ -class SparkILoopInterpreter(settings: Settings, out: JPrintWriter) extends IMain(settings, out) { - self => +class SparkILoopInterpreter(settings: Settings, out: JPrintWriter, initializeSpark: () => Unit) + extends IMain(settings, out) { self => + + /** + * We override `initializeSynchronous` to initialize Spark *after* `intp` is properly initialized + * and *before* the REPL sees any files in the private `loadInitFiles` functions, so that + * the Spark context is visible in those files. + * + * This is a bit of a hack, but there isn't another hook available to us at this point. + * + * See the discussion in Scala community https://github.com/scala/bug/issues/10913 for detail. + */ + override def initializeSynchronous(): Unit = { + super.initializeSynchronous() + initializeSpark() + } override lazy val memberHandlers = new { val intp: self.type = self From e07aee2165af4d301ae12005a6d9ffb030bc2650 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Tue, 26 Jun 2018 09:51:55 +0800 Subject: [PATCH 08/79] [SPARK-24636][SQL] Type coercion of arrays for array_join function ## What changes were proposed in this pull request? Presto's implementation accepts arbitrary arrays of primitive types as an input: ``` presto> SELECT array_join(ARRAY [1, 2, 3], ', '); _col0 --------- 1, 2, 3 (1 row) ``` This PR proposes to implement a type coercion rule for ```array_join``` function that converts arrays of primitive as well as non-primitive types to arrays of string. ## How was this patch tested? New test cases add into: - sql-tests/inputs/typeCoercion/native/arrayJoin.sql - DataFrameFunctionsSuite.scala Author: Marek Novotny Closes #21620 from mn-mikke/SPARK-24636. 
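As a usage sketch of the coercion this patch adds (mirroring the new arrayJoin.sql and DataFrame tests below), a non-string array can be passed straight to `array_join`; this assumes an existing local `SparkSession` named `spark` with the patch applied:

```scala
// Hedged sketch, assuming a SparkSession `spark` is already available:
// the integer array is implicitly cast to array<string> by the new rule.
import org.apache.spark.sql.functions._

val joined = spark.range(1)
  .select(array_join(array(lit(1), lit(2), lit(3)), ", ") as "joined")
joined.show(false) // single row containing "1, 2, 3"

// The SQL form exercised by the new arrayJoin.sql tests behaves the same way:
spark.sql("SELECT array_join(array(2, 1), ', ')").show(false) // "2, 1"
```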
--- .../sql/catalyst/analysis/TypeCoercion.scala | 8 ++ .../expressions/collectionOperations.scala | 1 + .../inputs/typeCoercion/native/arrayJoin.sql | 11 +++ .../typeCoercion/native/arrayJoin.sql.out | 90 +++++++++++++++++++ .../spark/sql/DataFrameFunctionsSuite.scala | 17 ++++ 5 files changed, 127 insertions(+) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/arrayJoin.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/arrayJoin.sql.out diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index b2817b0538a7f..637923928a7da 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -536,6 +536,14 @@ object TypeCoercion { case None => c } + case aj @ ArrayJoin(arr, d, nr) if !ArrayType(StringType).acceptsType(arr.dataType) && + ArrayType.acceptsType(arr.dataType) => + val containsNull = arr.dataType.asInstanceOf[ArrayType].containsNull + ImplicitTypeCasts.implicitCast(arr, ArrayType(StringType, containsNull)) match { + case Some(castedArr) => ArrayJoin(castedArr, d, nr) + case None => aj + } + case m @ CreateMap(children) if m.keys.length == m.values.length && (!haveSameType(m.keys) || !haveSameType(m.values)) => val newKeys = if (haveSameType(m.keys)) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index b6137b07555f4..58612f65c1a53 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -1621,6 +1621,7 @@ case class ArrayJoin( override def dataType: DataType = StringType + override def prettyName: String = "array_join" } /** diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/arrayJoin.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/arrayJoin.sql new file mode 100644 index 0000000000000..99729c007b104 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/arrayJoin.sql @@ -0,0 +1,11 @@ +SELECT array_join(array(true, false), ', '); +SELECT array_join(array(2Y, 1Y), ', '); +SELECT array_join(array(2S, 1S), ', '); +SELECT array_join(array(2, 1), ', '); +SELECT array_join(array(2L, 1L), ', '); +SELECT array_join(array(9223372036854775809, 9223372036854775808), ', '); +SELECT array_join(array(2.0D, 1.0D), ', '); +SELECT array_join(array(float(2.0), float(1.0)), ', '); +SELECT array_join(array(date '2016-03-14', date '2016-03-13'), ', '); +SELECT array_join(array(timestamp '2016-11-15 20:54:00.000', timestamp '2016-11-12 20:54:00.000'), ', '); +SELECT array_join(array('a', 'b'), ', '); diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/arrayJoin.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/arrayJoin.sql.out new file mode 100644 index 0000000000000..b23a62dacef7c --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/arrayJoin.sql.out @@ -0,0 +1,90 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 11 + + +-- !query 0 +SELECT array_join(array(true, 
false), ', ') +-- !query 0 schema +struct +-- !query 0 output +true, false + + +-- !query 1 +SELECT array_join(array(2Y, 1Y), ', ') +-- !query 1 schema +struct +-- !query 1 output +2, 1 + + +-- !query 2 +SELECT array_join(array(2S, 1S), ', ') +-- !query 2 schema +struct +-- !query 2 output +2, 1 + + +-- !query 3 +SELECT array_join(array(2, 1), ', ') +-- !query 3 schema +struct +-- !query 3 output +2, 1 + + +-- !query 4 +SELECT array_join(array(2L, 1L), ', ') +-- !query 4 schema +struct +-- !query 4 output +2, 1 + + +-- !query 5 +SELECT array_join(array(9223372036854775809, 9223372036854775808), ', ') +-- !query 5 schema +struct +-- !query 5 output +9223372036854775809, 9223372036854775808 + + +-- !query 6 +SELECT array_join(array(2.0D, 1.0D), ', ') +-- !query 6 schema +struct +-- !query 6 output +2.0, 1.0 + + +-- !query 7 +SELECT array_join(array(float(2.0), float(1.0)), ', ') +-- !query 7 schema +struct +-- !query 7 output +2.0, 1.0 + + +-- !query 8 +SELECT array_join(array(date '2016-03-14', date '2016-03-13'), ', ') +-- !query 8 schema +struct +-- !query 8 output +2016-03-14, 2016-03-13 + + +-- !query 9 +SELECT array_join(array(timestamp '2016-11-15 20:54:00.000', timestamp '2016-11-12 20:54:00.000'), ', ') +-- !query 9 schema +struct +-- !query 9 output +2016-11-15 20:54:00, 2016-11-12 20:54:00 + + +-- !query 10 +SELECT array_join(array('a', 'b'), ', ') +-- !query 10 schema +struct +-- !query 10 output +a, b diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 47fe67d8daea3..5d6a6c0832c96 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -805,6 +805,23 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { checkAnswer( df.selectExpr("array_join(x, delimiter, 'NULL')"), Seq(Row("a,b"), Row("a,NULL,b"), Row(""))) + + val idf = Seq(Seq(1, 2, 3)).toDF("x") + + checkAnswer( + idf.select(array_join(idf("x"), ", ")), + Seq(Row("1, 2, 3")) + ) + checkAnswer( + idf.selectExpr("array_join(x, ', ')"), + Seq(Row("1, 2, 3")) + ) + intercept[AnalysisException] { + idf.selectExpr("array_join(x, 1)") + } + intercept[AnalysisException] { + idf.selectExpr("array_join(x, ', ', 1)") + } } test("array_min function") { From dcaa49ff1edd7fcf0f000c6f93ae0e30bd5b6464 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Tue, 26 Jun 2018 14:33:04 -0700 Subject: [PATCH 09/79] [SPARK-24658][SQL] Remove workaround for ANTLR bug ## What changes were proposed in this pull request? Issue antlr/antlr4#781 has already been fixed, so the workaround of extracting the pattern into a separate rule is no longer needed. The presto already removed it: https://github.com/prestodb/presto/pull/10744. ## How was this patch tested? Existing tests Author: Yuming Wang Closes #21641 from wangyum/ANTLR-780. 
--- .../org/apache/spark/sql/catalyst/parser/SqlBase.g4 | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 3fe00eefde7d8..dc95751bf905c 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -539,18 +539,11 @@ expression booleanExpression : NOT booleanExpression #logicalNot | EXISTS '(' query ')' #exists - | predicated #booleanDefault + | valueExpression predicate? #predicated | left=booleanExpression operator=AND right=booleanExpression #logicalBinary | left=booleanExpression operator=OR right=booleanExpression #logicalBinary ; -// workaround for: -// https://github.com/antlr/antlr4/issues/780 -// https://github.com/antlr/antlr4/issues/781 -predicated - : valueExpression predicate? - ; - predicate : NOT? kind=BETWEEN lower=valueExpression AND upper=valueExpression | NOT? kind=IN '(' expression (',' expression)* ')' From 02f8781fa2649cf1d3a5cb932e1c8408790974ff Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Tue, 26 Jun 2018 15:17:00 -0700 Subject: [PATCH 10/79] [SPARK-24423][SQL] Add a new option for JDBC sources ## What changes were proposed in this pull request? Here is the description in the JIRA - Currently, our JDBC connector provides the option `dbtable` for users to specify the to-be-loaded JDBC source table. ```SQL val jdbcDf = spark.read .format("jdbc") .option("dbtable", "dbName.tableName") .options(jdbcCredentials: Map) .load() ``` Normally, users do not fetch the whole JDBC table due to the poor performance/throughput of JDBC. Thus, they normally just fetch a small set of tables. For advanced users, they can pass a subquery as the option. ```SQL val query = """ (select * from tableName limit 10) as tmp """ val jdbcDf = spark.read .format("jdbc") .option("dbtable", query) .options(jdbcCredentials: Map) .load() ``` However, this is straightforward to end users. We should simply allow users to specify the query by a new option `query`. We will handle the complexity for them. ```SQL val query = """select * from tableName limit 10""" val jdbcDf = spark.read .format("jdbc") .option("query", query) .options(jdbcCredentials: Map) .load() ``` ## How was this patch tested? Added tests in JDBCSuite and JDBCWriterSuite. Also tested against MySQL, Postgress, Oracle, DB2 (using docker infrastructure) to make sure there are no syntax issues. Author: Dilip Biswal Closes #21590 from dilipbiswal/SPARK-24423. --- docs/sql-programming-guide.md | 30 +++++- .../datasources/jdbc/JDBCOptions.scala | 66 ++++++++++++- .../execution/datasources/jdbc/JDBCRDD.scala | 4 +- .../datasources/jdbc/JDBCRelation.scala | 4 +- .../jdbc/JdbcRelationProvider.scala | 5 +- .../datasources/jdbc/JdbcUtils.scala | 10 +- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 94 ++++++++++++++++++- .../spark/sql/jdbc/JDBCWriteSuite.scala | 14 ++- 8 files changed, 204 insertions(+), 23 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 196b814420be1..7c4ef41cc8907 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1302,9 +1302,33 @@ the following case-insensitive options: dbtable - The JDBC table that should be read. Note that anything that is valid in a FROM clause of - a SQL query can be used. 
For example, instead of a full table you could also use a - subquery in parentheses. + The JDBC table that should be read from or written into. Note that when using it in the read + path anything that is valid in a FROM clause of a SQL query can be used. + For example, instead of a full table you could also use a subquery in parentheses. It is not + allowed to specify `dbtable` and `query` options at the same time. + + query + + A query that will be used to read data into Spark. The specified query will be parenthesized and used + as a subquery in the FROM clause. Spark will also assign an alias to the subquery clause. + As an example, spark will issue a query of the following form to the JDBC Source:
+ SELECT <columns> FROM (<user_specified_query>) spark_gen_alias
+ Below are a couple of restrictions while using this option:
+ 1. It is not allowed to specify `dbtable` and `query` options at the same time.
+ 2. It is not allowed to specify `query` and `partitionColumn` options at the same time. When specifying + the `partitionColumn` option is required, the subquery can be specified using the `dbtable` option instead and + partition columns can be qualified using the subquery alias provided as part of `dbtable`.
+ Example:
+ spark.read.format("jdbc")
+    .option("dbtable", "(select c1, c2 from t1) as subq")
+    .option("partitionColumn", "subq.c1")
+    .load()
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala index a73a97c06fe5a..eea966d30948b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.types.StructType * Options for the JDBC data source. */ class JDBCOptions( - @transient private val parameters: CaseInsensitiveMap[String]) + @transient val parameters: CaseInsensitiveMap[String]) extends Serializable { import JDBCOptions._ @@ -65,11 +65,31 @@ class JDBCOptions( // Required parameters // ------------------------------------------------------------ require(parameters.isDefinedAt(JDBC_URL), s"Option '$JDBC_URL' is required.") - require(parameters.isDefinedAt(JDBC_TABLE_NAME), s"Option '$JDBC_TABLE_NAME' is required.") // a JDBC URL val url = parameters(JDBC_URL) - // name of table - val table = parameters(JDBC_TABLE_NAME) + // table name or a table subquery. + val tableOrQuery = (parameters.get(JDBC_TABLE_NAME), parameters.get(JDBC_QUERY_STRING)) match { + case (Some(name), Some(subquery)) => + throw new IllegalArgumentException( + s"Both '$JDBC_TABLE_NAME' and '$JDBC_QUERY_STRING' can not be specified at the same time." + ) + case (None, None) => + throw new IllegalArgumentException( + s"Option '$JDBC_TABLE_NAME' or '$JDBC_QUERY_STRING' is required." + ) + case (Some(name), None) => + if (name.isEmpty) { + throw new IllegalArgumentException(s"Option '$JDBC_TABLE_NAME' can not be empty.") + } else { + name.trim + } + case (None, Some(subquery)) => + if (subquery.isEmpty) { + throw new IllegalArgumentException(s"Option `$JDBC_QUERY_STRING` can not be empty.") + } else { + s"(${subquery}) __SPARK_GEN_JDBC_SUBQUERY_NAME_${curId.getAndIncrement()}" + } + } // ------------------------------------------------------------ // Optional parameters @@ -109,6 +129,20 @@ class JDBCOptions( s"When reading JDBC data sources, users need to specify all or none for the following " + s"options: '$JDBC_PARTITION_COLUMN', '$JDBC_LOWER_BOUND', '$JDBC_UPPER_BOUND', " + s"and '$JDBC_NUM_PARTITIONS'") + + require(!(parameters.get(JDBC_QUERY_STRING).isDefined && partitionColumn.isDefined), + s""" + |Options '$JDBC_QUERY_STRING' and '$JDBC_PARTITION_COLUMN' can not be specified together. + |Please define the query using `$JDBC_TABLE_NAME` option instead and make sure to qualify + |the partition columns using the supplied subquery alias to resolve any ambiguity. 
+ |Example : + |spark.read.format("jdbc") + | .option("dbtable", "(select c1, c2 from t1) as subq") + | .option("partitionColumn", "subq.c1" + | .load() + """.stripMargin + ) + val fetchSize = { val size = parameters.getOrElse(JDBC_BATCH_FETCH_SIZE, "0").toInt require(size >= 0, @@ -149,7 +183,30 @@ class JDBCOptions( val sessionInitStatement = parameters.get(JDBC_SESSION_INIT_STATEMENT) } +class JdbcOptionsInWrite( + @transient override val parameters: CaseInsensitiveMap[String]) + extends JDBCOptions(parameters) { + + import JDBCOptions._ + + def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters)) + + def this(url: String, table: String, parameters: Map[String, String]) = { + this(CaseInsensitiveMap(parameters ++ Map( + JDBCOptions.JDBC_URL -> url, + JDBCOptions.JDBC_TABLE_NAME -> table))) + } + + require( + parameters.get(JDBC_TABLE_NAME).isDefined, + s"Option '$JDBC_TABLE_NAME' is required. " + + s"Option '$JDBC_QUERY_STRING' is not applicable while writing.") + + val table = parameters(JDBC_TABLE_NAME) +} + object JDBCOptions { + private val curId = new java.util.concurrent.atomic.AtomicLong(0L) private val jdbcOptionNames = collection.mutable.Set[String]() private def newOption(name: String): String = { @@ -159,6 +216,7 @@ object JDBCOptions { val JDBC_URL = newOption("url") val JDBC_TABLE_NAME = newOption("dbtable") + val JDBC_QUERY_STRING = newOption("query") val JDBC_DRIVER_CLASS = newOption("driver") val JDBC_PARTITION_COLUMN = newOption("partitionColumn") val JDBC_LOWER_BOUND = newOption("lowerBound") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala index 0bab3689e5d0e..1b3b17c75e756 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala @@ -51,7 +51,7 @@ object JDBCRDD extends Logging { */ def resolveTable(options: JDBCOptions): StructType = { val url = options.url - val table = options.table + val table = options.tableOrQuery val dialect = JdbcDialects.get(url) val conn: Connection = JdbcUtils.createConnectionFactory(options)() try { @@ -296,7 +296,7 @@ private[jdbc] class JDBCRDD( val myWhereClause = getWhereClause(part) - val sqlText = s"SELECT $columnList FROM ${options.table} $myWhereClause" + val sqlText = s"SELECT $columnList FROM ${options.tableOrQuery} $myWhereClause" stmt = conn.prepareStatement(sqlText, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY) stmt.setFetchSize(options.fetchSize) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala index b84543ccd7869..97e2d255cb7be 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala @@ -189,12 +189,12 @@ private[sql] case class JDBCRelation( override def insert(data: DataFrame, overwrite: Boolean): Unit = { data.write .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append) - .jdbc(jdbcOptions.url, jdbcOptions.table, jdbcOptions.asProperties) + .jdbc(jdbcOptions.url, jdbcOptions.tableOrQuery, jdbcOptions.asProperties) } override def toString: String = { val partitioningInfo = if (parts.nonEmpty) s" [numPartitions=${parts.length}]" 
else "" // credentials should not be included in the plan output, table information is sufficient. - s"JDBCRelation(${jdbcOptions.table})" + partitioningInfo + s"JDBCRelation(${jdbcOptions.tableOrQuery})" + partitioningInfo } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcRelationProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcRelationProvider.scala index 2b488bb7121dc..782d626c1573c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcRelationProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcRelationProvider.scala @@ -59,7 +59,7 @@ class JdbcRelationProvider extends CreatableRelationProvider mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { - val options = new JDBCOptions(parameters) + val options = new JdbcOptionsInWrite(parameters) val isCaseSensitive = sqlContext.conf.caseSensitiveAnalysis val conn = JdbcUtils.createConnectionFactory(options)() @@ -86,7 +86,8 @@ class JdbcRelationProvider extends CreatableRelationProvider case SaveMode.ErrorIfExists => throw new AnalysisException( - s"Table or view '${options.table}' already exists. SaveMode: ErrorIfExists.") + s"Table or view '${options.table}' already exists. " + + s"SaveMode: ErrorIfExists.") case SaveMode.Ignore => // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index 433443007cfd8..b81737eda475b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -67,7 +67,7 @@ object JdbcUtils extends Logging { /** * Returns true if the table already exists in the JDBC database. */ - def tableExists(conn: Connection, options: JDBCOptions): Boolean = { + def tableExists(conn: Connection, options: JdbcOptionsInWrite): Boolean = { val dialect = JdbcDialects.get(options.url) // Somewhat hacky, but there isn't a good way to identify whether a table exists for all @@ -100,7 +100,7 @@ object JdbcUtils extends Logging { /** * Truncates a table from the JDBC database without side effects. 
*/ - def truncateTable(conn: Connection, options: JDBCOptions): Unit = { + def truncateTable(conn: Connection, options: JdbcOptionsInWrite): Unit = { val dialect = JdbcDialects.get(options.url) val statement = conn.createStatement try { @@ -255,7 +255,7 @@ object JdbcUtils extends Logging { val dialect = JdbcDialects.get(options.url) try { - val statement = conn.prepareStatement(dialect.getSchemaQuery(options.table)) + val statement = conn.prepareStatement(dialect.getSchemaQuery(options.tableOrQuery)) try { statement.setQueryTimeout(options.queryTimeout) Some(getSchema(statement.executeQuery(), dialect)) @@ -809,7 +809,7 @@ object JdbcUtils extends Logging { df: DataFrame, tableSchema: Option[StructType], isCaseSensitive: Boolean, - options: JDBCOptions): Unit = { + options: JdbcOptionsInWrite): Unit = { val url = options.url val table = options.table val dialect = JdbcDialects.get(url) @@ -838,7 +838,7 @@ object JdbcUtils extends Logging { def createTable( conn: Connection, df: DataFrame, - options: JDBCOptions): Unit = { + options: JdbcOptionsInWrite): Unit = { val strSchema = schemaString( df, options.url, options.createTableColumnTypes) val table = options.table diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index 6ea61f02a8206..0389273d6cdfa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -25,7 +25,7 @@ import org.h2.jdbc.JdbcSQLException import org.scalatest.{BeforeAndAfter, PrivateMethodTester} import org.apache.spark.{SparkException, SparkFunSuite} -import org.apache.spark.sql.{AnalysisException, DataFrame, Row} +import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.execution.DataSourceScanExec @@ -39,7 +39,7 @@ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.util.Utils -class JDBCSuite extends SparkFunSuite +class JDBCSuite extends QueryTest with BeforeAndAfter with PrivateMethodTester with SharedSQLContext { import testImplicits._ @@ -1099,7 +1099,7 @@ class JDBCSuite extends SparkFunSuite test("SPARK-19318: Connection properties keys should be case-sensitive.") { def testJdbcOptions(options: JDBCOptions): Unit = { // Spark JDBC data source options are case-insensitive - assert(options.table == "t1") + assert(options.tableOrQuery == "t1") // When we convert it to properties, it should be case-sensitive. 
assert(options.asProperties.size == 3) assert(options.asProperties.get("customkey") == null) @@ -1255,4 +1255,92 @@ class JDBCSuite extends SparkFunSuite testIncorrectJdbcPartitionColumn(testH2Dialect.quoteIdentifier("ThEiD")) } } + + test("query JDBC option - negative tests") { + val query = "SELECT * FROM test.people WHERE theid = 1" + // load path + val e1 = intercept[RuntimeException] { + val df = spark.read.format("jdbc") + .option("Url", urlWithUserAndPass) + .option("query", query) + .option("dbtable", "test.people") + .load() + }.getMessage + assert(e1.contains("Both 'dbtable' and 'query' can not be specified at the same time.")) + + // jdbc api path + val properties = new Properties() + properties.setProperty(JDBCOptions.JDBC_QUERY_STRING, query) + val e2 = intercept[RuntimeException] { + spark.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", properties).collect() + }.getMessage + assert(e2.contains("Both 'dbtable' and 'query' can not be specified at the same time.")) + + val e3 = intercept[RuntimeException] { + sql( + s""" + |CREATE OR REPLACE TEMPORARY VIEW queryOption + |USING org.apache.spark.sql.jdbc + |OPTIONS (url '$url', query '$query', dbtable 'TEST.PEOPLE', + | user 'testUser', password 'testPass') + """.stripMargin.replaceAll("\n", " ")) + }.getMessage + assert(e3.contains("Both 'dbtable' and 'query' can not be specified at the same time.")) + + val e4 = intercept[RuntimeException] { + val df = spark.read.format("jdbc") + .option("Url", urlWithUserAndPass) + .option("query", "") + .load() + }.getMessage + assert(e4.contains("Option `query` can not be empty.")) + + // Option query and partitioncolumn are not allowed together. + val expectedErrorMsg = + s""" + |Options 'query' and 'partitionColumn' can not be specified together. + |Please define the query using `dbtable` option instead and make sure to qualify + |the partition columns using the supplied subquery alias to resolve any ambiguity. + |Example : + |spark.read.format("jdbc") + | .option("dbtable", "(select c1, c2 from t1) as subq") + | .option("partitionColumn", "subq.c1" + | .load() + """.stripMargin + val e5 = intercept[RuntimeException] { + sql( + s""" + |CREATE OR REPLACE TEMPORARY VIEW queryOption + |USING org.apache.spark.sql.jdbc + |OPTIONS (url '$url', query '$query', user 'testUser', password 'testPass', + | partitionColumn 'THEID', lowerBound '1', upperBound '4', numPartitions '3') + """.stripMargin.replaceAll("\n", " ")) + }.getMessage + assert(e5.contains(expectedErrorMsg)) + } + + test("query JDBC option") { + val query = "SELECT name, theid FROM test.people WHERE theid = 1" + // query option to pass on the query string. + val df = spark.read.format("jdbc") + .option("Url", urlWithUserAndPass) + .option("query", query) + .load() + checkAnswer( + df, + Row("fred", 1) :: Nil) + + // query option in the create table path. 
+ sql( + s""" + |CREATE OR REPLACE TEMPORARY VIEW queryOption + |USING org.apache.spark.sql.jdbc + |OPTIONS (url '$url', query '$query', user 'testUser', password 'testPass') + """.stripMargin.replaceAll("\n", " ")) + + checkAnswer( + sql("select name, theid from queryOption"), + Row("fred", 1) :: Nil) + + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala index 1c2c92d1f0737..b751ec2de4825 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala @@ -293,13 +293,23 @@ class JDBCWriteSuite extends SharedSQLContext with BeforeAndAfter { test("save errors if dbtable is not specified") { val df = spark.createDataFrame(sparkContext.parallelize(arr2x2), schema2) - val e = intercept[RuntimeException] { + val e1 = intercept[RuntimeException] { + df.write.format("jdbc") + .option("url", url1) + .options(properties.asScala) + .save() + }.getMessage + assert(e1.contains("Option 'dbtable' or 'query' is required")) + + val e2 = intercept[RuntimeException] { df.write.format("jdbc") .option("url", url1) .options(properties.asScala) + .option("query", "select * from TEST.SAVETEST") .save() }.getMessage - assert(e.contains("Option 'dbtable' is required")) + val msg = "Option 'dbtable' is required. Option 'query' is not applicable while writing." + assert(e2.contains(msg)) } test("save errors if wrong user/password combination") { From 16f2c3ea46a330bff7fae33f2521eb36a6280f04 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Tue, 26 Jun 2018 15:56:58 -0700 Subject: [PATCH 11/79] [SPARK-6237][NETWORK] Network-layer changes to allow stream upload. These changes allow an RPCHandler to receive an upload as a stream of data, without having to buffer the entire message in the FrameDecoder. The primary use case is for replicating large blocks. By itself, this change is adding dead-code that is not being used -- it is a step towards SPARK-24296. Added unit tests for handling streaming data, including successfully sending data, and failures in reading the stream with concurrent requests. Summary of changes: * Introduce a new UploadStream RPC which is sent to push a large payload as a stream (in contrast, the pre-existing StreamRequest and StreamResponse RPCs are used for pull-based streaming). * Generalize RpcHandler.receive() to support requests which contain streams. * Generalize StreamInterceptor to handle both request and response messages (previously it only handled responses). * Introduce StdChannelListener to abstract away common logging logic in ChannelFuture listeners. Author: Imran Rashid Closes #21346 from squito/upload_stream. 
--- .../network/client/StreamCallbackWithID.java | 22 ++ .../network/client/StreamInterceptor.java | 26 +- .../spark/network/client/TransportClient.java | 175 ++++++----- .../spark/network/crypto/AuthRpcHandler.java | 9 + .../spark/network/protocol/Message.java | 3 +- .../network/protocol/MessageDecoder.java | 3 + .../network/protocol/StreamResponse.java | 2 +- .../spark/network/protocol/UploadStream.java | 107 +++++++ .../spark/network/sasl/SaslRpcHandler.java | 9 + .../spark/network/server/RpcHandler.java | 34 ++- .../server/TransportRequestHandler.java | 95 +++++- .../spark/network/RpcIntegrationSuite.java | 289 ++++++++++++++++-- .../org/apache/spark/network/StreamSuite.java | 94 ++---- .../spark/network/StreamTestHelper.java | 104 +++++++ project/MimaExcludes.scala | 3 + 15 files changed, 799 insertions(+), 176 deletions(-) create mode 100644 common/network-common/src/main/java/org/apache/spark/network/client/StreamCallbackWithID.java create mode 100644 common/network-common/src/main/java/org/apache/spark/network/protocol/UploadStream.java create mode 100644 common/network-common/src/test/java/org/apache/spark/network/StreamTestHelper.java diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/StreamCallbackWithID.java b/common/network-common/src/main/java/org/apache/spark/network/client/StreamCallbackWithID.java new file mode 100644 index 0000000000000..bd173b653e33e --- /dev/null +++ b/common/network-common/src/main/java/org/apache/spark/network/client/StreamCallbackWithID.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.network.client; + +public interface StreamCallbackWithID extends StreamCallback { + String getID(); +} diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/StreamInterceptor.java b/common/network-common/src/main/java/org/apache/spark/network/client/StreamInterceptor.java index b0e85bae7c309..f3eb744ff7345 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/client/StreamInterceptor.java +++ b/common/network-common/src/main/java/org/apache/spark/network/client/StreamInterceptor.java @@ -22,22 +22,24 @@ import io.netty.buffer.ByteBuf; +import org.apache.spark.network.protocol.Message; +import org.apache.spark.network.server.MessageHandler; import org.apache.spark.network.util.TransportFrameDecoder; /** * An interceptor that is registered with the frame decoder to feed stream data to a * callback. 
*/ -class StreamInterceptor implements TransportFrameDecoder.Interceptor { +public class StreamInterceptor implements TransportFrameDecoder.Interceptor { - private final TransportResponseHandler handler; + private final MessageHandler handler; private final String streamId; private final long byteCount; private final StreamCallback callback; private long bytesRead; - StreamInterceptor( - TransportResponseHandler handler, + public StreamInterceptor( + MessageHandler handler, String streamId, long byteCount, StreamCallback callback) { @@ -50,16 +52,24 @@ class StreamInterceptor implements TransportFrameDecoder.Interceptor { @Override public void exceptionCaught(Throwable cause) throws Exception { - handler.deactivateStream(); + deactivateStream(); callback.onFailure(streamId, cause); } @Override public void channelInactive() throws Exception { - handler.deactivateStream(); + deactivateStream(); callback.onFailure(streamId, new ClosedChannelException()); } + private void deactivateStream() { + if (handler instanceof TransportResponseHandler) { + // we only have to do this for TransportResponseHandler as it exposes numOutstandingFetches + // (there is no extra cleanup that needs to happen) + ((TransportResponseHandler) handler).deactivateStream(); + } + } + @Override public boolean handle(ByteBuf buf) throws Exception { int toRead = (int) Math.min(buf.readableBytes(), byteCount - bytesRead); @@ -72,10 +82,10 @@ public boolean handle(ByteBuf buf) throws Exception { RuntimeException re = new IllegalStateException(String.format( "Read too many bytes? Expected %d, but read %d.", byteCount, bytesRead)); callback.onFailure(streamId, re); - handler.deactivateStream(); + deactivateStream(); throw re; } else if (bytesRead == byteCount) { - handler.deactivateStream(); + deactivateStream(); callback.onComplete(streamId); } diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java index 8f354ad78bbaa..325225dc0ea2c 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java +++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java @@ -32,15 +32,15 @@ import com.google.common.base.Throwables; import com.google.common.util.concurrent.SettableFuture; import io.netty.channel.Channel; +import io.netty.util.concurrent.Future; +import io.netty.util.concurrent.GenericFutureListener; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.buffer.NioManagedBuffer; -import org.apache.spark.network.protocol.ChunkFetchRequest; -import org.apache.spark.network.protocol.OneWayMessage; -import org.apache.spark.network.protocol.RpcRequest; -import org.apache.spark.network.protocol.StreamChunkId; -import org.apache.spark.network.protocol.StreamRequest; +import org.apache.spark.network.protocol.*; + import static org.apache.spark.network.util.NettyUtils.getRemoteAddress; /** @@ -133,34 +133,21 @@ public void fetchChunk( long streamId, int chunkIndex, ChunkReceivedCallback callback) { - long startTime = System.currentTimeMillis(); if (logger.isDebugEnabled()) { logger.debug("Sending fetch chunk request {} to {}", chunkIndex, getRemoteAddress(channel)); } StreamChunkId streamChunkId = new StreamChunkId(streamId, chunkIndex); - handler.addFetchRequest(streamChunkId, callback); - - channel.writeAndFlush(new 
ChunkFetchRequest(streamChunkId)).addListener(future -> { - if (future.isSuccess()) { - long timeTaken = System.currentTimeMillis() - startTime; - if (logger.isTraceEnabled()) { - logger.trace("Sending request {} to {} took {} ms", streamChunkId, - getRemoteAddress(channel), timeTaken); - } - } else { - String errorMsg = String.format("Failed to send request %s to %s: %s", streamChunkId, - getRemoteAddress(channel), future.cause()); - logger.error(errorMsg, future.cause()); + StdChannelListener listener = new StdChannelListener(streamChunkId) { + @Override + void handleFailure(String errorMsg, Throwable cause) { handler.removeFetchRequest(streamChunkId); - channel.close(); - try { - callback.onFailure(chunkIndex, new IOException(errorMsg, future.cause())); - } catch (Exception e) { - logger.error("Uncaught exception in RPC response callback handler!", e); - } + callback.onFailure(chunkIndex, new IOException(errorMsg, cause)); } - }); + }; + handler.addFetchRequest(streamChunkId, callback); + + channel.writeAndFlush(new ChunkFetchRequest(streamChunkId)).addListener(listener); } /** @@ -170,7 +157,12 @@ public void fetchChunk( * @param callback Object to call with the stream data. */ public void stream(String streamId, StreamCallback callback) { - long startTime = System.currentTimeMillis(); + StdChannelListener listener = new StdChannelListener(streamId) { + @Override + void handleFailure(String errorMsg, Throwable cause) throws Exception { + callback.onFailure(streamId, new IOException(errorMsg, cause)); + } + }; if (logger.isDebugEnabled()) { logger.debug("Sending stream request for {} to {}", streamId, getRemoteAddress(channel)); } @@ -180,25 +172,7 @@ public void stream(String streamId, StreamCallback callback) { // when responses arrive. synchronized (this) { handler.addStreamCallback(streamId, callback); - channel.writeAndFlush(new StreamRequest(streamId)).addListener(future -> { - if (future.isSuccess()) { - long timeTaken = System.currentTimeMillis() - startTime; - if (logger.isTraceEnabled()) { - logger.trace("Sending request for {} to {} took {} ms", streamId, - getRemoteAddress(channel), timeTaken); - } - } else { - String errorMsg = String.format("Failed to send request for %s to %s: %s", streamId, - getRemoteAddress(channel), future.cause()); - logger.error(errorMsg, future.cause()); - channel.close(); - try { - callback.onFailure(streamId, new IOException(errorMsg, future.cause())); - } catch (Exception e) { - logger.error("Uncaught exception in RPC response callback handler!", e); - } - } - }); + channel.writeAndFlush(new StreamRequest(streamId)).addListener(listener); } } @@ -211,35 +185,44 @@ public void stream(String streamId, StreamCallback callback) { * @return The RPC's id. 
*/ public long sendRpc(ByteBuffer message, RpcResponseCallback callback) { - long startTime = System.currentTimeMillis(); if (logger.isTraceEnabled()) { logger.trace("Sending RPC to {}", getRemoteAddress(channel)); } - long requestId = Math.abs(UUID.randomUUID().getLeastSignificantBits()); + long requestId = requestId(); handler.addRpcRequest(requestId, callback); + RpcChannelListener listener = new RpcChannelListener(requestId, callback); channel.writeAndFlush(new RpcRequest(requestId, new NioManagedBuffer(message))) - .addListener(future -> { - if (future.isSuccess()) { - long timeTaken = System.currentTimeMillis() - startTime; - if (logger.isTraceEnabled()) { - logger.trace("Sending request {} to {} took {} ms", requestId, - getRemoteAddress(channel), timeTaken); - } - } else { - String errorMsg = String.format("Failed to send RPC %s to %s: %s", requestId, - getRemoteAddress(channel), future.cause()); - logger.error(errorMsg, future.cause()); - handler.removeRpcRequest(requestId); - channel.close(); - try { - callback.onFailure(new IOException(errorMsg, future.cause())); - } catch (Exception e) { - logger.error("Uncaught exception in RPC response callback handler!", e); - } - } - }); + .addListener(listener); + + return requestId; + } + + /** + * Send data to the remote end as a stream. This differs from stream() in that this is a request + * to *send* data to the remote end, not to receive it from the remote. + * + * @param meta meta data associated with the stream, which will be read completely on the + * receiving end before the stream itself. + * @param data this will be streamed to the remote end to allow for transferring large amounts + * of data without reading into memory. + * @param callback handles the reply -- onSuccess will only be called when both message and data + * are received successfully. 
+ */ + public long uploadStream( + ManagedBuffer meta, + ManagedBuffer data, + RpcResponseCallback callback) { + if (logger.isTraceEnabled()) { + logger.trace("Sending RPC to {}", getRemoteAddress(channel)); + } + + long requestId = requestId(); + handler.addRpcRequest(requestId, callback); + + RpcChannelListener listener = new RpcChannelListener(requestId, callback); + channel.writeAndFlush(new UploadStream(requestId, meta, data)).addListener(listener); return requestId; } @@ -319,4 +302,60 @@ public String toString() { .add("isActive", isActive()) .toString(); } + + private static long requestId() { + return Math.abs(UUID.randomUUID().getLeastSignificantBits()); + } + + private class StdChannelListener + implements GenericFutureListener> { + final long startTime; + final Object requestId; + + StdChannelListener(Object requestId) { + this.startTime = System.currentTimeMillis(); + this.requestId = requestId; + } + + @Override + public void operationComplete(Future future) throws Exception { + if (future.isSuccess()) { + if (logger.isTraceEnabled()) { + long timeTaken = System.currentTimeMillis() - startTime; + logger.trace("Sending request {} to {} took {} ms", requestId, + getRemoteAddress(channel), timeTaken); + } + } else { + String errorMsg = String.format("Failed to send RPC %s to %s: %s", requestId, + getRemoteAddress(channel), future.cause()); + logger.error(errorMsg, future.cause()); + channel.close(); + try { + handleFailure(errorMsg, future.cause()); + } catch (Exception e) { + logger.error("Uncaught exception in RPC response callback handler!", e); + } + } + } + + void handleFailure(String errorMsg, Throwable cause) throws Exception {} + } + + private class RpcChannelListener extends StdChannelListener { + final long rpcRequestId; + final RpcResponseCallback callback; + + RpcChannelListener(long rpcRequestId, RpcResponseCallback callback) { + super("RPC " + rpcRequestId); + this.rpcRequestId = rpcRequestId; + this.callback = callback; + } + + @Override + void handleFailure(String errorMsg, Throwable cause) { + handler.removeRpcRequest(rpcRequestId); + callback.onFailure(new IOException(errorMsg, cause)); + } + } + } diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java index 8a6e3858081bf..fb44dbbb0953b 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java @@ -29,6 +29,7 @@ import org.slf4j.LoggerFactory; import org.apache.spark.network.client.RpcResponseCallback; +import org.apache.spark.network.client.StreamCallbackWithID; import org.apache.spark.network.client.TransportClient; import org.apache.spark.network.sasl.SecretKeyHolder; import org.apache.spark.network.sasl.SaslRpcHandler; @@ -149,6 +150,14 @@ public void receive(TransportClient client, ByteBuffer message) { delegate.receive(client, message); } + @Override + public StreamCallbackWithID receiveStream( + TransportClient client, + ByteBuffer message, + RpcResponseCallback callback) { + return delegate.receiveStream(client, message, callback); + } + @Override public StreamManager getStreamManager() { return delegate.getStreamManager(); diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/Message.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/Message.java index 434935a8ef2ad..0ccd70c03aba8 
100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/Message.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/Message.java @@ -37,7 +37,7 @@ enum Type implements Encodable { ChunkFetchRequest(0), ChunkFetchSuccess(1), ChunkFetchFailure(2), RpcRequest(3), RpcResponse(4), RpcFailure(5), StreamRequest(6), StreamResponse(7), StreamFailure(8), - OneWayMessage(9), User(-1); + OneWayMessage(9), UploadStream(10), User(-1); private final byte id; @@ -65,6 +65,7 @@ public static Type decode(ByteBuf buf) { case 7: return StreamResponse; case 8: return StreamFailure; case 9: return OneWayMessage; + case 10: return UploadStream; case -1: throw new IllegalArgumentException("User type messages cannot be decoded."); default: throw new IllegalArgumentException("Unknown message type: " + id); } diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageDecoder.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageDecoder.java index 39a7495828a8a..bf80aed0afe10 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageDecoder.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageDecoder.java @@ -80,6 +80,9 @@ private Message decode(Message.Type msgType, ByteBuf in) { case StreamFailure: return StreamFailure.decode(in); + case UploadStream: + return UploadStream.decode(in); + default: throw new IllegalArgumentException("Unexpected message type: " + msgType); } diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamResponse.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamResponse.java index 87e212f3e157b..50b811604b84b 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamResponse.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamResponse.java @@ -67,7 +67,7 @@ public static StreamResponse decode(ByteBuf buf) { @Override public int hashCode() { - return Objects.hashCode(byteCount, streamId, body()); + return Objects.hashCode(byteCount, streamId); } @Override diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/UploadStream.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/UploadStream.java new file mode 100644 index 0000000000000..fa1d26e76b852 --- /dev/null +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/UploadStream.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.network.protocol; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import com.google.common.base.Objects; +import io.netty.buffer.ByteBuf; + +import org.apache.spark.network.buffer.ManagedBuffer; +import org.apache.spark.network.buffer.NettyManagedBuffer; + +/** + * An RPC with data that is sent outside of the frame, so it can be read as a stream. + */ +public final class UploadStream extends AbstractMessage implements RequestMessage { + /** Used to link an RPC request with its response. */ + public final long requestId; + public final ManagedBuffer meta; + public final long bodyByteCount; + + public UploadStream(long requestId, ManagedBuffer meta, ManagedBuffer body) { + super(body, false); // body is *not* included in the frame + this.requestId = requestId; + this.meta = meta; + bodyByteCount = body.size(); + } + + // this version is called when decoding the bytes on the receiving end. The body is handled + // separately. + private UploadStream(long requestId, ManagedBuffer meta, long bodyByteCount) { + super(null, false); + this.requestId = requestId; + this.meta = meta; + this.bodyByteCount = bodyByteCount; + } + + @Override + public Type type() { return Type.UploadStream; } + + @Override + public int encodedLength() { + // the requestId, meta size, meta and bodyByteCount (body is not included) + return 8 + 4 + ((int) meta.size()) + 8; + } + + @Override + public void encode(ByteBuf buf) { + buf.writeLong(requestId); + try { + ByteBuffer metaBuf = meta.nioByteBuffer(); + buf.writeInt(metaBuf.remaining()); + buf.writeBytes(metaBuf); + } catch (IOException io) { + throw new RuntimeException(io); + } + buf.writeLong(bodyByteCount); + } + + public static UploadStream decode(ByteBuf buf) { + long requestId = buf.readLong(); + int metaSize = buf.readInt(); + ManagedBuffer meta = new NettyManagedBuffer(buf.readRetainedSlice(metaSize)); + long bodyByteCount = buf.readLong(); + // This is called by the frame decoder, so the data is still null. We need a StreamInterceptor + // to read the data. 
+ return new UploadStream(requestId, meta, bodyByteCount); + } + + @Override + public int hashCode() { + return Long.hashCode(requestId); + } + + @Override + public boolean equals(Object other) { + if (other instanceof UploadStream) { + UploadStream o = (UploadStream) other; + return requestId == o.requestId && super.equals(o); + } + return false; + } + + @Override + public String toString() { + return Objects.toStringHelper(this) + .add("requestId", requestId) + .add("body", body()) + .toString(); + } +} diff --git a/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslRpcHandler.java b/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslRpcHandler.java index 0231428318add..355a3def8cc22 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslRpcHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslRpcHandler.java @@ -28,6 +28,7 @@ import org.slf4j.LoggerFactory; import org.apache.spark.network.client.RpcResponseCallback; +import org.apache.spark.network.client.StreamCallbackWithID; import org.apache.spark.network.client.TransportClient; import org.apache.spark.network.server.RpcHandler; import org.apache.spark.network.server.StreamManager; @@ -132,6 +133,14 @@ public void receive(TransportClient client, ByteBuffer message) { delegate.receive(client, message); } + @Override + public StreamCallbackWithID receiveStream( + TransportClient client, + ByteBuffer message, + RpcResponseCallback callback) { + return delegate.receiveStream(client, message, callback); + } + @Override public StreamManager getStreamManager() { return delegate.getStreamManager(); diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/RpcHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/RpcHandler.java index 8f7554e2e07d5..38569baf82bce 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/server/RpcHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/RpcHandler.java @@ -23,6 +23,7 @@ import org.slf4j.LoggerFactory; import org.apache.spark.network.client.RpcResponseCallback; +import org.apache.spark.network.client.StreamCallbackWithID; import org.apache.spark.network.client.TransportClient; /** @@ -36,7 +37,8 @@ public abstract class RpcHandler { * Receive a single RPC message. Any exception thrown while in this method will be sent back to * the client in string form as a standard RPC failure. * - * This method will not be called in parallel for a single TransportClient (i.e., channel). + * Neither this method nor #receiveStream will be called in parallel for a single + * TransportClient (i.e., channel). * * @param client A channel client which enables the handler to make requests back to the sender * of this RPC. This will always be the exact same object for a particular channel. @@ -49,6 +51,36 @@ public abstract void receive( ByteBuffer message, RpcResponseCallback callback); + /** + * Receive a single RPC message which includes data that is to be received as a stream. Any + * exception thrown while in this method will be sent back to the client in string form as a + * standard RPC failure. + * + * Neither this method nor #receive will be called in parallel for a single TransportClient + * (i.e., channel). + * + * An error while reading data from the stream + * ({@link org.apache.spark.network.client.StreamCallback#onData(String, ByteBuffer)}) + * will fail the entire channel. 
A failure in "post-processing" the stream in + * {@link org.apache.spark.network.client.StreamCallback#onComplete(String)} will result in an + * rpcFailure, but the channel will remain active. + * + * @param client A channel client which enables the handler to make requests back to the sender + * of this RPC. This will always be the exact same object for a particular channel. + * @param messageHeader The serialized bytes of the header portion of the RPC. This is in meant + * to be relatively small, and will be buffered entirely in memory, to + * facilitate how the streaming portion should be received. + * @param callback Callback which should be invoked exactly once upon success or failure of the + * RPC. + * @return a StreamCallback for handling the accompanying streaming data + */ + public StreamCallbackWithID receiveStream( + TransportClient client, + ByteBuffer messageHeader, + RpcResponseCallback callback) { + throw new UnsupportedOperationException(); + } + /** * Returns the StreamManager which contains the state about which streams are currently being * fetched by a TransportClient. diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java index e94453578e6b0..e1d7b2dbff60f 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java @@ -17,6 +17,7 @@ package org.apache.spark.network.server; +import java.io.IOException; import java.net.SocketAddress; import java.nio.ByteBuffer; @@ -28,20 +29,10 @@ import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.buffer.NioManagedBuffer; -import org.apache.spark.network.client.RpcResponseCallback; -import org.apache.spark.network.client.TransportClient; -import org.apache.spark.network.protocol.ChunkFetchRequest; -import org.apache.spark.network.protocol.ChunkFetchFailure; -import org.apache.spark.network.protocol.ChunkFetchSuccess; -import org.apache.spark.network.protocol.Encodable; -import org.apache.spark.network.protocol.OneWayMessage; -import org.apache.spark.network.protocol.RequestMessage; -import org.apache.spark.network.protocol.RpcFailure; -import org.apache.spark.network.protocol.RpcRequest; -import org.apache.spark.network.protocol.RpcResponse; -import org.apache.spark.network.protocol.StreamFailure; -import org.apache.spark.network.protocol.StreamRequest; -import org.apache.spark.network.protocol.StreamResponse; +import org.apache.spark.network.client.*; +import org.apache.spark.network.protocol.*; +import org.apache.spark.network.util.TransportFrameDecoder; + import static org.apache.spark.network.util.NettyUtils.getRemoteAddress; /** @@ -52,6 +43,7 @@ * The messages should have been processed by the pipeline setup by {@link TransportServer}. */ public class TransportRequestHandler extends MessageHandler { + private static final Logger logger = LoggerFactory.getLogger(TransportRequestHandler.class); /** The Netty channel that this handler is associated with. 
*/ @@ -113,6 +105,8 @@ public void handle(RequestMessage request) { processOneWayMessage((OneWayMessage) request); } else if (request instanceof StreamRequest) { processStreamRequest((StreamRequest) request); + } else if (request instanceof UploadStream) { + processStreamUpload((UploadStream) request); } else { throw new IllegalArgumentException("Unknown request type: " + request); } @@ -203,6 +197,79 @@ public void onFailure(Throwable e) { } } + /** + * Handle a request from the client to upload a stream of data. + */ + private void processStreamUpload(final UploadStream req) { + assert (req.body() == null); + try { + RpcResponseCallback callback = new RpcResponseCallback() { + @Override + public void onSuccess(ByteBuffer response) { + respond(new RpcResponse(req.requestId, new NioManagedBuffer(response))); + } + + @Override + public void onFailure(Throwable e) { + respond(new RpcFailure(req.requestId, Throwables.getStackTraceAsString(e))); + } + }; + TransportFrameDecoder frameDecoder = (TransportFrameDecoder) + channel.pipeline().get(TransportFrameDecoder.HANDLER_NAME); + ByteBuffer meta = req.meta.nioByteBuffer(); + StreamCallbackWithID streamHandler = rpcHandler.receiveStream(reverseClient, meta, callback); + if (streamHandler == null) { + throw new NullPointerException("rpcHandler returned a null streamHandler"); + } + StreamCallbackWithID wrappedCallback = new StreamCallbackWithID() { + @Override + public void onData(String streamId, ByteBuffer buf) throws IOException { + streamHandler.onData(streamId, buf); + } + + @Override + public void onComplete(String streamId) throws IOException { + try { + streamHandler.onComplete(streamId); + callback.onSuccess(ByteBuffer.allocate(0)); + } catch (Exception ex) { + IOException ioExc = new IOException("Failure post-processing complete stream;" + + " failing this rpc and leaving channel active"); + callback.onFailure(ioExc); + streamHandler.onFailure(streamId, ioExc); + } + } + + @Override + public void onFailure(String streamId, Throwable cause) throws IOException { + callback.onFailure(new IOException("Destination failed while reading stream", cause)); + streamHandler.onFailure(streamId, cause); + } + + @Override + public String getID() { + return streamHandler.getID(); + } + }; + if (req.bodyByteCount > 0) { + StreamInterceptor interceptor = new StreamInterceptor(this, wrappedCallback.getID(), + req.bodyByteCount, wrappedCallback); + frameDecoder.setInterceptor(interceptor); + } else { + wrappedCallback.onComplete(wrappedCallback.getID()); + } + } catch (Exception e) { + logger.error("Error while invoking RpcHandler#receive() on RPC id " + req.requestId, e); + respond(new RpcFailure(req.requestId, Throwables.getStackTraceAsString(e))); + // We choose to totally fail the channel, rather than trying to recover as we do in other + // cases. We don't know how many bytes of the stream the client has already sent for the + // stream, it's not worth trying to recover. 
+ channel.pipeline().fireExceptionCaught(e); + } finally { + req.meta.release(); + } + } + private void processOneWayMessage(OneWayMessage req) { try { rpcHandler.receive(reverseClient, req.body().nioByteBuffer()); diff --git a/common/network-common/src/test/java/org/apache/spark/network/RpcIntegrationSuite.java b/common/network-common/src/test/java/org/apache/spark/network/RpcIntegrationSuite.java index 8ff737b129641..1f4d75c7e2ec5 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/RpcIntegrationSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/RpcIntegrationSuite.java @@ -17,43 +17,46 @@ package org.apache.spark.network; +import java.io.*; import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Set; +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import com.google.common.collect.Sets; +import com.google.common.io.Files; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import static org.junit.Assert.*; -import org.apache.spark.network.client.RpcResponseCallback; -import org.apache.spark.network.client.TransportClient; -import org.apache.spark.network.client.TransportClientFactory; -import org.apache.spark.network.server.OneForOneStreamManager; -import org.apache.spark.network.server.RpcHandler; -import org.apache.spark.network.server.StreamManager; -import org.apache.spark.network.server.TransportServer; +import org.apache.spark.network.buffer.ManagedBuffer; +import org.apache.spark.network.buffer.NioManagedBuffer; +import org.apache.spark.network.client.*; +import org.apache.spark.network.server.*; import org.apache.spark.network.util.JavaUtils; import org.apache.spark.network.util.MapConfigProvider; import org.apache.spark.network.util.TransportConf; public class RpcIntegrationSuite { + static TransportConf conf; static TransportServer server; static TransportClientFactory clientFactory; static RpcHandler rpcHandler; static List oneWayMsgs; + static StreamTestHelper testData; + + static ConcurrentHashMap streamCallbacks = + new ConcurrentHashMap<>(); @BeforeClass public static void setUp() throws Exception { - TransportConf conf = new TransportConf("shuffle", MapConfigProvider.EMPTY); + conf = new TransportConf("shuffle", MapConfigProvider.EMPTY); + testData = new StreamTestHelper(); rpcHandler = new RpcHandler() { @Override public void receive( @@ -71,6 +74,14 @@ public void receive( } } + @Override + public StreamCallbackWithID receiveStream( + TransportClient client, + ByteBuffer messageHeader, + RpcResponseCallback callback) { + return receiveStreamHelper(JavaUtils.bytesToString(messageHeader)); + } + @Override public void receive(TransportClient client, ByteBuffer message) { oneWayMsgs.add(JavaUtils.bytesToString(message)); @@ -85,10 +96,71 @@ public void receive(TransportClient client, ByteBuffer message) { oneWayMsgs = new ArrayList<>(); } + private static StreamCallbackWithID receiveStreamHelper(String msg) { + try { + if (msg.startsWith("fail/")) { + String[] parts = msg.split("/"); + switch (parts[1]) { + case "exception-ondata": + return new StreamCallbackWithID() { + @Override + public void onData(String streamId, ByteBuffer buf) throws IOException { + throw new 
IOException("failed to read stream data!"); + } + + @Override + public void onComplete(String streamId) throws IOException { + } + + @Override + public void onFailure(String streamId, Throwable cause) throws IOException { + } + + @Override + public String getID() { + return msg; + } + }; + case "exception-oncomplete": + return new StreamCallbackWithID() { + @Override + public void onData(String streamId, ByteBuffer buf) throws IOException { + } + + @Override + public void onComplete(String streamId) throws IOException { + throw new IOException("exception in onComplete"); + } + + @Override + public void onFailure(String streamId, Throwable cause) throws IOException { + } + + @Override + public String getID() { + return msg; + } + }; + case "null": + return null; + default: + throw new IllegalArgumentException("unexpected msg: " + msg); + } + } else { + VerifyingStreamCallback streamCallback = new VerifyingStreamCallback(msg); + streamCallbacks.put(msg, streamCallback); + return streamCallback; + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + @AfterClass public static void tearDown() { server.close(); clientFactory.close(); + testData.cleanup(); } static class RpcResult { @@ -130,6 +202,59 @@ public void onFailure(Throwable e) { return res; } + private RpcResult sendRpcWithStream(String... streams) throws Exception { + TransportClient client = clientFactory.createClient(TestUtils.getLocalHost(), server.getPort()); + final Semaphore sem = new Semaphore(0); + RpcResult res = new RpcResult(); + res.successMessages = Collections.synchronizedSet(new HashSet()); + res.errorMessages = Collections.synchronizedSet(new HashSet()); + + for (String stream : streams) { + int idx = stream.lastIndexOf('/'); + ManagedBuffer meta = new NioManagedBuffer(JavaUtils.stringToBytes(stream)); + String streamName = (idx == -1) ? stream : stream.substring(idx + 1); + ManagedBuffer data = testData.openStream(conf, streamName); + client.uploadStream(meta, data, new RpcStreamCallback(stream, res, sem)); + } + + if (!sem.tryAcquire(streams.length, 5, TimeUnit.SECONDS)) { + fail("Timeout getting response from the server"); + } + streamCallbacks.values().forEach(streamCallback -> { + try { + streamCallback.verify(); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + client.close(); + return res; + } + + private static class RpcStreamCallback implements RpcResponseCallback { + final String streamId; + final RpcResult res; + final Semaphore sem; + + RpcStreamCallback(String streamId, RpcResult res, Semaphore sem) { + this.streamId = streamId; + this.res = res; + this.sem = sem; + } + + @Override + public void onSuccess(ByteBuffer message) { + res.successMessages.add(streamId); + sem.release(); + } + + @Override + public void onFailure(Throwable e) { + res.errorMessages.add(e.getMessage()); + sem.release(); + } + } + @Test public void singleRPC() throws Exception { RpcResult res = sendRPC("hello/Aaron"); @@ -193,10 +318,83 @@ public void sendOneWayMessage() throws Exception { } } + @Test + public void sendRpcWithStreamOneAtATime() throws Exception { + for (String stream : StreamTestHelper.STREAMS) { + RpcResult res = sendRpcWithStream(stream); + assertTrue("there were error messages!" 
+ res.errorMessages, res.errorMessages.isEmpty()); + assertEquals(Sets.newHashSet(stream), res.successMessages); + } + } + + @Test + public void sendRpcWithStreamConcurrently() throws Exception { + String[] streams = new String[10]; + for (int i = 0; i < 10; i++) { + streams[i] = StreamTestHelper.STREAMS[i % StreamTestHelper.STREAMS.length]; + } + RpcResult res = sendRpcWithStream(streams); + assertEquals(Sets.newHashSet(StreamTestHelper.STREAMS), res.successMessages); + assertTrue(res.errorMessages.isEmpty()); + } + + @Test + public void sendRpcWithStreamFailures() throws Exception { + // when there is a failure reading stream data, we don't try to keep the channel usable, + // just send back a decent error msg. + RpcResult exceptionInCallbackResult = + sendRpcWithStream("fail/exception-ondata/smallBuffer", "smallBuffer"); + assertErrorAndClosed(exceptionInCallbackResult, "Destination failed while reading stream"); + + RpcResult nullStreamHandler = + sendRpcWithStream("fail/null/smallBuffer", "smallBuffer"); + assertErrorAndClosed(exceptionInCallbackResult, "Destination failed while reading stream"); + + // OTOH, if there is a failure during onComplete, the channel should still be fine + RpcResult exceptionInOnComplete = + sendRpcWithStream("fail/exception-oncomplete/smallBuffer", "smallBuffer"); + assertErrorsContain(exceptionInOnComplete.errorMessages, + Sets.newHashSet("Failure post-processing")); + assertEquals(Sets.newHashSet("smallBuffer"), exceptionInOnComplete.successMessages); + } + private void assertErrorsContain(Set errors, Set contains) { - assertEquals(contains.size(), errors.size()); + assertEquals("Expected " + contains.size() + " errors, got " + errors.size() + "errors: " + + errors, contains.size(), errors.size()); + + Pair, Set> r = checkErrorsContain(errors, contains); + assertTrue("Could not find error containing " + r.getRight() + "; errors: " + errors, + r.getRight().isEmpty()); + + assertTrue(r.getLeft().isEmpty()); + } + + private void assertErrorAndClosed(RpcResult result, String expectedError) { + assertTrue("unexpected success: " + result.successMessages, result.successMessages.isEmpty()); + // we expect 1 additional error, which contains *either* "closed" or "Connection reset" + Set errors = result.errorMessages; + assertEquals("Expected 2 errors, got " + errors.size() + "errors: " + + errors, 2, errors.size()); + + Set containsAndClosed = Sets.newHashSet(expectedError); + containsAndClosed.add("closed"); + containsAndClosed.add("Connection reset"); + + Pair, Set> r = checkErrorsContain(errors, containsAndClosed); + Set errorsNotFound = r.getRight(); + assertEquals(1, errorsNotFound.size()); + String err = errorsNotFound.iterator().next(); + assertTrue(err.equals("closed") || err.equals("Connection reset")); + + assertTrue(r.getLeft().isEmpty()); + } + + private Pair, Set> checkErrorsContain( + Set errors, + Set contains) { Set remainingErrors = Sets.newHashSet(errors); + Set notFound = Sets.newHashSet(); for (String contain : contains) { Iterator it = remainingErrors.iterator(); boolean foundMatch = false; @@ -207,9 +405,66 @@ private void assertErrorsContain(Set errors, Set contains) { break; } } - assertTrue("Could not find error containing " + contain + "; errors: " + errors, foundMatch); + if (!foundMatch) { + notFound.add(contain); + } + } + return new ImmutablePair<>(remainingErrors, notFound); + } + + private static class VerifyingStreamCallback implements StreamCallbackWithID { + final String streamId; + final StreamSuite.TestCallback helper; + final 
OutputStream out; + final File outFile; + + VerifyingStreamCallback(String streamId) throws IOException { + if (streamId.equals("file")) { + outFile = File.createTempFile("data", ".tmp", testData.tempDir); + out = new FileOutputStream(outFile); + } else { + out = new ByteArrayOutputStream(); + outFile = null; + } + this.streamId = streamId; + helper = new StreamSuite.TestCallback(out); + } + + void verify() throws IOException { + if (streamId.equals("file")) { + assertTrue("File stream did not match.", Files.equal(testData.testFile, outFile)); + } else { + byte[] result = ((ByteArrayOutputStream)out).toByteArray(); + ByteBuffer srcBuffer = testData.srcBuffer(streamId); + ByteBuffer base; + synchronized (srcBuffer) { + base = srcBuffer.duplicate(); + } + byte[] expected = new byte[base.remaining()]; + base.get(expected); + assertEquals(expected.length, result.length); + assertTrue("buffers don't match", Arrays.equals(expected, result)); + } + } + + @Override + public void onData(String streamId, ByteBuffer buf) throws IOException { + helper.onData(streamId, buf); } - assertTrue(remainingErrors.isEmpty()); + @Override + public void onComplete(String streamId) throws IOException { + helper.onComplete(streamId); + } + + @Override + public void onFailure(String streamId, Throwable cause) throws IOException { + helper.onFailure(streamId, cause); + } + + @Override + public String getID() { + return streamId; + } } } diff --git a/common/network-common/src/test/java/org/apache/spark/network/StreamSuite.java b/common/network-common/src/test/java/org/apache/spark/network/StreamSuite.java index f253a07e64be1..f3050cb79cdfd 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/StreamSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/StreamSuite.java @@ -26,7 +26,6 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.Random; import java.util.concurrent.Executors; import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; @@ -37,9 +36,7 @@ import org.junit.Test; import static org.junit.Assert.*; -import org.apache.spark.network.buffer.FileSegmentManagedBuffer; import org.apache.spark.network.buffer.ManagedBuffer; -import org.apache.spark.network.buffer.NioManagedBuffer; import org.apache.spark.network.client.RpcResponseCallback; import org.apache.spark.network.client.StreamCallback; import org.apache.spark.network.client.TransportClient; @@ -51,16 +48,11 @@ import org.apache.spark.network.util.TransportConf; public class StreamSuite { - private static final String[] STREAMS = { "largeBuffer", "smallBuffer", "emptyBuffer", "file" }; + private static final String[] STREAMS = StreamTestHelper.STREAMS; + private static StreamTestHelper testData; private static TransportServer server; private static TransportClientFactory clientFactory; - private static File testFile; - private static File tempDir; - - private static ByteBuffer emptyBuffer; - private static ByteBuffer smallBuffer; - private static ByteBuffer largeBuffer; private static ByteBuffer createBuffer(int bufSize) { ByteBuffer buf = ByteBuffer.allocate(bufSize); @@ -73,23 +65,7 @@ private static ByteBuffer createBuffer(int bufSize) { @BeforeClass public static void setUp() throws Exception { - tempDir = Files.createTempDir(); - emptyBuffer = createBuffer(0); - smallBuffer = createBuffer(100); - largeBuffer = createBuffer(100000); - - testFile = File.createTempFile("stream-test-file", "txt", tempDir); - FileOutputStream fp = new 
FileOutputStream(testFile); - try { - Random rnd = new Random(); - for (int i = 0; i < 512; i++) { - byte[] fileContent = new byte[1024]; - rnd.nextBytes(fileContent); - fp.write(fileContent); - } - } finally { - fp.close(); - } + testData = new StreamTestHelper(); final TransportConf conf = new TransportConf("shuffle", MapConfigProvider.EMPTY); final StreamManager streamManager = new StreamManager() { @@ -100,18 +76,7 @@ public ManagedBuffer getChunk(long streamId, int chunkIndex) { @Override public ManagedBuffer openStream(String streamId) { - switch (streamId) { - case "largeBuffer": - return new NioManagedBuffer(largeBuffer); - case "smallBuffer": - return new NioManagedBuffer(smallBuffer); - case "emptyBuffer": - return new NioManagedBuffer(emptyBuffer); - case "file": - return new FileSegmentManagedBuffer(conf, testFile, 0, testFile.length()); - default: - throw new IllegalArgumentException("Invalid stream: " + streamId); - } + return testData.openStream(conf, streamId); } }; RpcHandler handler = new RpcHandler() { @@ -137,12 +102,7 @@ public StreamManager getStreamManager() { public static void tearDown() { server.close(); clientFactory.close(); - if (tempDir != null) { - for (File f : tempDir.listFiles()) { - f.delete(); - } - tempDir.delete(); - } + testData.cleanup(); } @Test @@ -234,21 +194,21 @@ public void run() { case "largeBuffer": baos = new ByteArrayOutputStream(); out = baos; - srcBuffer = largeBuffer; + srcBuffer = testData.largeBuffer; break; case "smallBuffer": baos = new ByteArrayOutputStream(); out = baos; - srcBuffer = smallBuffer; + srcBuffer = testData.smallBuffer; break; case "file": - outFile = File.createTempFile("data", ".tmp", tempDir); + outFile = File.createTempFile("data", ".tmp", testData.tempDir); out = new FileOutputStream(outFile); break; case "emptyBuffer": baos = new ByteArrayOutputStream(); out = baos; - srcBuffer = emptyBuffer; + srcBuffer = testData.emptyBuffer; break; default: throw new IllegalArgumentException(streamId); @@ -256,10 +216,10 @@ public void run() { TestCallback callback = new TestCallback(out); client.stream(streamId, callback); - waitForCompletion(callback); + callback.waitForCompletion(timeoutMs); if (srcBuffer == null) { - assertTrue("File stream did not match.", Files.equal(testFile, outFile)); + assertTrue("File stream did not match.", Files.equal(testData.testFile, outFile)); } else { ByteBuffer base; synchronized (srcBuffer) { @@ -292,23 +252,9 @@ public void check() throws Throwable { throw error; } } - - private void waitForCompletion(TestCallback callback) throws Exception { - long now = System.currentTimeMillis(); - long deadline = now + timeoutMs; - synchronized (callback) { - while (!callback.completed && now < deadline) { - callback.wait(deadline - now); - now = System.currentTimeMillis(); - } - } - assertTrue("Timed out waiting for stream.", callback.completed); - assertNull(callback.error); - } - } - private static class TestCallback implements StreamCallback { + static class TestCallback implements StreamCallback { private final OutputStream out; public volatile boolean completed; @@ -344,6 +290,22 @@ public void onFailure(String streamId, Throwable cause) { } } + void waitForCompletion(long timeoutMs) { + long now = System.currentTimeMillis(); + long deadline = now + timeoutMs; + synchronized (this) { + while (!completed && now < deadline) { + try { + wait(deadline - now); + } catch (InterruptedException ie) { + throw new RuntimeException(ie); + } + now = System.currentTimeMillis(); + } + } + assertTrue("Timed out 
waiting for stream.", completed); + assertNull(error); + } } } diff --git a/common/network-common/src/test/java/org/apache/spark/network/StreamTestHelper.java b/common/network-common/src/test/java/org/apache/spark/network/StreamTestHelper.java new file mode 100644 index 0000000000000..0f5c82c9e9b1f --- /dev/null +++ b/common/network-common/src/test/java/org/apache/spark/network/StreamTestHelper.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.network; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Random; + +import com.google.common.io.Files; + +import org.apache.spark.network.buffer.FileSegmentManagedBuffer; +import org.apache.spark.network.buffer.ManagedBuffer; +import org.apache.spark.network.buffer.NioManagedBuffer; +import org.apache.spark.network.util.JavaUtils; +import org.apache.spark.network.util.TransportConf; + +class StreamTestHelper { + static final String[] STREAMS = { "largeBuffer", "smallBuffer", "emptyBuffer", "file" }; + + final File testFile; + final File tempDir; + + final ByteBuffer emptyBuffer; + final ByteBuffer smallBuffer; + final ByteBuffer largeBuffer; + + private static ByteBuffer createBuffer(int bufSize) { + ByteBuffer buf = ByteBuffer.allocate(bufSize); + for (int i = 0; i < bufSize; i ++) { + buf.put((byte) i); + } + buf.flip(); + return buf; + } + + StreamTestHelper() throws Exception { + tempDir = Files.createTempDir(); + emptyBuffer = createBuffer(0); + smallBuffer = createBuffer(100); + largeBuffer = createBuffer(100000); + + testFile = File.createTempFile("stream-test-file", "txt", tempDir); + FileOutputStream fp = new FileOutputStream(testFile); + try { + Random rnd = new Random(); + for (int i = 0; i < 512; i++) { + byte[] fileContent = new byte[1024]; + rnd.nextBytes(fileContent); + fp.write(fileContent); + } + } finally { + fp.close(); + } + } + + public ByteBuffer srcBuffer(String name) { + switch (name) { + case "largeBuffer": + return largeBuffer; + case "smallBuffer": + return smallBuffer; + case "emptyBuffer": + return emptyBuffer; + default: + throw new IllegalArgumentException("Invalid stream: " + name); + } + } + + public ManagedBuffer openStream(TransportConf conf, String streamId) { + switch (streamId) { + case "file": + return new FileSegmentManagedBuffer(conf, testFile, 0, testFile.length()); + default: + return new NioManagedBuffer(srcBuffer(streamId)); + } + } + + void cleanup() { + if (tempDir != null) { + try { + JavaUtils.deleteRecursively(tempDir); + } catch (IOException io) { + throw new RuntimeException(io); + } + } + } +} diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 4f6d5ff898681..eeb097ef153ad 100644 --- 
a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -36,6 +36,9 @@ object MimaExcludes { // Exclude rules for 2.4.x lazy val v24excludes = v23excludes ++ Seq( + // [SPARK-6237][NETWORK] Network-layer changes to allow stream upload + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.network.netty.NettyBlockRpcServer.receive"), + // [SPARK-20087][CORE] Attach accumulators / metrics to 'TaskKilled' end reason ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.TaskKilled.apply"), ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.TaskKilled.copy"), From 1b9368f7d4c1d5c0df49204f48515d3b4ffe3e13 Mon Sep 17 00:00:00 2001 From: Kris Mok Date: Wed, 27 Jun 2018 10:27:40 +0800 Subject: [PATCH 12/79] [SPARK-24659][SQL] GenericArrayData.equals should respect element type differences ## What changes were proposed in this pull request? Fix `GenericArrayData.equals`, so that it respects the actual types of the elements. e.g. an instance that represents an `array` and another instance that represents an `array` should be considered incompatible, and thus should return false for `equals`. `GenericArrayData` doesn't keep any schema information by itself, and rather relies on the Java objects referenced by its `array` field's elements to keep track of their own object types. So, the most straightforward way to respect their types is to call `equals` on the elements, instead of using Scala's `==` operator, which can have semantics that are not always desirable: ``` new java.lang.Integer(123) == new java.lang.Long(123L) // true in Scala new java.lang.Integer(123).equals(new java.lang.Long(123L)) // false in Scala ``` ## How was this patch tested? Added unit test in `ComplexDataSuite` Author: Kris Mok Closes #21643 from rednaxelafx/fix-genericarraydata-equals. --- .../sql/catalyst/util/GenericArrayData.scala | 2 +- .../sql/catalyst/util/ComplexDataSuite.scala | 36 +++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 9e39ed9c3a778..83ad08d8e1758 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -122,7 +122,7 @@ class GenericArrayData(val array: Array[Any]) extends ArrayData { if (!o2.isInstanceOf[Double] || ! java.lang.Double.isNaN(o2.asInstanceOf[Double])) { return false } - case _ => if (o1 != o2) { + case _ => if (!o1.equals(o2)) { return false } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ComplexDataSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ComplexDataSuite.scala index 9d285916bcf42..229e32479082c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ComplexDataSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ComplexDataSuite.scala @@ -104,4 +104,40 @@ class ComplexDataSuite extends SparkFunSuite { // The copied data should not be changed externally. 
assert(copied.getStruct(0, 1).getUTF8String(0).toString == "a") } + + test("SPARK-24659: GenericArrayData.equals should respect element type differences") { + import scala.reflect.ClassTag + + // Expected positive cases + def arraysShouldEqual[T: ClassTag](element: T*): Unit = { + val array1 = new GenericArrayData(Array[T](element: _*)) + val array2 = new GenericArrayData(Array[T](element: _*)) + assert(array1.equals(array2)) + } + arraysShouldEqual(true, false) // Boolean + arraysShouldEqual(0.toByte, 123.toByte, Byte.MinValue, Byte.MaxValue) // Byte + arraysShouldEqual(0.toShort, 123.toShort, Short.MinValue, Short.MaxValue) // Short + arraysShouldEqual(0, 123, -65536, Int.MinValue, Int.MaxValue) // Int + arraysShouldEqual(0L, 123L, -65536L, Long.MinValue, Long.MaxValue) // Long + arraysShouldEqual(0.0F, 123.0F, Float.MinValue, Float.MaxValue, Float.MinPositiveValue, + Float.PositiveInfinity, Float.NegativeInfinity, Float.NaN) // Float + arraysShouldEqual(0.0, 123.0, Double.MinValue, Double.MaxValue, Double.MinPositiveValue, + Double.PositiveInfinity, Double.NegativeInfinity, Double.NaN) // Double + arraysShouldEqual(Array[Byte](123.toByte), Array[Byte](), null) // SQL Binary + arraysShouldEqual(UTF8String.fromString("foo"), null) // SQL String + + // Expected negative cases + // Spark SQL considers cases like array vs array to be incompatible, + // so an underlying implementation of array type should return false in such cases. + def arraysShouldNotEqual[T: ClassTag, U: ClassTag](element1: T, element2: U): Unit = { + val array1 = new GenericArrayData(Array[T](element1)) + val array2 = new GenericArrayData(Array[U](element2)) + assert(!array1.equals(array2)) + } + arraysShouldNotEqual(true, 1) // Boolean <-> Int + arraysShouldNotEqual(123.toByte, 123) // Byte <-> Int + arraysShouldNotEqual(123.toByte, 123L) // Byte <-> Long + arraysShouldNotEqual(123.toShort, 123) // Short <-> Int + arraysShouldNotEqual(123, 123L) // Int <-> Long + } } From d08f53dc61f662f5291f71bcbe1a7b9f531a34d2 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Wed, 27 Jun 2018 10:36:51 +0800 Subject: [PATCH 13/79] [SPARK-24605][SQL] size(null) returns null instead of -1 ## What changes were proposed in this pull request? In PR, I propose new behavior of `size(null)` under the config flag `spark.sql.legacy.sizeOfNull`. If the former one is disabled, the `size()` function returns `null` for `null` input. By default the `spark.sql.legacy.sizeOfNull` is enabled to keep backward compatibility with previous versions. In that case, `size(null)` returns `-1`. ## How was this patch tested? Modified existing tests for the `size()` function to check new behavior (`null`) and old one (`-1`). Author: Maxim Gekk Closes #21598 from MaxGekk/legacy-size-of-null. 
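For illustration only (not part of this patch): a minimal sketch of the intended `size(null)` behavior under the new flag, based on the tests added below. It assumes a running `SparkSession` named `spark` with `spark.implicits._` imported.

```scala
import org.apache.spark.sql.functions.size
import spark.implicits._

// One row with a non-null array column and one row with a null array column.
val df = Seq((Seq[Int](1, 2), "x"), (null, "empty")).toDF("a", "b")

// Default (legacy) behavior: size of a null array/map is -1, as inherited from Hive.
spark.conf.set("spark.sql.legacy.sizeOfNull", "true")
df.select(size($"a")).show()   // 2, -1

// New behavior: size of a null array/map is null once the legacy flag is disabled.
spark.conf.set("spark.sql.legacy.sizeOfNull", "false")
df.select(size($"a")).show()   // 2, null
```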
--- .../expressions/collectionOperations.scala | 38 ++++++++++--- .../apache/spark/sql/internal/SQLConf.scala | 8 +++ .../CollectionExpressionsSuite.scala | 30 +++++++---- .../org/apache/spark/sql/functions.scala | 2 +- .../spark/sql/DataFrameFunctionsSuite.scala | 54 +++++++++++-------- 5 files changed, 93 insertions(+), 39 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 58612f65c1a53..abd6c88d3d985 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -67,37 +67,61 @@ trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression /** - * Given an array or map, returns its size. Returns -1 if null. + * Given an array or map, returns total number of elements in it. */ @ExpressionDescription( - usage = "_FUNC_(expr) - Returns the size of an array or a map. Returns -1 if null.", + usage = """ + _FUNC_(expr) - Returns the size of an array or a map. + The function returns -1 if its input is null and spark.sql.legacy.sizeOfNull is set to true. + If spark.sql.legacy.sizeOfNull is set to false, the function returns null for null input. + By default, the spark.sql.legacy.sizeOfNull parameter is set to true. + """, examples = """ Examples: > SELECT _FUNC_(array('b', 'd', 'c', 'a')); 4 + > SELECT _FUNC_(map('a', 1, 'b', 2)); + 2 + > SELECT _FUNC_(NULL); + -1 """) -case class Size(child: Expression) extends UnaryExpression with ExpectsInputTypes { +case class Size( + child: Expression, + legacySizeOfNull: Boolean) + extends UnaryExpression with ExpectsInputTypes { + + def this(child: Expression) = + this( + child, + legacySizeOfNull = SQLConf.get.getConf(SQLConf.LEGACY_SIZE_OF_NULL)) + override def dataType: DataType = IntegerType override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(ArrayType, MapType)) - override def nullable: Boolean = false + override def nullable: Boolean = if (legacySizeOfNull) false else super.nullable override def eval(input: InternalRow): Any = { val value = child.eval(input) if (value == null) { - -1 + if (legacySizeOfNull) -1 else null } else child.dataType match { case _: ArrayType => value.asInstanceOf[ArrayData].numElements() case _: MapType => value.asInstanceOf[MapData].numElements() + case other => throw new UnsupportedOperationException( + s"The size function doesn't support the operand type ${other.getClass.getCanonicalName}") } } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val childGen = child.genCode(ctx) - ev.copy(code = code""" + if (legacySizeOfNull) { + val childGen = child.genCode(ctx) + ev.copy(code = code""" boolean ${ev.isNull} = false; ${childGen.code} ${CodeGenerator.javaType(dataType)} ${ev.value} = ${childGen.isNull} ? 
-1 : (${childGen.value}).numElements();""", isNull = FalseLiteral) + } else { + defineCodeGen(ctx, ev, c => s"($c).numElements()") + } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index e768416f257c9..239c8266351ae 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1324,6 +1324,12 @@ object SQLConf { "Other column values can be ignored during parsing even if they are malformed.") .booleanConf .createWithDefault(true) + + val LEGACY_SIZE_OF_NULL = buildConf("spark.sql.legacy.sizeOfNull") + .doc("If it is set to true, size of null returns -1. This behavior was inherited from Hive. " + + "The size function returns null for null input if the flag is disabled.") + .booleanConf + .createWithDefault(true) } /** @@ -1686,6 +1692,8 @@ class SQLConf extends Serializable with Logging { def csvColumnPruning: Boolean = getConf(SQLConf.CSV_PARSER_COLUMN_PRUNING) + def legacySizeOfNull: Boolean = getConf(SQLConf.LEGACY_SIZE_OF_NULL) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala index 5b8cf5128fe21..caea4fb25ff7e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala @@ -24,25 +24,37 @@ import org.apache.spark.sql.types._ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { - test("Array and Map Size") { + def testSize(legacySizeOfNull: Boolean, sizeOfNull: Any): Unit = { val a0 = Literal.create(Seq(1, 2, 3), ArrayType(IntegerType)) val a1 = Literal.create(Seq[Integer](), ArrayType(IntegerType)) val a2 = Literal.create(Seq(1, 2), ArrayType(IntegerType)) - checkEvaluation(Size(a0), 3) - checkEvaluation(Size(a1), 0) - checkEvaluation(Size(a2), 2) + checkEvaluation(Size(a0, legacySizeOfNull), 3) + checkEvaluation(Size(a1, legacySizeOfNull), 0) + checkEvaluation(Size(a2, legacySizeOfNull), 2) val m0 = Literal.create(Map("a" -> "a", "b" -> "b"), MapType(StringType, StringType)) val m1 = Literal.create(Map[String, String](), MapType(StringType, StringType)) val m2 = Literal.create(Map("a" -> "a"), MapType(StringType, StringType)) - checkEvaluation(Size(m0), 2) - checkEvaluation(Size(m1), 0) - checkEvaluation(Size(m2), 1) + checkEvaluation(Size(m0, legacySizeOfNull), 2) + checkEvaluation(Size(m1, legacySizeOfNull), 0) + checkEvaluation(Size(m2, legacySizeOfNull), 1) + + checkEvaluation( + Size(Literal.create(null, MapType(StringType, StringType)), legacySizeOfNull), + expected = sizeOfNull) + checkEvaluation( + Size(Literal.create(null, ArrayType(StringType)), legacySizeOfNull), + expected = sizeOfNull) + } + + test("Array and Map Size - legacy") { + testSize(legacySizeOfNull = true, sizeOfNull = -1) + } - checkEvaluation(Size(Literal.create(null, MapType(StringType, StringType))), -1) - checkEvaluation(Size(Literal.create(null, ArrayType(StringType))), -1) + test("Array and Map Size") { + testSize(legacySizeOfNull = false, sizeOfNull = null) } test("MapKeys/MapValues") { diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 40c40e7083d1c..ef99ce3ad69d9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -3431,7 +3431,7 @@ object functions { * @group collection_funcs * @since 1.5.0 */ - def size(e: Column): Column = withExpr { Size(e.expr) } + def size(e: Column): Column = withExpr { new Size(e.expr) } /** * Sorts the input array for the given column in ascending order, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 5d6a6c0832c96..b109898b5bfb3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -487,26 +487,29 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { }.getMessage().contains("only supports array input")) } - test("array size function") { + def testSizeOfArray(sizeOfNull: Any): Unit = { val df = Seq( (Seq[Int](1, 2), "x"), (Seq[Int](), "y"), (Seq[Int](1, 2, 3), "z"), (null, "empty") ).toDF("a", "b") - checkAnswer( - df.select(size($"a")), - Seq(Row(2), Row(0), Row(3), Row(-1)) - ) - checkAnswer( - df.selectExpr("size(a)"), - Seq(Row(2), Row(0), Row(3), Row(-1)) - ) - checkAnswer( - df.selectExpr("cardinality(a)"), - Seq(Row(2L), Row(0L), Row(3L), Row(-1L)) - ) + checkAnswer(df.select(size($"a")), Seq(Row(2), Row(0), Row(3), Row(sizeOfNull))) + checkAnswer(df.selectExpr("size(a)"), Seq(Row(2), Row(0), Row(3), Row(sizeOfNull))) + checkAnswer(df.selectExpr("cardinality(a)"), Seq(Row(2L), Row(0L), Row(3L), Row(sizeOfNull))) + } + + test("array size function - legacy") { + withSQLConf(SQLConf.LEGACY_SIZE_OF_NULL.key -> "true") { + testSizeOfArray(sizeOfNull = -1) + } + } + + test("array size function") { + withSQLConf(SQLConf.LEGACY_SIZE_OF_NULL.key -> "false") { + testSizeOfArray(sizeOfNull = null) + } } test("dataframe arrays_zip function") { @@ -567,21 +570,28 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { } } - test("map size function") { + def testSizeOfMap(sizeOfNull: Any): Unit = { val df = Seq( (Map[Int, Int](1 -> 1, 2 -> 2), "x"), (Map[Int, Int](), "y"), (Map[Int, Int](1 -> 1, 2 -> 2, 3 -> 3), "z"), (null, "empty") ).toDF("a", "b") - checkAnswer( - df.select(size($"a")), - Seq(Row(2), Row(0), Row(3), Row(-1)) - ) - checkAnswer( - df.selectExpr("size(a)"), - Seq(Row(2), Row(0), Row(3), Row(-1)) - ) + + checkAnswer(df.select(size($"a")), Seq(Row(2), Row(0), Row(3), Row(sizeOfNull))) + checkAnswer(df.selectExpr("size(a)"), Seq(Row(2), Row(0), Row(3), Row(sizeOfNull))) + } + + test("map size function - legacy") { + withSQLConf(SQLConf.LEGACY_SIZE_OF_NULL.key -> "true") { + testSizeOfMap(sizeOfNull = -1: Int) + } + } + + test("map size function") { + withSQLConf(SQLConf.LEGACY_SIZE_OF_NULL.key -> "false") { + testSizeOfMap(sizeOfNull = null) + } } test("map_keys/map_values function") { From 2669b4de3b336dde84b698c20dbc73b30abf79d4 Mon Sep 17 00:00:00 2001 From: "Vayda, Oleksandr: IT (PRG)" Date: Wed, 27 Jun 2018 11:52:31 +0900 Subject: [PATCH 14/79] [SPARK-23927][SQL] Add "sequence" expression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? The PR adds the SQL function ```sequence```. 
https://issues.apache.org/jira/browse/SPARK-23927 The behavior of the function is based on Presto's one. Ref: https://prestodb.io/docs/current/functions/array.html - ```sequence(start, stop) → array``` Generate a sequence of integers from ```start``` to ```stop```, incrementing by ```1``` if ```start``` is less than or equal to ```stop```, otherwise ```-1```. - ```sequence(start, stop, step) → array``` Generate a sequence of integers from ```start``` to ```stop```, incrementing by ```step```. - ```sequence(start_date, stop_date) → array``` Generate a sequence of dates from ```start_date``` to ```stop_date```, incrementing by ```interval 1 day``` if ```start_date``` is less than or equal to ```stop_date```, otherwise ```- interval 1 day```. - ```sequence(start_date, stop_date, step_interval) → array``` Generate a sequence of dates from ```start_date``` to ```stop_date```, incrementing by ```step_interval```. The type of ```step_interval``` is ```CalendarInterval```. - ```sequence(start_timestemp, stop_timestemp) → array``` Generate a sequence of timestamps from ```start_timestamps``` to ```stop_timestamps```, incrementing by ```interval 1 day``` if ```start_date``` is less than or equal to ```stop_date```, otherwise ```- interval 1 day```. - ```sequence(start_timestamp, stop_timestamp, step_interval) → array``` Generate a sequence of timestamps from ```start_timestamps``` to ```stop_timestamps```, incrementing by ```step_interval```. The type of ```step_interval``` is ```CalendarInterval```. ## How was this patch tested? Added unit tests. Author: Vayda, Oleksandr: IT (PRG) Closes #21155 from wajda/feature/array-api-sequence. --- .../catalyst/analysis/FunctionRegistry.scala | 1 + .../sql/catalyst/analysis/TypeCoercion.scala | 7 + .../expressions/collectionOperations.scala | 402 +++++++++++++++++- .../CollectionExpressionsSuite.scala | 292 +++++++++++++ .../org/apache/spark/sql/functions.scala | 21 + .../spark/sql/DataFrameFunctionsSuite.scala | 56 +++ 6 files changed, 777 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 8abc616c1a3f7..a574d8a84d4fb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -432,6 +432,7 @@ object FunctionRegistry { expression[Reverse]("reverse"), expression[Concat]("concat"), expression[Flatten]("flatten"), + expression[Sequence]("sequence"), expression[ArrayRepeat]("array_repeat"), expression[ArrayRemove]("array_remove"), expression[ArrayDistinct]("array_distinct"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 637923928a7da..3ebab430ffbcd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -544,6 +544,13 @@ object TypeCoercion { case None => aj } + case s @ Sequence(_, _, _, timeZoneId) if !haveSameType(s.coercibleChildren) => + val types = s.coercibleChildren.map(_.dataType) + findWiderCommonType(types) match { + case Some(widerDataType) => s.castChildrenTo(widerDataType) + case None => s + } + case m @ CreateMap(children) if m.keys.length == 
m.values.length && (!haveSameType(m.keys) || !haveSameType(m.values)) => val newKeys = if (haveSameType(m.keys)) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index abd6c88d3d985..0395e1ef9a7ad 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -16,9 +16,10 @@ */ package org.apache.spark.sql.catalyst.expressions -import java.util.Comparator +import java.util.{Comparator, TimeZone} import scala.collection.mutable +import scala.reflect.ClassTag import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} @@ -26,11 +27,13 @@ import org.apache.spark.sql.catalyst.expressions.ArraySortLike.NullOrder import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.util._ -import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.catalyst.util.DateTimeUtils._ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods +import org.apache.spark.unsafe.array.ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.collection.OpenHashSet /** @@ -2313,6 +2316,401 @@ case class Flatten(child: Expression) extends UnaryExpression { override def prettyName: String = "flatten" } +@ExpressionDescription( + usage = """ + _FUNC_(start, stop, step) - Generates an array of elements from start to stop (inclusive), + incrementing by step. The type of the returned elements is the same as the type of argument + expressions. + + Supported types are: byte, short, integer, long, date, timestamp. + + The start and stop expressions must resolve to the same type. + If start and stop expressions resolve to the 'date' or 'timestamp' type + then the step expression must resolve to the 'interval' type, otherwise to the same type + as the start and stop expressions. + """, + arguments = """ + Arguments: + * start - an expression. The start of the range. + * stop - an expression. The end the range (inclusive). + * step - an optional expression. The step of the range. + By default step is 1 if start is less than or equal to stop, otherwise -1. + For the temporal sequences it's 1 day and -1 day respectively. + If start is greater than stop then the step must be negative, and vice versa. 
+ """, + examples = """ + Examples: + > SELECT _FUNC_(1, 5); + [1, 2, 3, 4, 5] + > SELECT _FUNC_(5, 1); + [5, 4, 3, 2, 1] + > SELECT _FUNC_(to_date('2018-01-01'), to_date('2018-03-01'), interval 1 month); + [2018-01-01, 2018-02-01, 2018-03-01] + """, + since = "2.4.0" +) +case class Sequence( + start: Expression, + stop: Expression, + stepOpt: Option[Expression], + timeZoneId: Option[String] = None) + extends Expression + with TimeZoneAwareExpression { + + import Sequence._ + + def this(start: Expression, stop: Expression) = + this(start, stop, None, None) + + def this(start: Expression, stop: Expression, step: Expression) = + this(start, stop, Some(step), None) + + override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = + copy(timeZoneId = Some(timeZoneId)) + + override def children: Seq[Expression] = Seq(start, stop) ++ stepOpt + + override def foldable: Boolean = children.forall(_.foldable) + + override def nullable: Boolean = children.exists(_.nullable) + + override lazy val dataType: ArrayType = ArrayType(start.dataType, containsNull = false) + + override def checkInputDataTypes(): TypeCheckResult = { + val startType = start.dataType + def stepType = stepOpt.get.dataType + val typesCorrect = + startType.sameType(stop.dataType) && + (startType match { + case TimestampType | DateType => + stepOpt.isEmpty || CalendarIntervalType.acceptsType(stepType) + case _: IntegralType => + stepOpt.isEmpty || stepType.sameType(startType) + case _ => false + }) + + if (typesCorrect) { + TypeCheckResult.TypeCheckSuccess + } else { + TypeCheckResult.TypeCheckFailure( + s"$prettyName only supports integral, timestamp or date types") + } + } + + def coercibleChildren: Seq[Expression] = children.filter(_.dataType != CalendarIntervalType) + + def castChildrenTo(widerType: DataType): Expression = Sequence( + Cast(start, widerType), + Cast(stop, widerType), + stepOpt.map(step => if (step.dataType != CalendarIntervalType) Cast(step, widerType) else step), + timeZoneId) + + private lazy val impl: SequenceImpl = dataType.elementType match { + case iType: IntegralType => + type T = iType.InternalType + val ct = ClassTag[T](iType.tag.mirror.runtimeClass(iType.tag.tpe)) + new IntegralSequenceImpl(iType)(ct, iType.integral) + + case TimestampType => + new TemporalSequenceImpl[Long](LongType, 1, identity, timeZone) + + case DateType => + new TemporalSequenceImpl[Int](IntegerType, MICROS_PER_DAY, _.toInt, timeZone) + } + + override def eval(input: InternalRow): Any = { + val startVal = start.eval(input) + if (startVal == null) return null + val stopVal = stop.eval(input) + if (stopVal == null) return null + val stepVal = stepOpt.map(_.eval(input)).getOrElse(impl.defaultStep(startVal, stopVal)) + if (stepVal == null) return null + + ArrayData.toArrayData(impl.eval(startVal, stopVal, stepVal)) + } + + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val startGen = start.genCode(ctx) + val stopGen = stop.genCode(ctx) + val stepGen = stepOpt.map(_.genCode(ctx)).getOrElse( + impl.defaultStep.genCode(ctx, startGen, stopGen)) + + val resultType = CodeGenerator.javaType(dataType) + val resultCode = { + val arr = ctx.freshName("arr") + val arrElemType = CodeGenerator.javaType(dataType.elementType) + s""" + |final $arrElemType[] $arr = null; + |${impl.genCode(ctx, startGen.value, stopGen.value, stepGen.value, arr, arrElemType)} + |${ev.value} = UnsafeArrayData.fromPrimitiveArray($arr); + """.stripMargin + } + + if (nullable) { + val nullSafeEval = + startGen.code + 
ctx.nullSafeExec(start.nullable, startGen.isNull) { + stopGen.code + ctx.nullSafeExec(stop.nullable, stopGen.isNull) { + stepGen.code + ctx.nullSafeExec(stepOpt.exists(_.nullable), stepGen.isNull) { + s""" + |${ev.isNull} = false; + |$resultCode + """.stripMargin + } + } + } + ev.copy(code = + code""" + |boolean ${ev.isNull} = true; + |$resultType ${ev.value} = null; + |$nullSafeEval + """.stripMargin) + + } else { + ev.copy(code = + code""" + |${startGen.code} + |${stopGen.code} + |${stepGen.code} + |$resultType ${ev.value} = null; + |$resultCode + """.stripMargin, + isNull = FalseLiteral) + } + } +} + +object Sequence { + + private type LessThanOrEqualFn = (Any, Any) => Boolean + + private class DefaultStep(lteq: LessThanOrEqualFn, stepType: DataType, one: Any) { + private val negativeOne = UnaryMinus(Literal(one)).eval() + + def apply(start: Any, stop: Any): Any = { + if (lteq(start, stop)) one else negativeOne + } + + def genCode(ctx: CodegenContext, startGen: ExprCode, stopGen: ExprCode): ExprCode = { + val Seq(oneVal, negativeOneVal) = Seq(one, negativeOne).map(Literal(_).genCode(ctx).value) + ExprCode.forNonNullValue(JavaCode.expression( + s"${startGen.value} <= ${stopGen.value} ? $oneVal : $negativeOneVal", + stepType)) + } + } + + private trait SequenceImpl { + def eval(start: Any, stop: Any, step: Any): Any + + def genCode( + ctx: CodegenContext, + start: String, + stop: String, + step: String, + arr: String, + elemType: String): String + + val defaultStep: DefaultStep + } + + private class IntegralSequenceImpl[T: ClassTag] + (elemType: IntegralType)(implicit num: Integral[T]) extends SequenceImpl { + + override val defaultStep: DefaultStep = new DefaultStep( + (elemType.ordering.lteq _).asInstanceOf[LessThanOrEqualFn], + elemType, + num.one) + + override def eval(input1: Any, input2: Any, input3: Any): Array[T] = { + import num._ + + val start = input1.asInstanceOf[T] + val stop = input2.asInstanceOf[T] + val step = input3.asInstanceOf[T] + + var i: Int = getSequenceLength(start, stop, step) + val arr = new Array[T](i) + while (i > 0) { + i -= 1 + arr(i) = start + step * num.fromInt(i) + } + arr + } + + override def genCode( + ctx: CodegenContext, + start: String, + stop: String, + step: String, + arr: String, + elemType: String): String = { + val i = ctx.freshName("i") + s""" + |${genSequenceLengthCode(ctx, start, stop, step, i)} + |$arr = new $elemType[$i]; + |while ($i > 0) { + | $i--; + | $arr[$i] = ($elemType) ($start + $step * $i); + |} + """.stripMargin + } + } + + private class TemporalSequenceImpl[T: ClassTag] + (dt: IntegralType, scale: Long, fromLong: Long => T, timeZone: TimeZone) + (implicit num: Integral[T]) extends SequenceImpl { + + override val defaultStep: DefaultStep = new DefaultStep( + (dt.ordering.lteq _).asInstanceOf[LessThanOrEqualFn], + CalendarIntervalType, + new CalendarInterval(0, MICROS_PER_DAY)) + + private val backedSequenceImpl = new IntegralSequenceImpl[T](dt) + private val microsPerMonth = 28 * CalendarInterval.MICROS_PER_DAY + + override def eval(input1: Any, input2: Any, input3: Any): Array[T] = { + val start = input1.asInstanceOf[T] + val stop = input2.asInstanceOf[T] + val step = input3.asInstanceOf[CalendarInterval] + val stepMonths = step.months + val stepMicros = step.microseconds + + if (stepMonths == 0) { + backedSequenceImpl.eval(start, stop, fromLong(stepMicros / scale)) + + } else { + // To estimate the resulted array length we need to make assumptions + // about a month length in microseconds + val intervalStepInMicros = stepMicros 
+ stepMonths * microsPerMonth + val startMicros: Long = num.toLong(start) * scale + val stopMicros: Long = num.toLong(stop) * scale + val maxEstimatedArrayLength = + getSequenceLength(startMicros, stopMicros, intervalStepInMicros) + + val stepSign = if (stopMicros > startMicros) +1 else -1 + val exclusiveItem = stopMicros + stepSign + val arr = new Array[T](maxEstimatedArrayLength) + var t = startMicros + var i = 0 + + while (t < exclusiveItem ^ stepSign < 0) { + arr(i) = fromLong(t / scale) + t = timestampAddInterval(t, stepMonths, stepMicros, timeZone) + i += 1 + } + + // truncate array to the correct length + if (arr.length == i) arr else arr.slice(0, i) + } + } + + override def genCode( + ctx: CodegenContext, + start: String, + stop: String, + step: String, + arr: String, + elemType: String): String = { + val stepMonths = ctx.freshName("stepMonths") + val stepMicros = ctx.freshName("stepMicros") + val stepScaled = ctx.freshName("stepScaled") + val intervalInMicros = ctx.freshName("intervalInMicros") + val startMicros = ctx.freshName("startMicros") + val stopMicros = ctx.freshName("stopMicros") + val arrLength = ctx.freshName("arrLength") + val stepSign = ctx.freshName("stepSign") + val exclusiveItem = ctx.freshName("exclusiveItem") + val t = ctx.freshName("t") + val i = ctx.freshName("i") + val genTimeZone = ctx.addReferenceObj("timeZone", timeZone, classOf[TimeZone].getName) + + val sequenceLengthCode = + s""" + |final long $intervalInMicros = $stepMicros + $stepMonths * ${microsPerMonth}L; + |${genSequenceLengthCode(ctx, startMicros, stopMicros, intervalInMicros, arrLength)} + """.stripMargin + + val timestampAddIntervalCode = + s""" + |$t = org.apache.spark.sql.catalyst.util.DateTimeUtils.timestampAddInterval( + | $t, $stepMonths, $stepMicros, $genTimeZone); + """.stripMargin + + s""" + |final int $stepMonths = $step.months; + |final long $stepMicros = $step.microseconds; + | + |if ($stepMonths == 0) { + | final $elemType $stepScaled = ($elemType) ($stepMicros / ${scale}L); + | ${backedSequenceImpl.genCode(ctx, start, stop, stepScaled, arr, elemType)}; + | + |} else { + | final long $startMicros = $start * ${scale}L; + | final long $stopMicros = $stop * ${scale}L; + | + | $sequenceLengthCode + | + | final int $stepSign = $stopMicros > $startMicros ? +1 : -1; + | final long $exclusiveItem = $stopMicros + $stepSign; + | + | $arr = new $elemType[$arrLength]; + | long $t = $startMicros; + | int $i = 0; + | + | while ($t < $exclusiveItem ^ $stepSign < 0) { + | $arr[$i] = ($elemType) ($t / ${scale}L); + | $timestampAddIntervalCode + | $i += 1; + | } + | + | if ($arr.length > $i) { + | $arr = java.util.Arrays.copyOf($arr, $i); + | } + |} + """.stripMargin + } + } + + private def getSequenceLength[U](start: U, stop: U, step: U)(implicit num: Integral[U]): Int = { + import num._ + require( + (step > num.zero && start <= stop) + || (step < num.zero && start >= stop) + || (step == num.zero && start == stop), + s"Illegal sequence boundaries: $start to $stop by $step") + + val len = if (start == stop) 1L else 1L + (stop.toLong - start.toLong) / step.toLong + + require( + len <= MAX_ROUNDED_ARRAY_LENGTH, + s"Too long sequence: $len. 
Should be <= $MAX_ROUNDED_ARRAY_LENGTH") + + len.toInt + } + + private def genSequenceLengthCode( + ctx: CodegenContext, + start: String, + stop: String, + step: String, + len: String): String = { + val longLen = ctx.freshName("longLen") + s""" + |if (!(($step > 0 && $start <= $stop) || + | ($step < 0 && $start >= $stop) || + | ($step == 0 && $start == $stop))) { + | throw new IllegalArgumentException( + | "Illegal sequence boundaries: " + $start + " to " + $stop + " by " + $step); + |} + |long $longLen = $stop == $start ? 1L : 1L + ((long) $stop - $start) / $step; + |if ($longLen > $MAX_ROUNDED_ARRAY_LENGTH) { + | throw new IllegalArgumentException( + | "Too long sequence: " + $longLen + ". Should be <= $MAX_ROUNDED_ARRAY_LENGTH"); + |} + |int $len = (int) $longLen; + """.stripMargin + } +} + /** * Returns the array containing the given input value (left) count (right) times. */ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala index caea4fb25ff7e..d7744eb4c7dc7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala @@ -17,10 +17,16 @@ package org.apache.spark.sql.catalyst.expressions +import java.sql.{Date, Timestamp} +import java.util.TimeZone + import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.array.ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH +import org.apache.spark.unsafe.types.CalendarInterval class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { @@ -484,6 +490,292 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper ArrayMax(Literal.create(Seq(1.123, 0.1234, 1.121), ArrayType(DoubleType))), 1.123) } + test("Sequence of numbers") { + // test null handling + + checkEvaluation(new Sequence(Literal(null, LongType), Literal(1L)), null) + checkEvaluation(new Sequence(Literal(1L), Literal(null, LongType)), null) + checkEvaluation(new Sequence(Literal(null, LongType), Literal(1L), Literal(1L)), null) + checkEvaluation(new Sequence(Literal(1L), Literal(null, LongType), Literal(1L)), null) + checkEvaluation(new Sequence(Literal(1L), Literal(1L), Literal(null, LongType)), null) + + // test sequence boundaries checking + + checkExceptionInExpression[IllegalArgumentException]( + new Sequence(Literal(Int.MinValue), Literal(Int.MaxValue), Literal(1)), + EmptyRow, s"Too long sequence: 4294967296. 
Should be <= $MAX_ROUNDED_ARRAY_LENGTH") + + checkExceptionInExpression[IllegalArgumentException]( + new Sequence(Literal(1), Literal(2), Literal(0)), EmptyRow, "boundaries: 1 to 2 by 0") + checkExceptionInExpression[IllegalArgumentException]( + new Sequence(Literal(2), Literal(1), Literal(0)), EmptyRow, "boundaries: 2 to 1 by 0") + checkExceptionInExpression[IllegalArgumentException]( + new Sequence(Literal(2), Literal(1), Literal(1)), EmptyRow, "boundaries: 2 to 1 by 1") + checkExceptionInExpression[IllegalArgumentException]( + new Sequence(Literal(1), Literal(2), Literal(-1)), EmptyRow, "boundaries: 1 to 2 by -1") + + // test sequence with one element (zero step or equal start and stop) + + checkEvaluation(new Sequence(Literal(1), Literal(1), Literal(-1)), Seq(1)) + checkEvaluation(new Sequence(Literal(1), Literal(1), Literal(0)), Seq(1)) + checkEvaluation(new Sequence(Literal(1), Literal(1), Literal(1)), Seq(1)) + checkEvaluation(new Sequence(Literal(1), Literal(2), Literal(2)), Seq(1)) + checkEvaluation(new Sequence(Literal(1), Literal(0), Literal(-2)), Seq(1)) + + // test sequence of different integral types (ascending and descending) + + checkEvaluation(new Sequence(Literal(1L), Literal(3L), Literal(1L)), Seq(1L, 2L, 3L)) + checkEvaluation(new Sequence(Literal(-3), Literal(3), Literal(3)), Seq(-3, 0, 3)) + checkEvaluation( + new Sequence(Literal(3.toShort), Literal(-3.toShort), Literal(-3.toShort)), + Seq(3.toShort, 0.toShort, -3.toShort)) + checkEvaluation( + new Sequence(Literal(-1.toByte), Literal(-3.toByte), Literal(-1.toByte)), + Seq(-1.toByte, -2.toByte, -3.toByte)) + } + + test("Sequence of timestamps") { + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2018-01-01 00:00:00")), + Literal(Timestamp.valueOf("2018-01-02 00:00:00")), + Literal(CalendarInterval.fromString("interval 12 hours"))), + Seq( + Timestamp.valueOf("2018-01-01 00:00:00"), + Timestamp.valueOf("2018-01-01 12:00:00"), + Timestamp.valueOf("2018-01-02 00:00:00"))) + + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2018-01-01 00:00:00")), + Literal(Timestamp.valueOf("2018-01-02 00:00:01")), + Literal(CalendarInterval.fromString("interval 12 hours"))), + Seq( + Timestamp.valueOf("2018-01-01 00:00:00"), + Timestamp.valueOf("2018-01-01 12:00:00"), + Timestamp.valueOf("2018-01-02 00:00:00"))) + + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2018-01-02 00:00:00")), + Literal(Timestamp.valueOf("2018-01-01 00:00:00")), + Literal(CalendarInterval.fromString("interval 12 hours").negate())), + Seq( + Timestamp.valueOf("2018-01-02 00:00:00"), + Timestamp.valueOf("2018-01-01 12:00:00"), + Timestamp.valueOf("2018-01-01 00:00:00"))) + + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2018-01-02 00:00:00")), + Literal(Timestamp.valueOf("2017-12-31 23:59:59")), + Literal(CalendarInterval.fromString("interval 12 hours").negate())), + Seq( + Timestamp.valueOf("2018-01-02 00:00:00"), + Timestamp.valueOf("2018-01-01 12:00:00"), + Timestamp.valueOf("2018-01-01 00:00:00"))) + + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2018-01-01 00:00:00")), + Literal(Timestamp.valueOf("2018-03-01 00:00:00")), + Literal(CalendarInterval.fromString("interval 1 month"))), + Seq( + Timestamp.valueOf("2018-01-01 00:00:00"), + Timestamp.valueOf("2018-02-01 00:00:00"), + Timestamp.valueOf("2018-03-01 00:00:00"))) + + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2018-03-01 00:00:00")), + Literal(Timestamp.valueOf("2018-01-01 00:00:00")), + 
Literal(CalendarInterval.fromString("interval 1 month").negate())), + Seq( + Timestamp.valueOf("2018-03-01 00:00:00"), + Timestamp.valueOf("2018-02-01 00:00:00"), + Timestamp.valueOf("2018-01-01 00:00:00"))) + + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2018-03-03 00:00:00")), + Literal(Timestamp.valueOf("2018-01-01 00:00:00")), + Literal(CalendarInterval.fromString("interval 1 month 1 day").negate())), + Seq( + Timestamp.valueOf("2018-03-03 00:00:00"), + Timestamp.valueOf("2018-02-02 00:00:00"), + Timestamp.valueOf("2018-01-01 00:00:00"))) + + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2018-01-31 00:00:00")), + Literal(Timestamp.valueOf("2018-04-30 00:00:00")), + Literal(CalendarInterval.fromString("interval 1 month"))), + Seq( + Timestamp.valueOf("2018-01-31 00:00:00"), + Timestamp.valueOf("2018-02-28 00:00:00"), + Timestamp.valueOf("2018-03-31 00:00:00"), + Timestamp.valueOf("2018-04-30 00:00:00"))) + + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2018-01-01 00:00:00")), + Literal(Timestamp.valueOf("2018-03-01 00:00:00")), + Literal(CalendarInterval.fromString("interval 1 month 1 second"))), + Seq( + Timestamp.valueOf("2018-01-01 00:00:00"), + Timestamp.valueOf("2018-02-01 00:00:01"))) + + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2018-01-01 00:00:00")), + Literal(Timestamp.valueOf("2018-03-01 00:04:06")), + Literal(CalendarInterval.fromString("interval 1 month 2 minutes 3 seconds"))), + Seq( + Timestamp.valueOf("2018-01-01 00:00:00"), + Timestamp.valueOf("2018-02-01 00:02:03"), + Timestamp.valueOf("2018-03-01 00:04:06"))) + + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2018-01-01 00:00:00")), + Literal(Timestamp.valueOf("2023-01-01 00:00:00")), + Literal(CalendarInterval.fromYearMonthString("1-5"))), + Seq( + Timestamp.valueOf("2018-01-01 00:00:00.000"), + Timestamp.valueOf("2019-06-01 00:00:00.000"), + Timestamp.valueOf("2020-11-01 00:00:00.000"), + Timestamp.valueOf("2022-04-01 00:00:00.000"))) + + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2022-04-01 00:00:00")), + Literal(Timestamp.valueOf("2017-01-01 00:00:00")), + Literal(CalendarInterval.fromYearMonthString("1-5").negate())), + Seq( + Timestamp.valueOf("2022-04-01 00:00:00.000"), + Timestamp.valueOf("2020-11-01 00:00:00.000"), + Timestamp.valueOf("2019-06-01 00:00:00.000"), + Timestamp.valueOf("2018-01-01 00:00:00.000"))) + } + + test("Sequence on DST boundaries") { + val timeZone = TimeZone.getTimeZone("Europe/Prague") + val dstOffset = timeZone.getDSTSavings + + def noDST(t: Timestamp): Timestamp = new Timestamp(t.getTime - dstOffset) + + DateTimeTestUtils.withDefaultTimeZone(timeZone) { + // Spring time change + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2018-03-25 01:30:00")), + Literal(Timestamp.valueOf("2018-03-25 03:30:00")), + Literal(CalendarInterval.fromString("interval 30 minutes"))), + Seq( + Timestamp.valueOf("2018-03-25 01:30:00"), + Timestamp.valueOf("2018-03-25 03:00:00"), + Timestamp.valueOf("2018-03-25 03:30:00"))) + + // Autumn time change + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2018-10-28 01:30:00")), + Literal(Timestamp.valueOf("2018-10-28 03:30:00")), + Literal(CalendarInterval.fromString("interval 30 minutes"))), + Seq( + Timestamp.valueOf("2018-10-28 01:30:00"), + noDST(Timestamp.valueOf("2018-10-28 02:00:00")), + noDST(Timestamp.valueOf("2018-10-28 02:30:00")), + Timestamp.valueOf("2018-10-28 02:00:00"), + Timestamp.valueOf("2018-10-28 02:30:00"), + 
Timestamp.valueOf("2018-10-28 03:00:00"), + Timestamp.valueOf("2018-10-28 03:30:00"))) + } + } + + test("Sequence of dates") { + DateTimeTestUtils.withDefaultTimeZone(TimeZone.getTimeZone("UTC")) { + checkEvaluation(new Sequence( + Literal(Date.valueOf("2018-01-01")), + Literal(Date.valueOf("2018-01-05")), + Literal(CalendarInterval.fromString("interval 2 days"))), + Seq( + Date.valueOf("2018-01-01"), + Date.valueOf("2018-01-03"), + Date.valueOf("2018-01-05"))) + + checkEvaluation(new Sequence( + Literal(Date.valueOf("2018-01-01")), + Literal(Date.valueOf("2018-03-01")), + Literal(CalendarInterval.fromString("interval 1 month"))), + Seq( + Date.valueOf("2018-01-01"), + Date.valueOf("2018-02-01"), + Date.valueOf("2018-03-01"))) + + checkEvaluation(new Sequence( + Literal(Date.valueOf("2018-01-31")), + Literal(Date.valueOf("2018-04-30")), + Literal(CalendarInterval.fromString("interval 1 month"))), + Seq( + Date.valueOf("2018-01-31"), + Date.valueOf("2018-02-28"), + Date.valueOf("2018-03-31"), + Date.valueOf("2018-04-30"))) + + checkEvaluation(new Sequence( + Literal(Date.valueOf("2018-01-01")), + Literal(Date.valueOf("2023-01-01")), + Literal(CalendarInterval.fromYearMonthString("1-5"))), + Seq( + Date.valueOf("2018-01-01"), + Date.valueOf("2019-06-01"), + Date.valueOf("2020-11-01"), + Date.valueOf("2022-04-01"))) + + checkExceptionInExpression[IllegalArgumentException]( + new Sequence( + Literal(Date.valueOf("1970-01-02")), + Literal(Date.valueOf("1970-01-01")), + Literal(CalendarInterval.fromString("interval 1 day"))), + EmptyRow, "sequence boundaries: 1 to 0 by 1") + + checkExceptionInExpression[IllegalArgumentException]( + new Sequence( + Literal(Date.valueOf("1970-01-01")), + Literal(Date.valueOf("1970-02-01")), + Literal(CalendarInterval.fromString("interval 1 month").negate())), + EmptyRow, + s"sequence boundaries: 0 to 2678400000000 by -${28 * CalendarInterval.MICROS_PER_DAY}") + } + } + + test("Sequence with default step") { + // +/- 1 for integral type + checkEvaluation(new Sequence(Literal(1), Literal(3)), Seq(1, 2, 3)) + checkEvaluation(new Sequence(Literal(3), Literal(1)), Seq(3, 2, 1)) + + // +/- 1 day for timestamps + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2018-01-01 00:00:00")), + Literal(Timestamp.valueOf("2018-01-03 00:00:00"))), + Seq( + Timestamp.valueOf("2018-01-01 00:00:00"), + Timestamp.valueOf("2018-01-02 00:00:00"), + Timestamp.valueOf("2018-01-03 00:00:00"))) + + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2018-01-03 00:00:00")), + Literal(Timestamp.valueOf("2018-01-01 00:00:00"))), + Seq( + Timestamp.valueOf("2018-01-03 00:00:00"), + Timestamp.valueOf("2018-01-02 00:00:00"), + Timestamp.valueOf("2018-01-01 00:00:00"))) + + // +/- 1 day for dates + checkEvaluation(new Sequence( + Literal(Date.valueOf("2018-01-01")), + Literal(Date.valueOf("2018-01-03"))), + Seq( + Date.valueOf("2018-01-01"), + Date.valueOf("2018-01-02"), + Date.valueOf("2018-01-03"))) + + checkEvaluation(new Sequence( + Literal(Date.valueOf("2018-01-03")), + Literal(Date.valueOf("2018-01-01"))), + Seq( + Date.valueOf("2018-01-03"), + Date.valueOf("2018-01-02"), + Date.valueOf("2018-01-01"))) + } + test("Reverse") { // Primitive-type elements val ai0 = Literal.create(Seq(2, 1, 4, 3), ArrayType(IntegerType)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index ef99ce3ad69d9..0b4f526799578 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -3485,6 +3485,27 @@ object functions { */ def flatten(e: Column): Column = withExpr { Flatten(e.expr) } + /** + * Generate a sequence of integers from start to stop, incrementing by step. + * + * @group collection_funcs + * @since 2.4.0 + */ + def sequence(start: Column, stop: Column, step: Column): Column = withExpr { + new Sequence(start.expr, stop.expr, step.expr) + } + + /** + * Generate a sequence of integers from start to stop, + * incrementing by 1 if start is less than or equal to stop, otherwise -1. + * + * @group collection_funcs + * @since 2.4.0 + */ + def sequence(start: Column, stop: Column): Column = withExpr { + new Sequence(start.expr, stop.expr) + } + /** * Creates an array containing the left argument repeated the number of times given by the * right argument. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index b109898b5bfb3..4c28e2f1cd909 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -18,12 +18,15 @@ package org.apache.spark.sql import java.nio.charset.StandardCharsets +import java.sql.{Date, Timestamp} +import java.util.TimeZone import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext @@ -862,6 +865,59 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { checkAnswer(df.selectExpr("array_max(a)"), answer) } + test("sequence") { + checkAnswer(Seq((-2, 2)).toDF().select(sequence('_1, '_2)), Seq(Row(Array(-2, -1, 0, 1, 2)))) + checkAnswer(Seq((7, 2, -2)).toDF().select(sequence('_1, '_2, '_3)), Seq(Row(Array(7, 5, 3)))) + + checkAnswer( + spark.sql("select sequence(" + + " cast('2018-01-01 00:00:00' as timestamp)" + + ", cast('2018-01-02 00:00:00' as timestamp)" + + ", interval 12 hours)"), + Seq(Row(Array( + Timestamp.valueOf("2018-01-01 00:00:00"), + Timestamp.valueOf("2018-01-01 12:00:00"), + Timestamp.valueOf("2018-01-02 00:00:00"))))) + + DateTimeTestUtils.withDefaultTimeZone(TimeZone.getTimeZone("UTC")) { + checkAnswer( + spark.sql("select sequence(" + + " cast('2018-01-01' as date)" + + ", cast('2018-03-01' as date)" + + ", interval 1 month)"), + Seq(Row(Array( + Date.valueOf("2018-01-01"), + Date.valueOf("2018-02-01"), + Date.valueOf("2018-03-01"))))) + } + + // test type coercion + checkAnswer( + Seq((1.toByte, 3L, 1)).toDF().select(sequence('_1, '_2, '_3)), + Seq(Row(Array(1L, 2L, 3L)))) + + checkAnswer( + spark.sql("select sequence(" + + " cast('2018-01-01' as date)" + + ", cast('2018-01-02 00:00:00' as timestamp)" + + ", interval 12 hours)"), + Seq(Row(Array( + Timestamp.valueOf("2018-01-01 00:00:00"), + Timestamp.valueOf("2018-01-01 12:00:00"), + Timestamp.valueOf("2018-01-02 00:00:00"))))) + + // test invalid data types + intercept[AnalysisException] { + Seq((true, false)).toDF().selectExpr("sequence(_1, _2)") + } + intercept[AnalysisException] { + Seq((true, false, 42)).toDF().selectExpr("sequence(_1, _2, _3)") + } + intercept[AnalysisException] { + Seq((1, 2, 
0.5)).toDF().selectExpr("sequence(_1, _2, _3)") + } + } + test("reverse function") { val dummyFilter = (c: Column) => c.isNull || c.isNotNull // switch codegen on From 9a76f23c6a1756053c30f58baea2966d1b023981 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Wed, 27 Jun 2018 11:52:48 +0800 Subject: [PATCH 15/79] [SPARK-23927][SQL][FOLLOW-UP] Fix a build failure. ## What changes were proposed in this pull request? This pr is a follow-up pr of #21155. The #21155 removed unnecessary import at that time, but the import became necessary in another pr. ## How was this patch tested? Existing tests. Author: Takuya UESHIN Closes #21646 from ueshin/issues/SPARK-23927/fup1. --- .../spark/sql/catalyst/expressions/collectionOperations.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 0395e1ef9a7ad..8b278f067749e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.catalyst.util.DateTimeUtils._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods From a1a64e3583cfa451b4d0d2361c1da2972a5e4444 Mon Sep 17 00:00:00 2001 From: Yuexin Zhang Date: Wed, 27 Jun 2018 16:05:36 +0800 Subject: [PATCH 16/79] [SPARK-21335][DOC] doc changes for disallowed un-aliased subquery use case ## What changes were proposed in this pull request? Document a change for un-aliased subquery use case, to address the last question in PR #18559: https://github.com/apache/spark/pull/18559#issuecomment-316884858 (Please fill in changes proposed in this fix) ## How was this patch tested? it does not affect tests. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Yuexin Zhang Closes #21647 from cnZach/doc_change_for_SPARK-20690_SPARK-21335. --- docs/sql-programming-guide.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 7c4ef41cc8907..cd7329b621122 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -2017,6 +2017,7 @@ working with timestamps in `pandas_udf`s to get the best performance, see - Literal values used in SQL operations are converted to DECIMAL with the exact precision and scale needed by them. - The configuration `spark.sql.decimalOperations.allowPrecisionLoss` has been introduced. It defaults to `true`, which means the new behavior described here; if set to `false`, Spark uses previous rules, ie. it doesn't adjust the needed scale to represent the values and it returns NULL if an exact representation of the value is not possible. - In PySpark, `df.replace` does not allow to omit `value` when `to_replace` is not a dictionary. Previously, `value` could be omitted in the other cases and had `None` by default, which is counterintuitive and error-prone. + - Un-aliased subquery's semantic has not been well defined with confusing behaviors. 
Since Spark 2.3, we invalidate such confusing cases, for example: `SELECT v.i from (SELECT i FROM v)`, Spark will throw an analysis exception in this case because users should not be able to use the qualifier inside a subquery. See [SPARK-20690](https://issues.apache.org/jira/browse/SPARK-20690) and [SPARK-21335](https://issues.apache.org/jira/browse/SPARK-21335) for more details. ## Upgrading From Spark SQL 2.1 to 2.2 From 6a0b77a55d53e74ac0a0892556c3a7a933474948 Mon Sep 17 00:00:00 2001 From: Yuanjian Li Date: Wed, 27 Jun 2018 10:43:06 -0700 Subject: [PATCH 17/79] [SPARK-24215][PYSPARK][FOLLOW UP] Implement eager evaluation for DataFrame APIs in PySpark ## What changes were proposed in this pull request? Address comments in #21370 and add more test. ## How was this patch tested? Enhance test in pyspark/sql/test.py and DataFrameSuite Author: Yuanjian Li Closes #21553 from xuanyuanking/SPARK-24215-follow. --- docs/configuration.md | 27 --------- python/pyspark/sql/dataframe.py | 3 +- python/pyspark/sql/tests.py | 46 ++++++++++++++- .../apache/spark/sql/internal/SQLConf.scala | 23 ++++++++ .../scala/org/apache/spark/sql/Dataset.scala | 11 ++-- .../org/apache/spark/sql/DataFrameSuite.scala | 59 +++++++++++++++++++ 6 files changed, 131 insertions(+), 38 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 6aa7878fe614d..0c7c4472be643 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -456,33 +456,6 @@ Apart from these, the following properties are also available, and may be useful from JVM to Python worker for every task. - - spark.sql.repl.eagerEval.enabled - false - - Enable eager evaluation or not. If true and the REPL you are using supports eager evaluation, - Dataset will be ran automatically. The HTML table which generated by _repl_html_ - called by notebooks like Jupyter will feedback the queries user have defined. For plain Python - REPL, the output will be shown like dataframe.show() - (see SPARK-24215 for more details). - - - - spark.sql.repl.eagerEval.maxNumRows - 20 - - Default number of rows in eager evaluation output HTML table generated by _repr_html_ or plain text, - this only take effect when spark.sql.repl.eagerEval.enabled is set to true. - - - - spark.sql.repl.eagerEval.truncate - 20 - - Default number of truncate in eager evaluation output HTML table generated by _repr_html_ or - plain text, this only take effect when spark.sql.repl.eagerEval.enabled set to true. 
- - spark.files diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 1e6a1acebb5ca..cb3fe448b6fc7 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -393,9 +393,8 @@ def _repr_html_(self): self._support_repr_html = True if self._eager_eval: max_num_rows = max(self._max_num_rows, 0) - vertical = False sock_info = self._jdf.getRowsToPython( - max_num_rows, self._truncate, vertical) + max_num_rows, self._truncate) rows = list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer()))) head = rows[0] row_data = rows[1:] diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 35a0636e5cfc0..8d738069adb3d 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3351,11 +3351,41 @@ def test_checking_csv_header(self): finally: shutil.rmtree(path) - def test_repr_html(self): + def test_repr_behaviors(self): import re pattern = re.compile(r'^ *\|', re.MULTILINE) df = self.spark.createDataFrame([(1, "1"), (22222, "22222")], ("key", "value")) - self.assertEquals(None, df._repr_html_()) + + # test when eager evaluation is enabled and _repr_html_ will not be called + with self.sql_conf({"spark.sql.repl.eagerEval.enabled": True}): + expected1 = """+-----+-----+ + || key|value| + |+-----+-----+ + || 1| 1| + ||22222|22222| + |+-----+-----+ + |""" + self.assertEquals(re.sub(pattern, '', expected1), df.__repr__()) + with self.sql_conf({"spark.sql.repl.eagerEval.truncate": 3}): + expected2 = """+---+-----+ + ||key|value| + |+---+-----+ + || 1| 1| + ||222| 222| + |+---+-----+ + |""" + self.assertEquals(re.sub(pattern, '', expected2), df.__repr__()) + with self.sql_conf({"spark.sql.repl.eagerEval.maxNumRows": 1}): + expected3 = """+---+-----+ + ||key|value| + |+---+-----+ + || 1| 1| + |+---+-----+ + |only showing top 1 row + |""" + self.assertEquals(re.sub(pattern, '', expected3), df.__repr__()) + + # test when eager evaluation is enabled and _repr_html_ will be called with self.sql_conf({"spark.sql.repl.eagerEval.enabled": True}): expected1 = """ | @@ -3381,6 +3411,18 @@ def test_repr_html(self): |""" self.assertEquals(re.sub(pattern, '', expected3), df._repr_html_()) + # test when eager evaluation is disabled and _repr_html_ will be called + with self.sql_conf({"spark.sql.repl.eagerEval.enabled": False}): + expected = "DataFrame[key: bigint, value: string]" + self.assertEquals(None, df._repr_html_()) + self.assertEquals(expected, df.__repr__()) + with self.sql_conf({"spark.sql.repl.eagerEval.truncate": 3}): + self.assertEquals(None, df._repr_html_()) + self.assertEquals(expected, df.__repr__()) + with self.sql_conf({"spark.sql.repl.eagerEval.maxNumRows": 1}): + self.assertEquals(None, df._repr_html_()) + self.assertEquals(expected, df.__repr__()) + class HiveSparkSubmitTests(SparkSubmitTests): diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 239c8266351ae..e1752ff997b69 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1330,6 +1330,29 @@ object SQLConf { "The size function returns null for null input if the flag is disabled.") .booleanConf .createWithDefault(true) + + val REPL_EAGER_EVAL_ENABLED = buildConf("spark.sql.repl.eagerEval.enabled") + .doc("Enables eager evaluation or not. 
When true, the top K rows of Dataset will be " + + "displayed if and only if the REPL supports the eager evaluation. Currently, the " + + "eager evaluation is only supported in PySpark. For the notebooks like Jupyter, " + + "the HTML table (generated by _repr_html_) will be returned. For plain Python REPL, " + + "the returned outputs are formatted like dataframe.show().") + .booleanConf + .createWithDefault(false) + + val REPL_EAGER_EVAL_MAX_NUM_ROWS = buildConf("spark.sql.repl.eagerEval.maxNumRows") + .doc("The max number of rows that are returned by eager evaluation. This only takes " + + "effect when spark.sql.repl.eagerEval.enabled is set to true. The valid range of this " + + "config is from 0 to (Int.MaxValue - 1), so the invalid config like negative and " + + "greater than (Int.MaxValue - 1) will be normalized to 0 and (Int.MaxValue - 1).") + .intConf + .createWithDefault(20) + + val REPL_EAGER_EVAL_TRUNCATE = buildConf("spark.sql.repl.eagerEval.truncate") + .doc("The max number of characters for each cell that is returned by eager evaluation. " + + "This only takes effect when spark.sql.repl.eagerEval.enabled is set to true.") + .intConf + .createWithDefault(20) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 57f1e173211af..2ec236fc75efc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -236,12 +236,10 @@ class Dataset[T] private[sql]( * @param numRows Number of rows to return * @param truncate If set to more than 0, truncates strings to `truncate` characters and * all cells will be aligned right. - * @param vertical If set to true, the rows to return do not need truncate. */ private[sql] def getRows( numRows: Int, - truncate: Int, - vertical: Boolean): Seq[Seq[String]] = { + truncate: Int): Seq[Seq[String]] = { val newDf = toDF() val castCols = newDf.logicalPlan.output.map { col => // Since binary types in top-level schema fields have a specific format to print, @@ -289,7 +287,7 @@ class Dataset[T] private[sql]( vertical: Boolean = false): String = { val numRows = _numRows.max(0).min(Int.MaxValue - 1) // Get rows represented by Seq[Seq[String]], we may get one more line if it has more data. 
- val tmpRows = getRows(numRows, truncate, vertical) + val tmpRows = getRows(numRows, truncate) val hasMoreData = tmpRows.length - 1 > numRows val rows = tmpRows.take(numRows + 1) @@ -3226,11 +3224,10 @@ class Dataset[T] private[sql]( private[sql] def getRowsToPython( _numRows: Int, - truncate: Int, - vertical: Boolean): Array[Any] = { + truncate: Int): Array[Any] = { EvaluatePython.registerPicklers() val numRows = _numRows.max(0).min(Int.MaxValue - 1) - val rows = getRows(numRows, truncate, vertical).map(_.toArray).toArray + val rows = getRows(numRows, truncate).map(_.toArray).toArray val toJava: (Any) => Any = EvaluatePython.toJava(_, ArrayType(ArrayType(StringType))) val iter: Iterator[Array[Byte]] = new SerDeUtil.AutoBatchedPickler( rows.iterator.map(toJava)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 1cc8cb3874c9b..ea00d22bff001 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1044,6 +1044,65 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { testData.select($"*").show(1000) } + test("getRows: truncate = [0, 20]") { + val longString = Array.fill(21)("1").mkString + val df = sparkContext.parallelize(Seq("1", longString)).toDF() + val expectedAnswerForFalse = Seq( + Seq("value"), + Seq("1"), + Seq("111111111111111111111")) + assert(df.getRows(10, 0) === expectedAnswerForFalse) + val expectedAnswerForTrue = Seq( + Seq("value"), + Seq("1"), + Seq("11111111111111111...")) + assert(df.getRows(10, 20) === expectedAnswerForTrue) + } + + test("getRows: truncate = [3, 17]") { + val longString = Array.fill(21)("1").mkString + val df = sparkContext.parallelize(Seq("1", longString)).toDF() + val expectedAnswerForFalse = Seq( + Seq("value"), + Seq("1"), + Seq("111")) + assert(df.getRows(10, 3) === expectedAnswerForFalse) + val expectedAnswerForTrue = Seq( + Seq("value"), + Seq("1"), + Seq("11111111111111...")) + assert(df.getRows(10, 17) === expectedAnswerForTrue) + } + + test("getRows: numRows = 0") { + val expectedAnswer = Seq(Seq("key", "value"), Seq("1", "1")) + assert(testData.select($"*").getRows(0, 20) === expectedAnswer) + } + + test("getRows: array") { + val df = Seq( + (Array(1, 2, 3), Array(1, 2, 3)), + (Array(2, 3, 4), Array(2, 3, 4)) + ).toDF() + val expectedAnswer = Seq( + Seq("_1", "_2"), + Seq("[1, 2, 3]", "[1, 2, 3]"), + Seq("[2, 3, 4]", "[2, 3, 4]")) + assert(df.getRows(10, 20) === expectedAnswer) + } + + test("getRows: binary") { + val df = Seq( + ("12".getBytes(StandardCharsets.UTF_8), "ABC.".getBytes(StandardCharsets.UTF_8)), + ("34".getBytes(StandardCharsets.UTF_8), "12346".getBytes(StandardCharsets.UTF_8)) + ).toDF() + val expectedAnswer = Seq( + Seq("_1", "_2"), + Seq("[31 32]", "[41 42 43 2E]"), + Seq("[33 34]", "[31 32 33 34 36]")) + assert(df.getRows(10, 20) === expectedAnswer) + } + test("showString: truncate = [0, 20]") { val longString = Array.fill(21)("1").mkString val df = sparkContext.parallelize(Seq("1", longString)).toDF() From 78ecb6d457970b136a2e0e0e27d170c84ea28eac Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Wed, 27 Jun 2018 10:57:29 -0700 Subject: [PATCH 18/79] [SPARK-24446][YARN] Properly quote library path for YARN. Because the way YARN executes commands via bash -c, everything needs to be quoted so that the whole command is fully contained inside a bash string and is interpreted correctly when the string is read by bash. 
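For illustration, a minimal sketch (not the patch's exact code) of the kind of prefix the container ultimately needs to run; on top of this, the patch escapes the quotes once more so that they survive the extra round of parsing that `bash -c` adds:

```
// Illustrative sketch only: build an env-var prefix so that a library path containing
// spaces, quotes or '&' is still treated as a single value by the shell.
// LD_LIBRARY_PATH is assumed here; the real code picks the platform-specific name.
object LibraryPathPrefixSketch {
  def prefixFor(libPath: String): String = {
    // Escape embedded double quotes so they do not terminate the quoted value.
    val escaped = libPath.replace("\"", "\\\"")
    // Quote the whole value and append the existing value of the variable.
    "LD_LIBRARY_PATH=\"" + escaped + ":$LD_LIBRARY_PATH\""
  }

  def main(args: Array[String]): Unit = {
    println(prefixFor("/tmp/does not exist:/tmp/quote\":/tmp/ampersand&"))
    // LD_LIBRARY_PATH="/tmp/does not exist:/tmp/quote\":/tmp/ampersand&:$LD_LIBRARY_PATH"
  }
}
```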
This is a bit different than the quoting done when executing things as if typing in a bash shell. Tweaked unit tests to exercise the bad behavior, which would cause existing tests to time out without the fix. Also tested on a real cluster, verifying the shell script created by YARN to run the container. Author: Marcelo Vanzin Closes #21476 from vanzin/SPARK-24446. --- .../org/apache/spark/deploy/yarn/Client.scala | 22 +++++++++++++++++-- .../spark/deploy/yarn/ExecutorRunnable.scala | 11 +++++----- .../deploy/yarn/BaseYarnClusterSuite.scala | 9 ++++++++ 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 7225ff03dc34e..793d012218490 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -899,7 +899,8 @@ private[spark] class Client( val libraryPaths = Seq(sparkConf.get(DRIVER_LIBRARY_PATH), sys.props.get("spark.driver.libraryPath")).flatten if (libraryPaths.nonEmpty) { - prefixEnv = Some(getClusterPath(sparkConf, Utils.libraryPathEnvPrefix(libraryPaths))) + prefixEnv = Some(createLibraryPathPrefix(libraryPaths.mkString(File.pathSeparator), + sparkConf)) } if (sparkConf.get(AM_JAVA_OPTIONS).isDefined) { logWarning(s"${AM_JAVA_OPTIONS.key} will not take effect in cluster mode") @@ -921,7 +922,7 @@ private[spark] class Client( .map(YarnSparkHadoopUtil.escapeForShell) } sparkConf.get(AM_LIBRARY_PATH).foreach { paths => - prefixEnv = Some(getClusterPath(sparkConf, Utils.libraryPathEnvPrefix(Seq(paths)))) + prefixEnv = Some(createLibraryPathPrefix(paths, sparkConf)) } } @@ -1485,6 +1486,23 @@ private object Client extends Logging { YarnAppReport(report.getYarnApplicationState(), report.getFinalApplicationStatus(), diagsOpt) } + /** + * Create a properly quoted and escaped library path string to be added as a prefix to the command + * executed by YARN. This is different from normal quoting / escaping due to YARN executing the + * command through "bash -c". + */ + def createLibraryPathPrefix(libpath: String, conf: SparkConf): String = { + val cmdPrefix = if (Utils.isWindows) { + Utils.libraryPathEnvPrefix(Seq(libpath)) + } else { + val envName = Utils.libraryPathEnvName + // For quotes, escape both the quote and the escape character when encoding in the command + // string. 
+ val quoted = libpath.replace("\"", "\\\\\\\"") + envName + "=\\\"" + quoted + File.pathSeparator + "$" + envName + "\\\"" + } + getClusterPath(conf, cmdPrefix) + } } private[spark] class YarnClusterApplication extends SparkApplication { diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala index a2a18cdff65af..49a0b93aa5c40 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala @@ -131,10 +131,6 @@ private[yarn] class ExecutorRunnable( // Extra options for the JVM val javaOpts = ListBuffer[String]() - // Set the environment variable through a command prefix - // to append to the existing value of the variable - var prefixEnv: Option[String] = None - // Set the JVM memory val executorMemoryString = executorMemory + "m" javaOpts += "-Xmx" + executorMemoryString @@ -144,8 +140,11 @@ private[yarn] class ExecutorRunnable( val subsOpt = Utils.substituteAppNExecIds(opts, appId, executorId) javaOpts ++= Utils.splitCommandString(subsOpt).map(YarnSparkHadoopUtil.escapeForShell) } - sparkConf.get(EXECUTOR_LIBRARY_PATH).foreach { p => - prefixEnv = Some(Client.getClusterPath(sparkConf, Utils.libraryPathEnvPrefix(Seq(p)))) + + // Set the library path through a command prefix to append to the existing value of the + // env variable. + val prefixEnv = sparkConf.get(EXECUTOR_LIBRARY_PATH).map { libPath => + Client.createLibraryPathPrefix(libPath, sparkConf) } javaOpts += "-Djava.io.tmpdir=" + diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala index ac67f2196e0a0..b0abcc9149d08 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala @@ -36,6 +36,7 @@ import org.scalatest.concurrent.Eventually._ import org.apache.spark._ import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging +import org.apache.spark.internal.config._ import org.apache.spark.launcher._ import org.apache.spark.util.Utils @@ -216,6 +217,14 @@ abstract class BaseYarnClusterSuite props.setProperty("spark.driver.extraJavaOptions", "-Dfoo=\"one two three\"") props.setProperty("spark.executor.extraJavaOptions", "-Dfoo=\"one two three\"") + // SPARK-24446: make sure special characters in the library path do not break containers. + if (!Utils.isWindows) { + val libPath = """/tmp/does not exist:$PWD/tmp:/tmp/quote":/tmp/ampersand&""" + props.setProperty(AM_LIBRARY_PATH.key, libPath) + props.setProperty(DRIVER_LIBRARY_PATH.key, libPath) + props.setProperty(EXECUTOR_LIBRARY_PATH.key, libPath) + } + yarnCluster.getConfig().asScala.foreach { e => props.setProperty("spark.hadoop." + e.getKey(), e.getValue()) } From c04cb2d1b72b1edaddf684755f5a9d6aaf00e03b Mon Sep 17 00:00:00 2001 From: debugger87 Date: Wed, 27 Jun 2018 11:34:28 -0700 Subject: [PATCH 19/79] [SPARK-21687][SQL] Spark SQL should set createTime for Hive partition ## What changes were proposed in this pull request? Set createTime for every hive partition created in Spark SQL, which could be used to manage data lifecycle in Hive warehouse. 
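As a rough sketch of the unit handling (illustrative only, not the patch itself): the catalog keeps the partition creation time in milliseconds, while the metastore column `CREATE_TIME` is an integer number of seconds, so the value is divided by 1000 when writing to Hive and multiplied by 1000 when reading back:

```
// Illustrative only: millisecond timestamp on the Spark side vs. second-granularity
// integer stored in the Hive metastore PARTITIONS table.
val createTimeMillis: Long = System.currentTimeMillis()
val metastoreCreateTime: Int = (createTimeMillis / 1000).toInt   // what gets written to Hive
val roundTrippedMillis: Long = metastoreCreateTime.toLong * 1000 // what Spark reads back
```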
We found that almost every partition modified by spark sql has not been set createTime. ``` mysql> select * from partitions where create_time=0 limit 1\G; *************************** 1. row *************************** PART_ID: 1028584 CREATE_TIME: 0 LAST_ACCESS_TIME: 1502203611 PART_NAME: date=20170130 SD_ID: 1543605 TBL_ID: 211605 LINK_TARGET_ID: NULL 1 row in set (0.27 sec) ``` ## How was this patch tested? N/A Author: debugger87 Author: Chaozhong Yang Closes #18900 from debugger87/fix/set-create-time-for-hive-partition. --- .../spark/sql/catalyst/catalog/interface.scala | 6 ++++++ .../sql/catalyst/catalog/SessionCatalogSuite.scala | 6 ++++-- .../results/describe-part-after-analyze.sql.out | 14 ++++++++++++++ .../resources/sql-tests/results/describe.sql.out | 4 ++++ .../sql-tests/results/show-tables.sql.out | 2 ++ .../spark/sql/hive/client/HiveClientImpl.scala | 4 ++++ 6 files changed, 34 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index f3e67dc4e975c..c6105c5526049 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -93,12 +93,16 @@ object CatalogStorageFormat { * @param spec partition spec values indexed by column name * @param storage storage format of the partition * @param parameters some parameters for the partition + * @param createTime creation time of the partition, in milliseconds + * @param lastAccessTime last access time, in milliseconds * @param stats optional statistics (number of rows, total size, etc.) */ case class CatalogTablePartition( spec: CatalogTypes.TablePartitionSpec, storage: CatalogStorageFormat, parameters: Map[String, String] = Map.empty, + createTime: Long = System.currentTimeMillis, + lastAccessTime: Long = -1, stats: Option[CatalogStatistics] = None) { def toLinkedHashMap: mutable.LinkedHashMap[String, String] = { @@ -109,6 +113,8 @@ case class CatalogTablePartition( if (parameters.nonEmpty) { map.put("Partition Parameters", s"{${parameters.map(p => p._1 + "=" + p._2).mkString(", ")}}") } + map.put("Created Time", new Date(createTime).toString) + map.put("Last Access", new Date(lastAccessTime).toString) stats.foreach(s => map.put("Partition Statistics", s.simpleString)) map } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala index 6abab0073cca3..6a7375ee186fa 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala @@ -1114,11 +1114,13 @@ abstract class SessionCatalogSuite extends AnalysisTest { // And for hive serde table, hive metastore will set some values(e.g.transient_lastDdlTime) // in table's parameters and storage's properties, here we also ignore them. 
val actualPartsNormalize = actualParts.map(p => - p.copy(parameters = Map.empty, storage = p.storage.copy( + p.copy(parameters = Map.empty, createTime = -1, lastAccessTime = -1, + storage = p.storage.copy( properties = Map.empty, locationUri = None, serde = None))).toSet val expectedPartsNormalize = expectedParts.map(p => - p.copy(parameters = Map.empty, storage = p.storage.copy( + p.copy(parameters = Map.empty, createTime = -1, lastAccessTime = -1, + storage = p.storage.copy( properties = Map.empty, locationUri = None, serde = None))).toSet actualPartsNormalize == expectedPartsNormalize diff --git a/sql/core/src/test/resources/sql-tests/results/describe-part-after-analyze.sql.out b/sql/core/src/test/resources/sql-tests/results/describe-part-after-analyze.sql.out index 58ed201e2a60f..8ba69c698b551 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe-part-after-analyze.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe-part-after-analyze.sql.out @@ -57,6 +57,8 @@ Database default Table t Partition Values [ds=2017-08-01, hr=10] Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=10 +Created Time [not included in comparison] +Last Access [not included in comparison] # Storage Information Location [not included in comparison]sql/core/spark-warehouse/t @@ -89,6 +91,8 @@ Database default Table t Partition Values [ds=2017-08-01, hr=10] Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=10 +Created Time [not included in comparison] +Last Access [not included in comparison] Partition Statistics 1121 bytes, 3 rows # Storage Information @@ -122,6 +126,8 @@ Database default Table t Partition Values [ds=2017-08-01, hr=10] Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=10 +Created Time [not included in comparison] +Last Access [not included in comparison] Partition Statistics 1121 bytes, 3 rows # Storage Information @@ -147,6 +153,8 @@ Database default Table t Partition Values [ds=2017-08-01, hr=11] Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=11 +Created Time [not included in comparison] +Last Access [not included in comparison] Partition Statistics 1098 bytes, 4 rows # Storage Information @@ -180,6 +188,8 @@ Database default Table t Partition Values [ds=2017-08-01, hr=10] Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=10 +Created Time [not included in comparison] +Last Access [not included in comparison] Partition Statistics 1121 bytes, 3 rows # Storage Information @@ -205,6 +215,8 @@ Database default Table t Partition Values [ds=2017-08-01, hr=11] Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=11 +Created Time [not included in comparison] +Last Access [not included in comparison] Partition Statistics 1098 bytes, 4 rows # Storage Information @@ -230,6 +242,8 @@ Database default Table t Partition Values [ds=2017-09-01, hr=5] Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-09-01/hr=5 +Created Time [not included in comparison] +Last Access [not included in comparison] Partition Statistics 1144 bytes, 2 rows # Storage Information diff --git a/sql/core/src/test/resources/sql-tests/results/describe.sql.out b/sql/core/src/test/resources/sql-tests/results/describe.sql.out index 8c908b7625056..79390cb424444 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe.sql.out @@ -282,6 
+282,8 @@ Table t Partition Values [c=Us, d=1] Location [not included in comparison]sql/core/spark-warehouse/t/c=Us/d=1 Storage Properties [a=1, b=2] +Created Time [not included in comparison] +Last Access [not included in comparison] # Storage Information Num Buckets 2 @@ -311,6 +313,8 @@ Table t Partition Values [c=Us, d=1] Location [not included in comparison]sql/core/spark-warehouse/t/c=Us/d=1 Storage Properties [a=1, b=2] +Created Time [not included in comparison] +Last Access [not included in comparison] # Storage Information Num Buckets 2 diff --git a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out index 975bb06124744..abeb7e18f031e 100644 --- a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out @@ -178,6 +178,8 @@ struct -- !query 14 output showdb show_t1 false Partition Values: [c=Us, d=1] Location [not included in comparison]sql/core/spark-warehouse/showdb.db/show_t1/c=Us/d=1 +Created Time [not included in comparison] +Last Access [not included in comparison] -- !query 15 diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index da9fe2d3088b4..1df46d7431a21 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -995,6 +995,8 @@ private[hive] object HiveClientImpl { tpart.setTableName(ht.getTableName) tpart.setValues(partValues.asJava) tpart.setSd(storageDesc) + tpart.setCreateTime((p.createTime / 1000).toInt) + tpart.setLastAccessTime((p.lastAccessTime / 1000).toInt) tpart.setParameters(mutable.Map(p.parameters.toSeq: _*).asJava) new HivePartition(ht, tpart) } @@ -1019,6 +1021,8 @@ private[hive] object HiveClientImpl { compressed = apiPartition.getSd.isCompressed, properties = Option(apiPartition.getSd.getSerdeInfo.getParameters) .map(_.asScala.toMap).orNull), + createTime = apiPartition.getCreateTime.toLong * 1000, + lastAccessTime = apiPartition.getLastAccessTime.toLong * 1000, parameters = properties, stats = readHiveStats(properties)) } From 776befbfd5b3c317a713d4fa3882cda6264db9ba Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Wed, 27 Jun 2018 14:26:08 -0700 Subject: [PATCH 20/79] [SPARK-24660][SHS] Show correct error pages when downloading logs ## What changes were proposed in this pull request? SHS is showing bad errors when trying to download logs is not successful. This may happen because the requested application doesn't exist or the user doesn't have permissions for it, for instance. The PR fixes the response when errors occur, so that they are displayed properly. ## How was this patch tested? manual tests **Before the patch:** 1. Unauthorized user ![screen shot 2018-06-26 at 3 53 33 pm](https://user-images.githubusercontent.com/8821783/41918118-f8b37e70-795b-11e8-91e8-d0250239f09d.png) 2. Non-existing application ![screen shot 2018-06-26 at 3 25 19 pm](https://user-images.githubusercontent.com/8821783/41918082-e3034c72-795b-11e8-970e-cee4a1eae77f.png) **After the patch** 1. Unauthorized user ![screen shot 2018-06-26 at 3 41 29 pm](https://user-images.githubusercontent.com/8821783/41918155-0d950476-795c-11e8-8d26-7b7ce73e6fe1.png) 2. 
Non-existing application ![screen shot 2018-06-26 at 3 40 37 pm](https://user-images.githubusercontent.com/8821783/41918175-1a14bb88-795c-11e8-91ab-eadf29190a02.png) Author: Marco Gaido Closes #21644 from mgaido91/SPARK-24660. --- .../spark/status/api/v1/ApiRootResource.scala | 30 ++++--------------- .../status/api/v1/JacksonMessageWriter.scala | 5 +--- .../api/v1/OneApplicationResource.scala | 7 ++--- .../scala/org/apache/spark/ui/UIUtils.scala | 5 ++++ 4 files changed, 13 insertions(+), 34 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala index d121068718b8a..84c2ad48f1f27 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala @@ -28,7 +28,7 @@ import org.glassfish.jersey.server.ServerProperties import org.glassfish.jersey.servlet.ServletContainer import org.apache.spark.SecurityManager -import org.apache.spark.ui.SparkUI +import org.apache.spark.ui.{SparkUI, UIUtils} /** * Main entry point for serving spark application metrics as json, using JAX-RS. @@ -148,38 +148,18 @@ private[v1] trait BaseAppResource extends ApiRequestContext { } private[v1] class ForbiddenException(msg: String) extends WebApplicationException( - Response.status(Response.Status.FORBIDDEN).entity(msg).build()) + UIUtils.buildErrorResponse(Response.Status.FORBIDDEN, msg)) private[v1] class NotFoundException(msg: String) extends WebApplicationException( - new NoSuchElementException(msg), - Response - .status(Response.Status.NOT_FOUND) - .entity(ErrorWrapper(msg)) - .build() -) + UIUtils.buildErrorResponse(Response.Status.NOT_FOUND, msg)) private[v1] class ServiceUnavailable(msg: String) extends WebApplicationException( - new ServiceUnavailableException(msg), - Response - .status(Response.Status.SERVICE_UNAVAILABLE) - .entity(ErrorWrapper(msg)) - .build() -) + UIUtils.buildErrorResponse(Response.Status.SERVICE_UNAVAILABLE, msg)) private[v1] class BadParameterException(msg: String) extends WebApplicationException( - new IllegalArgumentException(msg), - Response - .status(Response.Status.BAD_REQUEST) - .entity(ErrorWrapper(msg)) - .build() -) { + UIUtils.buildErrorResponse(Response.Status.BAD_REQUEST, msg)) { def this(param: String, exp: String, actual: String) = { this(raw"""Bad value for parameter "$param". Expected a $exp, got "$actual"""") } } -/** - * Signal to JacksonMessageWriter to not convert the message into json (which would result in an - * extra set of quotes). 
- */ -private[v1] case class ErrorWrapper(s: String) diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala b/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala index 76af33c1a18db..4560d300cb0c8 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala @@ -68,10 +68,7 @@ private[v1] class JacksonMessageWriter extends MessageBodyWriter[Object]{ mediaType: MediaType, multivaluedMap: MultivaluedMap[String, AnyRef], outputStream: OutputStream): Unit = { - t match { - case ErrorWrapper(err) => outputStream.write(err.getBytes(StandardCharsets.UTF_8)) - case _ => mapper.writeValue(outputStream, t) - } + mapper.writeValue(outputStream, t) } override def getSize( diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala index 974697890dd03..32100c5704538 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala @@ -140,11 +140,8 @@ private[v1] class AbstractApplicationResource extends BaseAppResource { .header("Content-Type", MediaType.APPLICATION_OCTET_STREAM) .build() } catch { - case NonFatal(e) => - Response.serverError() - .entity(s"Event logs are not available for app: $appId.") - .status(Response.Status.SERVICE_UNAVAILABLE) - .build() + case NonFatal(_) => + throw new ServiceUnavailable(s"Event logs are not available for app: $appId.") } } diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index 5d015b0531ef6..732b7528f499e 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -21,6 +21,7 @@ import java.net.URLDecoder import java.text.SimpleDateFormat import java.util.{Date, Locale, TimeZone} import javax.servlet.http.HttpServletRequest +import javax.ws.rs.core.{MediaType, Response} import scala.util.control.NonFatal import scala.xml._ @@ -566,4 +567,8 @@ private[spark] object UIUtils extends Logging { NEWLINE_AND_SINGLE_QUOTE_REGEX.replaceAllIn(requestParameter, "")) } } + + def buildErrorResponse(status: Response.Status, msg: String): Response = { + Response.status(status).entity(msg).`type`(MediaType.TEXT_PLAIN).build() + } } From 221d03acca19bdf7a2624a29c180c99f098205d8 Mon Sep 17 00:00:00 2001 From: Sanket Chintapalli Date: Wed, 27 Jun 2018 14:37:19 -0700 Subject: [PATCH 21/79] [SPARK-24533] Typesafe rebranded to lightbend. Changing the build downloads path Typesafe has rebranded to lightbend. Just changing the downloads path to avoid redirection Tested by running build/mvn -DskipTests package Author: Sanket Chintapalli Closes #21636 from redsanket/SPARK-24533. --- build/mvn | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/mvn b/build/mvn index 1405983982d4c..ae4276dbc7e32 100755 --- a/build/mvn +++ b/build/mvn @@ -93,7 +93,7 @@ install_mvn() { install_zinc() { local zinc_path="zinc-0.3.15/bin/zinc" [ ! 
-f "${_DIR}/${zinc_path}" ] && ZINC_INSTALL_FLAG=1 - local TYPESAFE_MIRROR=${TYPESAFE_MIRROR:-https://downloads.typesafe.com} + local TYPESAFE_MIRROR=${TYPESAFE_MIRROR:-https://downloads.lightbend.com} install_app \ "${TYPESAFE_MIRROR}/zinc/0.3.15" \ @@ -109,7 +109,7 @@ install_scala() { # determine the Scala version used in Spark local scala_version=`grep "scala.version" "${_DIR}/../pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'` local scala_bin="${_DIR}/scala-${scala_version}/bin/scala" - local TYPESAFE_MIRROR=${TYPESAFE_MIRROR:-https://downloads.typesafe.com} + local TYPESAFE_MIRROR=${TYPESAFE_MIRROR:-https://downloads.lightbend.com} install_app \ "${TYPESAFE_MIRROR}/scala/${scala_version}" \ From 893ea224cc738766be207c87f4b913fe8fea4c94 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Wed, 27 Jun 2018 15:25:51 -0700 Subject: [PATCH 22/79] [SPARK-24204][SQL] Verify a schema in Json/Orc/ParquetFileFormat ## What changes were proposed in this pull request? This pr added code to verify a schema in Json/Orc/ParquetFileFormat along with CSVFileFormat. ## How was this patch tested? Added verification tests in `FileBasedDataSourceSuite` and `HiveOrcSourceSuite`. Author: Takeshi Yamamuro Closes #21389 from maropu/SPARK-24204. --- .../datasources/DataSourceUtils.scala | 106 +++++++++ .../datasources/csv/CSVFileFormat.scala | 4 +- .../execution/datasources/csv/CSVUtils.scala | 19 -- .../datasources/json/JsonFileFormat.scala | 4 + .../datasources/orc/OrcFileFormat.scala | 4 + .../parquet/ParquetFileFormat.scala | 3 + .../spark/sql/FileBasedDataSourceSuite.scala | 213 +++++++++++++++++- .../execution/datasources/csv/CSVSuite.scala | 33 --- .../spark/sql/hive/orc/OrcFileFormat.scala | 4 + .../sql/hive/orc/HiveOrcSourceSuite.scala | 49 +++- 10 files changed, 383 insertions(+), 56 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala new file mode 100644 index 0000000000000..c5347218c4b40 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat +import org.apache.spark.sql.execution.datasources.json.JsonFileFormat +import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat +import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.types._ + + +object DataSourceUtils { + + /** + * Verify if the schema is supported in datasource in write path. + */ + def verifyWriteSchema(format: FileFormat, schema: StructType): Unit = { + verifySchema(format, schema, isReadPath = false) + } + + /** + * Verify if the schema is supported in datasource in read path. + */ + def verifyReadSchema(format: FileFormat, schema: StructType): Unit = { + verifySchema(format, schema, isReadPath = true) + } + + /** + * Verify if the schema is supported in datasource. This verification should be done + * in a driver side, e.g., `prepareWrite`, `buildReader`, and `buildReaderWithPartitionValues` + * in `FileFormat`. + * + * Unsupported data types of csv, json, orc, and parquet are as follows; + * csv -> R/W: Interval, Null, Array, Map, Struct + * json -> W: Interval + * orc -> W: Interval, Null + * parquet -> R/W: Interval, Null + */ + private def verifySchema(format: FileFormat, schema: StructType, isReadPath: Boolean): Unit = { + def throwUnsupportedException(dataType: DataType): Unit = { + throw new UnsupportedOperationException( + s"$format data source does not support ${dataType.simpleString} data type.") + } + + def verifyType(dataType: DataType): Unit = dataType match { + case BooleanType | ByteType | ShortType | IntegerType | LongType | FloatType | DoubleType | + StringType | BinaryType | DateType | TimestampType | _: DecimalType => + + // All the unsupported types for CSV + case _: NullType | _: CalendarIntervalType | _: StructType | _: ArrayType | _: MapType + if format.isInstanceOf[CSVFileFormat] => + throwUnsupportedException(dataType) + + case st: StructType => st.foreach { f => verifyType(f.dataType) } + + case ArrayType(elementType, _) => verifyType(elementType) + + case MapType(keyType, valueType, _) => + verifyType(keyType) + verifyType(valueType) + + case udt: UserDefinedType[_] => verifyType(udt.sqlType) + + // Interval type not supported in all the write path + case _: CalendarIntervalType if !isReadPath => + throwUnsupportedException(dataType) + + // JSON and ORC don't support an Interval type, but we pass it in read pass + // for back-compatibility. 
+ case _: CalendarIntervalType if format.isInstanceOf[JsonFileFormat] || + format.isInstanceOf[OrcFileFormat] => + + // Interval type not supported in the other read path + case _: CalendarIntervalType => + throwUnsupportedException(dataType) + + // For JSON & ORC backward-compatibility + case _: NullType if format.isInstanceOf[JsonFileFormat] || + (isReadPath && format.isInstanceOf[OrcFileFormat]) => + + // Null type not supported in the other path + case _: NullType => + throwUnsupportedException(dataType) + + // We keep this default case for safeguards + case _ => throwUnsupportedException(dataType) + } + + schema.foreach(field => verifyType(field.dataType)) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala index b90275de9f40a..fa366ccce6b61 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala @@ -66,7 +66,7 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister { job: Job, options: Map[String, String], dataSchema: StructType): OutputWriterFactory = { - CSVUtils.verifySchema(dataSchema) + DataSourceUtils.verifyWriteSchema(this, dataSchema) val conf = job.getConfiguration val csvOptions = new CSVOptions( options, @@ -98,7 +98,7 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister { filters: Seq[Filter], options: Map[String, String], hadoopConf: Configuration): (PartitionedFile) => Iterator[InternalRow] = { - CSVUtils.verifySchema(dataSchema) + DataSourceUtils.verifyReadSchema(this, dataSchema) val broadcastedHadoopConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala index 1012e774118e2..7ce65fa89b02d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala @@ -117,25 +117,6 @@ object CSVUtils { } } - /** - * Verify if the schema is supported in CSV datasource. - */ - def verifySchema(schema: StructType): Unit = { - def verifyType(dataType: DataType): Unit = dataType match { - case ByteType | ShortType | IntegerType | LongType | FloatType | - DoubleType | BooleanType | _: DecimalType | TimestampType | - DateType | StringType => - - case udt: UserDefinedType[_] => verifyType(udt.sqlType) - - case _ => - throw new UnsupportedOperationException( - s"CSV data source does not support ${dataType.simpleString} data type.") - } - - schema.foreach(field => verifyType(field.dataType)) - } - /** * Sample CSV dataset as configured by `samplingRatio`. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala index e9a0b383b5f49..383bff1375a93 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala @@ -65,6 +65,8 @@ class JsonFileFormat extends TextBasedFileFormat with DataSourceRegister { job: Job, options: Map[String, String], dataSchema: StructType): OutputWriterFactory = { + DataSourceUtils.verifyWriteSchema(this, dataSchema) + val conf = job.getConfiguration val parsedOptions = new JSONOptions( options, @@ -96,6 +98,8 @@ class JsonFileFormat extends TextBasedFileFormat with DataSourceRegister { filters: Seq[Filter], options: Map[String, String], hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + DataSourceUtils.verifyReadSchema(this, dataSchema) + val broadcastedHadoopConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index 1de2ca2914c44..df488a748e3e5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -89,6 +89,8 @@ class OrcFileFormat job: Job, options: Map[String, String], dataSchema: StructType): OutputWriterFactory = { + DataSourceUtils.verifyWriteSchema(this, dataSchema) + val orcOptions = new OrcOptions(options, sparkSession.sessionState.conf) val conf = job.getConfiguration @@ -141,6 +143,8 @@ class OrcFileFormat filters: Seq[Filter], options: Map[String, String], hadoopConf: Configuration): (PartitionedFile) => Iterator[InternalRow] = { + DataSourceUtils.verifyReadSchema(this, dataSchema) + if (sparkSession.sessionState.conf.orcFilterPushDown) { OrcFilters.createFilter(dataSchema, filters).foreach { f => OrcInputFormat.setSearchArgument(hadoopConf, f, dataSchema.fieldNames) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 60fc9ec7e1f82..9602a08911dea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -78,6 +78,7 @@ class ParquetFileFormat job: Job, options: Map[String, String], dataSchema: StructType): OutputWriterFactory = { + DataSourceUtils.verifyWriteSchema(this, dataSchema) val parquetOptions = new ParquetOptions(options, sparkSession.sessionState.conf) @@ -302,6 +303,8 @@ class ParquetFileFormat filters: Seq[Filter], options: Map[String, String], hadoopConf: Configuration): (PartitionedFile) => Iterator[InternalRow] = { + DataSourceUtils.verifyReadSchema(this, dataSchema) + hadoopConf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[ParquetReadSupport].getName) hadoopConf.set( ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala index 
06303099f5310..86f9647b4ac4c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala @@ -17,12 +17,14 @@ package org.apache.spark.sql -import java.io.FileNotFoundException +import java.io.{File, FileNotFoundException} +import java.util.Locale import org.apache.hadoop.fs.Path import org.scalatest.BeforeAndAfterAll import org.apache.spark.SparkException +import org.apache.spark.sql.TestingUDT.{IntervalData, IntervalUDT, NullData, NullUDT} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext @@ -202,4 +204,213 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext with Befo } } } + + // Unsupported data types of csv, json, orc, and parquet are as follows; + // csv -> R/W: Interval, Null, Array, Map, Struct + // json -> W: Interval + // orc -> W: Interval, Null + // parquet -> R/W: Interval, Null + test("SPARK-24204 error handling for unsupported Array/Map/Struct types - csv") { + withTempDir { dir => + val csvDir = new File(dir, "csv").getCanonicalPath + var msg = intercept[UnsupportedOperationException] { + Seq((1, "Tesla")).toDF("a", "b").selectExpr("struct(a, b)").write.csv(csvDir) + }.getMessage + assert(msg.contains("CSV data source does not support struct data type")) + + msg = intercept[UnsupportedOperationException] { + val schema = StructType.fromDDL("a struct") + spark.range(1).write.mode("overwrite").csv(csvDir) + spark.read.schema(schema).csv(csvDir).collect() + }.getMessage + assert(msg.contains("CSV data source does not support struct data type")) + + msg = intercept[UnsupportedOperationException] { + Seq((1, Map("Tesla" -> 3))).toDF("id", "cars").write.mode("overwrite").csv(csvDir) + }.getMessage + assert(msg.contains("CSV data source does not support map data type")) + + msg = intercept[UnsupportedOperationException] { + val schema = StructType.fromDDL("a map") + spark.range(1).write.mode("overwrite").csv(csvDir) + spark.read.schema(schema).csv(csvDir).collect() + }.getMessage + assert(msg.contains("CSV data source does not support map data type")) + + msg = intercept[UnsupportedOperationException] { + Seq((1, Array("Tesla", "Chevy", "Ford"))).toDF("id", "brands") + .write.mode("overwrite").csv(csvDir) + }.getMessage + assert(msg.contains("CSV data source does not support array data type")) + + msg = intercept[UnsupportedOperationException] { + val schema = StructType.fromDDL("a array") + spark.range(1).write.mode("overwrite").csv(csvDir) + spark.read.schema(schema).csv(csvDir).collect() + }.getMessage + assert(msg.contains("CSV data source does not support array data type")) + + msg = intercept[UnsupportedOperationException] { + Seq((1, new UDT.MyDenseVector(Array(0.25, 2.25, 4.25)))).toDF("id", "vectors") + .write.mode("overwrite").csv(csvDir) + }.getMessage + assert(msg.contains("CSV data source does not support array data type")) + + msg = intercept[UnsupportedOperationException] { + val schema = StructType(StructField("a", new UDT.MyDenseVectorUDT(), true) :: Nil) + spark.range(1).write.mode("overwrite").csv(csvDir) + spark.read.schema(schema).csv(csvDir).collect() + }.getMessage + assert(msg.contains("CSV data source does not support array data type.")) + } + } + + test("SPARK-24204 error handling for unsupported Interval data types - csv, json, parquet, orc") { + withTempDir { dir => + val tempDir = new File(dir, "files").getCanonicalPath + + // write path + 
Seq("csv", "json", "parquet", "orc").foreach { format => + var msg = intercept[AnalysisException] { + sql("select interval 1 days").write.format(format).mode("overwrite").save(tempDir) + }.getMessage + assert(msg.contains("Cannot save interval data type into external storage.")) + + msg = intercept[UnsupportedOperationException] { + spark.udf.register("testType", () => new IntervalData()) + sql("select testType()").write.format(format).mode("overwrite").save(tempDir) + }.getMessage + assert(msg.toLowerCase(Locale.ROOT) + .contains(s"$format data source does not support calendarinterval data type.")) + } + + // read path + Seq("parquet", "csv").foreach { format => + var msg = intercept[UnsupportedOperationException] { + val schema = StructType(StructField("a", CalendarIntervalType, true) :: Nil) + spark.range(1).write.format(format).mode("overwrite").save(tempDir) + spark.read.schema(schema).format(format).load(tempDir).collect() + }.getMessage + assert(msg.toLowerCase(Locale.ROOT) + .contains(s"$format data source does not support calendarinterval data type.")) + + msg = intercept[UnsupportedOperationException] { + val schema = StructType(StructField("a", new IntervalUDT(), true) :: Nil) + spark.range(1).write.format(format).mode("overwrite").save(tempDir) + spark.read.schema(schema).format(format).load(tempDir).collect() + }.getMessage + assert(msg.toLowerCase(Locale.ROOT) + .contains(s"$format data source does not support calendarinterval data type.")) + } + + // We expect the types below should be passed for backward-compatibility + Seq("orc", "json").foreach { format => + // Interval type + var schema = StructType(StructField("a", CalendarIntervalType, true) :: Nil) + spark.range(1).write.format(format).mode("overwrite").save(tempDir) + spark.read.schema(schema).format(format).load(tempDir).collect() + + // UDT having interval data + schema = StructType(StructField("a", new IntervalUDT(), true) :: Nil) + spark.range(1).write.format(format).mode("overwrite").save(tempDir) + spark.read.schema(schema).format(format).load(tempDir).collect() + } + } + } + + test("SPARK-24204 error handling for unsupported Null data types - csv, parquet, orc") { + withTempDir { dir => + val tempDir = new File(dir, "files").getCanonicalPath + + Seq("orc").foreach { format => + // write path + var msg = intercept[UnsupportedOperationException] { + sql("select null").write.format(format).mode("overwrite").save(tempDir) + }.getMessage + assert(msg.toLowerCase(Locale.ROOT) + .contains(s"$format data source does not support null data type.")) + + msg = intercept[UnsupportedOperationException] { + spark.udf.register("testType", () => new NullData()) + sql("select testType()").write.format(format).mode("overwrite").save(tempDir) + }.getMessage + assert(msg.toLowerCase(Locale.ROOT) + .contains(s"$format data source does not support null data type.")) + + // read path + // We expect the types below should be passed for backward-compatibility + + // Null type + var schema = StructType(StructField("a", NullType, true) :: Nil) + spark.range(1).write.format(format).mode("overwrite").save(tempDir) + spark.read.schema(schema).format(format).load(tempDir).collect() + + // UDT having null data + schema = StructType(StructField("a", new NullUDT(), true) :: Nil) + spark.range(1).write.format(format).mode("overwrite").save(tempDir) + spark.read.schema(schema).format(format).load(tempDir).collect() + } + + Seq("parquet", "csv").foreach { format => + // write path + var msg = intercept[UnsupportedOperationException] { + 
sql("select null").write.format(format).mode("overwrite").save(tempDir) + }.getMessage + assert(msg.toLowerCase(Locale.ROOT) + .contains(s"$format data source does not support null data type.")) + + msg = intercept[UnsupportedOperationException] { + spark.udf.register("testType", () => new NullData()) + sql("select testType()").write.format(format).mode("overwrite").save(tempDir) + }.getMessage + assert(msg.toLowerCase(Locale.ROOT) + .contains(s"$format data source does not support null data type.")) + + // read path + msg = intercept[UnsupportedOperationException] { + val schema = StructType(StructField("a", NullType, true) :: Nil) + spark.range(1).write.format(format).mode("overwrite").save(tempDir) + spark.read.schema(schema).format(format).load(tempDir).collect() + }.getMessage + assert(msg.toLowerCase(Locale.ROOT) + .contains(s"$format data source does not support null data type.")) + + msg = intercept[UnsupportedOperationException] { + val schema = StructType(StructField("a", new NullUDT(), true) :: Nil) + spark.range(1).write.format(format).mode("overwrite").save(tempDir) + spark.read.schema(schema).format(format).load(tempDir).collect() + }.getMessage + assert(msg.toLowerCase(Locale.ROOT) + .contains(s"$format data source does not support null data type.")) + } + } + } +} + +object TestingUDT { + + @SQLUserDefinedType(udt = classOf[IntervalUDT]) + class IntervalData extends Serializable + + class IntervalUDT extends UserDefinedType[IntervalData] { + + override def sqlType: DataType = CalendarIntervalType + override def serialize(obj: IntervalData): Any = + throw new NotImplementedError("Not implemented") + override def deserialize(datum: Any): IntervalData = + throw new NotImplementedError("Not implemented") + override def userClass: Class[IntervalData] = classOf[IntervalData] + } + + @SQLUserDefinedType(udt = classOf[NullUDT]) + private[sql] class NullData extends Serializable + + private[sql] class NullUDT extends UserDefinedType[NullData] { + + override def sqlType: DataType = NullType + override def serialize(obj: NullData): Any = throw new NotImplementedError("Not implemented") + override def deserialize(datum: Any): NullData = + throw new NotImplementedError("Not implemented") + override def userClass: Class[NullData] = classOf[NullData] + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index d2f166c7d1877..365239d040ef2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -740,39 +740,6 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te assert(numbers.count() == 8) } - test("error handling for unsupported data types.") { - withTempDir { dir => - val csvDir = new File(dir, "csv").getCanonicalPath - var msg = intercept[UnsupportedOperationException] { - Seq((1, "Tesla")).toDF("a", "b").selectExpr("struct(a, b)").write.csv(csvDir) - }.getMessage - assert(msg.contains("CSV data source does not support struct data type")) - - msg = intercept[UnsupportedOperationException] { - Seq((1, Map("Tesla" -> 3))).toDF("id", "cars").write.csv(csvDir) - }.getMessage - assert(msg.contains("CSV data source does not support map data type")) - - msg = intercept[UnsupportedOperationException] { - Seq((1, Array("Tesla", "Chevy", "Ford"))).toDF("id", "brands").write.csv(csvDir) - 
}.getMessage - assert(msg.contains("CSV data source does not support array data type")) - - msg = intercept[UnsupportedOperationException] { - Seq((1, new UDT.MyDenseVector(Array(0.25, 2.25, 4.25)))).toDF("id", "vectors") - .write.csv(csvDir) - }.getMessage - assert(msg.contains("CSV data source does not support array data type")) - - msg = intercept[UnsupportedOperationException] { - val schema = StructType(StructField("a", new UDT.MyDenseVectorUDT(), true) :: Nil) - spark.range(1).write.csv(csvDir) - spark.read.schema(schema).csv(csvDir).collect() - }.getMessage - assert(msg.contains("CSV data source does not support array data type.")) - } - } - test("SPARK-15585 turn off quotations") { val cars = spark.read .format("csv") diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala index 237ed9bc05988..dd2144c5fcea8 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala @@ -72,6 +72,8 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable job: Job, options: Map[String, String], dataSchema: StructType): OutputWriterFactory = { + DataSourceUtils.verifyWriteSchema(this, dataSchema) + val orcOptions = new OrcOptions(options, sparkSession.sessionState.conf) val configuration = job.getConfiguration @@ -121,6 +123,8 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable filters: Seq[Filter], options: Map[String, String], hadoopConf: Configuration): (PartitionedFile) => Iterator[InternalRow] = { + DataSourceUtils.verifyReadSchema(this, dataSchema) + if (sparkSession.sessionState.conf.orcFilterPushDown) { // Sets pushed predicates OrcFilters.createFilter(requiredSchema, filters.toArray).foreach { f => diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala index d556a030e2186..69009e1b520c2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala @@ -19,11 +19,13 @@ package org.apache.spark.sql.hive.orc import java.io.File -import org.apache.spark.sql.Row +import org.apache.spark.sql.{AnalysisException, Row} +import org.apache.spark.sql.TestingUDT.{IntervalData, IntervalUDT} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.datasources.orc.OrcSuite import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.HiveSerDe +import org.apache.spark.sql.types._ import org.apache.spark.util.Utils class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton { @@ -133,4 +135,49 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton { Utils.deleteRecursively(location) } } + + test("SPARK-24204 error handling for unsupported data types") { + withTempDir { dir => + val orcDir = new File(dir, "orc").getCanonicalPath + + // write path + var msg = intercept[AnalysisException] { + sql("select interval 1 days").write.mode("overwrite").orc(orcDir) + }.getMessage + assert(msg.contains("Cannot save interval data type into external storage.")) + + msg = intercept[UnsupportedOperationException] { + sql("select null").write.mode("overwrite").orc(orcDir) + }.getMessage + assert(msg.contains("ORC data source does not support null 
data type.")) + + msg = intercept[UnsupportedOperationException] { + spark.udf.register("testType", () => new IntervalData()) + sql("select testType()").write.mode("overwrite").orc(orcDir) + }.getMessage + assert(msg.contains("ORC data source does not support calendarinterval data type.")) + + // read path + msg = intercept[UnsupportedOperationException] { + val schema = StructType(StructField("a", CalendarIntervalType, true) :: Nil) + spark.range(1).write.mode("overwrite").orc(orcDir) + spark.read.schema(schema).orc(orcDir).collect() + }.getMessage + assert(msg.contains("ORC data source does not support calendarinterval data type.")) + + msg = intercept[UnsupportedOperationException] { + val schema = StructType(StructField("a", NullType, true) :: Nil) + spark.range(1).write.mode("overwrite").orc(orcDir) + spark.read.schema(schema).orc(orcDir).collect() + }.getMessage + assert(msg.contains("ORC data source does not support null data type.")) + + msg = intercept[UnsupportedOperationException] { + val schema = StructType(StructField("a", new IntervalUDT(), true) :: Nil) + spark.range(1).write.mode("overwrite").orc(orcDir) + spark.read.schema(schema).orc(orcDir).collect() + }.getMessage + assert(msg.contains("ORC data source does not support calendarinterval data type.")) + } + } } From c5aa54d54b301555bad1ff0653df11293f0033ed Mon Sep 17 00:00:00 2001 From: "Kallman, Steven" Date: Wed, 27 Jun 2018 15:36:59 -0700 Subject: [PATCH 23/79] [SPARK-24553][WEB-UI] http 302 fixes for href redirect ## What changes were proposed in this pull request? Updated URL/href links to include a '/' before '?id' to make links consistent and avoid http 302 redirect errors within UI port 4040 tabs. ## How was this patch tested? Built a runnable distribution and executed jobs. Validated that http 302 redirects are no longer encountered when clicking on links within UI port 4040 tabs. Author: Steven Kallman Author: Kallman, Steven Closes #21600 from SJKallman/{Spark-24553}{WEB-UI}-redirect-href-fixes. 
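The change itself is mechanical: every href that used to end the path segment right before `?id` now carries a trailing slash, so the request matches the handler path directly instead of being answered with a 302 first (that the redirect comes from the UI's servlet container is an assumption based on the observed behaviour, not something this patch states). The AllJobsPage call site below is representative of the other changed pages:

```
// Before: "/jobs/job?id=7" is first bounced via a 302 to the slash-terminated form
val detailUrl = "%s/jobs/job?id=%s".format(basePath, jobData.jobId)

// After: "/jobs/job/?id=7" reaches the page in a single request
val detailUrl = "%s/jobs/job/?id=%s".format(basePath, jobData.jobId)
```
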
--- .../src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala | 2 +- core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala | 2 +- core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala | 2 +- .../org/apache/spark/sql/execution/ui/AllExecutionsPage.scala | 4 ++-- .../org/apache/spark/sql/execution/ui/ExecutionPage.scala | 2 +- .../spark/sql/hive/thriftserver/ui/ThriftServerPage.scala | 4 ++-- .../sql/hive/thriftserver/ui/ThriftServerSessionPage.scala | 2 +- .../main/scala/org/apache/spark/streaming/ui/BatchPage.scala | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala index 178d2c8d1a10a..90e9a7a3630cf 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala @@ -464,7 +464,7 @@ private[ui] class JobDataSource( val jobDescription = UIUtils.makeDescription(lastStageDescription, basePath, plainText = false) - val detailUrl = "%s/jobs/job?id=%s".format(basePath, jobData.jobId) + val detailUrl = "%s/jobs/job/?id=%s".format(basePath, jobData.jobId) new JobTableRowData( jobData, diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index d4e6a7bc3effa..55eb989962668 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -282,7 +282,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val _taskTable = new TaskPagedTable( stageData, UIUtils.prependBaseUri(request, parent.basePath) + - s"/stages/stage?id=${stageId}&attempt=${stageAttemptId}", + s"/stages/stage/?id=${stageId}&attempt=${stageAttemptId}", currentTime, pageSize = taskPageSize, sortColumn = taskSortColumn, diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala index 56e4d6838a99a..d01acdae59c9f 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala @@ -370,7 +370,7 @@ private[ui] class StagePagedTable( Seq.empty } - val nameLinkUri = s"$basePathUri/stages/stage?id=${s.stageId}&attempt=${s.attemptId}" + val nameLinkUri = s"$basePathUri/stages/stage/?id=${s.stageId}&attempt=${s.attemptId}" val nameLink = {s.name} val cachedRddInfos = store.rddList().filter { rdd => s.rddIds.contains(rdd.id) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala index bf46bc4cf904d..a7a24ac3641b5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala @@ -214,11 +214,11 @@ private[ui] abstract class ExecutionTable( } private def jobURL(request: HttpServletRequest, jobId: Long): String = - "%s/jobs/job?id=%s".format(UIUtils.prependBaseUri(request, parent.basePath), jobId) + "%s/jobs/job/?id=%s".format(UIUtils.prependBaseUri(request, parent.basePath), jobId) private def executionURL(request: HttpServletRequest, executionID: Long): String = s"${UIUtils.prependBaseUri( - request, parent.basePath)}/${parent.prefix}/execution?id=$executionID" + request, 
parent.basePath)}/${parent.prefix}/execution/?id=$executionID" } private[ui] class RunningExecutionTable( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala index 282f7b4bb5a58..877176b030f8b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala @@ -122,7 +122,7 @@ class ExecutionPage(parent: SQLTab) extends WebUIPage("execution") with Logging } private def jobURL(request: HttpServletRequest, jobId: Long): String = - "%s/jobs/job?id=%s".format(UIUtils.prependBaseUri(request, parent.basePath), jobId) + "%s/jobs/job/?id=%s".format(UIUtils.prependBaseUri(request, parent.basePath), jobId) private def physicalPlanDescription(physicalPlanDescription: String): Seq[Node] = {
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala index 0950b30126773..771104ceb8842 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala @@ -76,7 +76,7 @@ private[ui] class ThriftServerPage(parent: ThriftServerTab) extends WebUIPage("" def generateDataRow(info: ExecutionInfo): Seq[Node] = { val jobLink = info.jobId.map { id: String => - [{id}] @@ -147,7 +147,7 @@ private[ui] class ThriftServerPage(parent: ThriftServerTab) extends WebUIPage("" val headerRow = Seq("User", "IP", "Session ID", "Start Time", "Finish Time", "Duration", "Total Execute") def generateDataRow(session: SessionInfo): Seq[Node] = { - val sessionLink = "%s/%s/session?id=%s".format( + val sessionLink = "%s/%s/session/?id=%s".format( UIUtils.prependBaseUri(request, parent.basePath), parent.prefix, session.sessionId)
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala index c884aa0ecbdf8..163eb43aabc72 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala @@ -86,7 +86,7 @@ private[ui] class ThriftServerSessionPage(parent: ThriftServerTab) def generateDataRow(info: ExecutionInfo): Seq[Node] = { val jobLink = info.jobId.map { id: String => - [{id}] diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala index ca9da6139649a..884d21d0afdd3 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala @@ -109,7 +109,7 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { flatMap(info => info.failureReason).headOption.getOrElse("") val formattedDuration = duration.map(d => SparkUIUtils.formatDuration(d)).getOrElse("-") val detailUrl = s"${SparkUIUtils.prependBaseUri( - request, parent.basePath)}/jobs/job?id=${sparkJob.jobId}" + request, parent.basePath)}/jobs/job/?id=${sparkJob.jobId}" // In the first row, output op id and its information needs to be shown. In other rows, these // cells will be taken up due to "rowspan". From bd32b509a1728366494cba13f8f6612b7bd46ec0 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Thu, 28 Jun 2018 09:19:25 +0800 Subject: [PATCH 24/79] [SPARK-24645][SQL] Skip parsing when csvColumnPruning enabled and partitions scanned only ## What changes were proposed in this pull request? In the master, when `csvColumnPruning`(implemented in [this commit](https://github.com/apache/spark/commit/64fad0b519cf35b8c0a0dec18dd3df9488a5ed25#diff-d19881aceddcaa5c60620fdcda99b4c4)) enabled and partitions scanned only, it throws an exception below; ``` scala> val dir = "/tmp/spark-csv/csv" scala> spark.range(10).selectExpr("id % 2 AS p", "id").write.mode("overwrite").partitionBy("p").csv(dir) scala> spark.read.csv(dir).selectExpr("sum(p)").collect() 18/06/25 13:12:51 ERROR Executor: Exception in task 0.0 in stage 2.0 (TID 5) java.lang.NullPointerException at org.apache.spark.sql.execution.datasources.csv.UnivocityParser.org$apache$spark$sql$execution$datasources$csv$UnivocityParser$$convert(UnivocityParser.scala:197) at org.apache.spark.sql.execution.datasources.csv.UnivocityParser.parse(UnivocityParser.scala:190) at org.apache.spark.sql.execution.datasources.csv.UnivocityParser$$anonfun$5.apply(UnivocityParser.scala:309) at org.apache.spark.sql.execution.datasources.csv.UnivocityParser$$anonfun$5.apply(UnivocityParser.scala:309) at org.apache.spark.sql.execution.datasources.FailureSafeParser.parse(FailureSafeParser.scala:61) ... ``` This pr modified code to skip CSV parsing in the case. ## How was this patch tested? Added tests in `CSVSuite`. Author: Takeshi Yamamuro Closes #21631 from maropu/SPARK-24645. 
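The fix, visible in the UnivocityParser diff below, decides once when the parser is constructed whether there is anything to convert: if the required schema is empty because only partition attributes are scanned, CSV parsing is skipped and an empty row is returned. A minimal sketch of that approach:

```
import org.apache.spark.sql.catalyst.InternalRow

// Pick the parse function once at construction time rather than
// re-checking the schema (and failing) for every record.
private val doParse: String => InternalRow = if (schema.nonEmpty) {
  (input: String) => convert(tokenizer.parseLine(input))
} else {
  // Only partition columns are required, so there is nothing to convert.
  (_: String) => InternalRow.empty
}

def parse(input: String): InternalRow = doParse(input)
```

Resolving this up front keeps the per-record path branch-free; the new CSVSuite test exercises it with `SQLConf.CSV_PARSER_COLUMN_PRUNING` explicitly enabled.
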
--- .../execution/datasources/csv/UnivocityParser.scala | 10 +++++++++- .../spark/sql/execution/datasources/csv/CSVSuite.scala | 10 ++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index 5f7d5696b71a6..aa545e1a0c00a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -183,11 +183,19 @@ class UnivocityParser( } } + private val doParse = if (schema.nonEmpty) { + (input: String) => convert(tokenizer.parseLine(input)) + } else { + // If `columnPruning` enabled and partition attributes scanned only, + // `schema` gets empty. + (_: String) => InternalRow.empty + } + /** * Parses a single CSV string and turns it into either one resulting row or no row (if the * the record is malformed). */ - def parse(input: String): InternalRow = convert(tokenizer.parseLine(input)) + def parse(input: String): InternalRow = doParse(input) private def convert(tokens: Array[String]): InternalRow = { if (tokens.length != schema.length) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 365239d040ef2..84b91f6309fe8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1569,4 +1569,14 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te assert(testAppender2.events.asScala .exists(msg => msg.getRenderedMessage.contains("CSV header does not conform to the schema"))) } + + test("SPARK-24645 skip parsing when columnPruning enabled and partitions scanned only") { + withSQLConf(SQLConf.CSV_PARSER_COLUMN_PRUNING.key -> "true") { + withTempPath { path => + val dir = path.getAbsolutePath + spark.range(10).selectExpr("id % 2 AS p", "id").write.partitionBy("p").csv(dir) + checkAnswer(spark.read.csv(dir).selectExpr("sum(p)"), Row(5)) + } + } + } } From 1c9acc2438f9a97134ae5213a12112b2361fbb78 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Thu, 28 Jun 2018 09:21:10 +0800 Subject: [PATCH 25/79] [SPARK-24206][SQL][FOLLOW-UP] Update DataSourceReadBenchmark benchmark results ## What changes were proposed in this pull request? This pr corrected the default configuration (`spark.master=local[1]`) for benchmarks. Also, this updated performance results on the AWS `r3.xlarge`. ## How was this patch tested? N/A Author: Takeshi Yamamuro Closes #21625 from maropu/FixDataSourceReadBenchmark. 
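The underlying issue is that `setIfMissing("spark.master", ...)` never takes effect when the benchmark is launched through spark-submit, because `spark.master` is always present there, so earlier runs presumably used whatever parallelism the caller supplied. Overriding the value pins the benchmark to a single core, which is why the published numbers are refreshed as well; a sketch mirroring the diff below:

```
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setAppName("DataSourceReadBenchmark")
  // `spark.master` always exists on submit, so it must be overridden,
  // not merely defaulted, to keep the benchmark on one core.
  .set("spark.master", "local[1]")
  .setIfMissing("spark.driver.memory", "3g")
  .setIfMissing("spark.executor.memory", "3g")
  .setIfMissing("spark.ui.enabled", "false")
```
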
--- .../benchmark/DataSourceReadBenchmark.scala | 296 +++++++++--------- 1 file changed, 152 insertions(+), 144 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala index fc6d8abc03c09..8711f5a8fa1ce 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala @@ -39,9 +39,11 @@ import org.apache.spark.util.{Benchmark, Utils} object DataSourceReadBenchmark { val conf = new SparkConf() .setAppName("DataSourceReadBenchmark") - .setIfMissing("spark.master", "local[1]") + // Since `spark.master` always exists, overrides this value + .set("spark.master", "local[1]") .setIfMissing("spark.driver.memory", "3g") .setIfMissing("spark.executor.memory", "3g") + .setIfMissing("spark.ui.enabled", "false") val spark = SparkSession.builder.config(conf).getOrCreate() @@ -154,73 +156,73 @@ object DataSourceReadBenchmark { } } - /* - Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz + OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 4.14.33-51.37.amzn1.x86_64 + Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz SQL Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - SQL CSV 15231 / 15267 1.0 968.3 1.0X - SQL Json 8476 / 8498 1.9 538.9 1.8X - SQL Parquet Vectorized 121 / 127 130.0 7.7 125.9X - SQL Parquet MR 1515 / 1543 10.4 96.3 10.1X - SQL ORC Vectorized 164 / 171 95.9 10.4 92.9X - SQL ORC Vectorized with copy 228 / 234 69.0 14.5 66.8X - SQL ORC MR 1297 / 1309 12.1 82.5 11.7X + SQL CSV 22964 / 23096 0.7 1460.0 1.0X + SQL Json 8469 / 8593 1.9 538.4 2.7X + SQL Parquet Vectorized 164 / 177 95.8 10.4 139.9X + SQL Parquet MR 1687 / 1706 9.3 107.2 13.6X + SQL ORC Vectorized 191 / 197 82.3 12.2 120.2X + SQL ORC Vectorized with copy 215 / 219 73.2 13.7 106.9X + SQL ORC MR 1392 / 1412 11.3 88.5 16.5X SQL Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - SQL CSV 16344 / 16374 1.0 1039.1 1.0X - SQL Json 8634 / 8648 1.8 548.9 1.9X - SQL Parquet Vectorized 172 / 177 91.5 10.9 95.1X - SQL Parquet MR 1744 / 1746 9.0 110.9 9.4X - SQL ORC Vectorized 189 / 194 83.1 12.0 86.4X - SQL ORC Vectorized with copy 244 / 250 64.5 15.5 67.0X - SQL ORC MR 1341 / 1386 11.7 85.3 12.2X + SQL CSV 24090 / 24097 0.7 1531.6 1.0X + SQL Json 8791 / 8813 1.8 558.9 2.7X + SQL Parquet Vectorized 204 / 212 77.0 13.0 117.9X + SQL Parquet MR 1813 / 1850 8.7 115.3 13.3X + SQL ORC Vectorized 226 / 230 69.7 14.4 106.7X + SQL ORC Vectorized with copy 295 / 298 53.3 18.8 81.6X + SQL ORC MR 1526 / 1549 10.3 97.1 15.8X SQL Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - SQL CSV 17874 / 17875 0.9 1136.4 1.0X - SQL Json 9190 / 9204 1.7 584.3 1.9X - SQL Parquet Vectorized 141 / 160 111.2 9.0 126.4X - SQL Parquet MR 1930 / 2049 8.2 122.7 9.3X - SQL ORC Vectorized 259 / 264 60.7 16.5 69.0X - SQL ORC Vectorized with copy 265 / 272 59.4 16.8 67.5X - SQL ORC MR 1528 / 1569 10.3 97.2 11.7X + SQL CSV 25637 / 25791 0.6 1629.9 1.0X + SQL Json 9532 / 9570 1.7 606.0 2.7X + SQL Parquet Vectorized 181 / 191 86.8 11.5 141.5X + 
SQL Parquet MR 2210 / 2227 7.1 140.5 11.6X + SQL ORC Vectorized 309 / 317 50.9 19.6 83.0X + SQL ORC Vectorized with copy 316 / 322 49.8 20.1 81.2X + SQL ORC MR 1650 / 1680 9.5 104.9 15.5X SQL Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - SQL CSV 22812 / 22839 0.7 1450.4 1.0X - SQL Json 12026 / 12054 1.3 764.6 1.9X - SQL Parquet Vectorized 222 / 227 70.8 14.1 102.6X - SQL Parquet MR 2199 / 2204 7.2 139.8 10.4X - SQL ORC Vectorized 331 / 335 47.6 21.0 69.0X - SQL ORC Vectorized with copy 338 / 343 46.6 21.5 67.6X - SQL ORC MR 1618 / 1622 9.7 102.9 14.1X + SQL CSV 31617 / 31764 0.5 2010.1 1.0X + SQL Json 12440 / 12451 1.3 790.9 2.5X + SQL Parquet Vectorized 284 / 315 55.4 18.0 111.4X + SQL Parquet MR 2382 / 2390 6.6 151.5 13.3X + SQL ORC Vectorized 398 / 403 39.5 25.3 79.5X + SQL ORC Vectorized with copy 410 / 413 38.3 26.1 77.1X + SQL ORC MR 1783 / 1813 8.8 113.4 17.7X SQL Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - SQL CSV 18703 / 18740 0.8 1189.1 1.0X - SQL Json 11779 / 11869 1.3 748.9 1.6X - SQL Parquet Vectorized 143 / 145 110.1 9.1 130.9X - SQL Parquet MR 1954 / 1963 8.0 124.2 9.6X - SQL ORC Vectorized 347 / 355 45.3 22.1 53.8X - SQL ORC Vectorized with copy 356 / 359 44.1 22.7 52.5X - SQL ORC MR 1570 / 1598 10.0 99.8 11.9X + SQL CSV 26679 / 26742 0.6 1696.2 1.0X + SQL Json 12490 / 12541 1.3 794.1 2.1X + SQL Parquet Vectorized 174 / 183 90.4 11.1 153.3X + SQL Parquet MR 2201 / 2223 7.1 140.0 12.1X + SQL ORC Vectorized 415 / 429 37.9 26.4 64.3X + SQL ORC Vectorized with copy 422 / 428 37.2 26.9 63.2X + SQL ORC MR 1767 / 1773 8.9 112.3 15.1X SQL Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - SQL CSV 23832 / 23838 0.7 1515.2 1.0X - SQL Json 16204 / 16226 1.0 1030.2 1.5X - SQL Parquet Vectorized 242 / 306 65.1 15.4 98.6X - SQL Parquet MR 2462 / 2482 6.4 156.5 9.7X - SQL ORC Vectorized 419 / 451 37.6 26.6 56.9X - SQL ORC Vectorized with copy 426 / 447 36.9 27.1 55.9X - SQL ORC MR 1885 / 1931 8.3 119.8 12.6X + SQL CSV 34223 / 34324 0.5 2175.8 1.0X + SQL Json 17784 / 17785 0.9 1130.7 1.9X + SQL Parquet Vectorized 277 / 283 56.7 17.6 123.4X + SQL Parquet MR 2356 / 2386 6.7 149.8 14.5X + SQL ORC Vectorized 533 / 536 29.5 33.9 64.2X + SQL ORC Vectorized with copy 541 / 546 29.1 34.4 63.3X + SQL ORC MR 2166 / 2177 7.3 137.7 15.8X */ sqlBenchmark.run() @@ -294,41 +296,42 @@ object DataSourceReadBenchmark { } /* - Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz + OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 4.14.33-51.37.amzn1.x86_64 + Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - ParquetReader Vectorized 187 / 201 84.2 11.9 1.0X - ParquetReader Vectorized -> Row 101 / 103 156.4 6.4 1.9X + ParquetReader Vectorized 198 / 202 79.4 12.6 1.0X + ParquetReader Vectorized -> Row 119 / 121 132.3 7.6 1.7X Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - ParquetReader Vectorized 272 / 288 57.8 17.3 1.0X - ParquetReader Vectorized -> Row 213 / 219 73.7 13.6 
1.3X + ParquetReader Vectorized 282 / 287 55.8 17.9 1.0X + ParquetReader Vectorized -> Row 246 / 247 64.0 15.6 1.1X Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - ParquetReader Vectorized 252 / 288 62.5 16.0 1.0X - ParquetReader Vectorized -> Row 232 / 246 67.7 14.8 1.1X + ParquetReader Vectorized 258 / 262 60.9 16.4 1.0X + ParquetReader Vectorized -> Row 259 / 260 60.8 16.5 1.0X Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - ParquetReader Vectorized 415 / 454 37.9 26.4 1.0X - ParquetReader Vectorized -> Row 407 / 432 38.6 25.9 1.0X + ParquetReader Vectorized 361 / 369 43.6 23.0 1.0X + ParquetReader Vectorized -> Row 361 / 371 43.6 22.9 1.0X Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - ParquetReader Vectorized 251 / 302 62.7 16.0 1.0X - ParquetReader Vectorized -> Row 220 / 234 71.5 14.0 1.1X + ParquetReader Vectorized 253 / 261 62.2 16.1 1.0X + ParquetReader Vectorized -> Row 254 / 256 61.9 16.2 1.0X Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - ParquetReader Vectorized 432 / 436 36.4 27.5 1.0X - ParquetReader Vectorized -> Row 414 / 422 38.0 26.4 1.0X + ParquetReader Vectorized 357 / 364 44.0 22.7 1.0X + ParquetReader Vectorized -> Row 358 / 366 44.0 22.7 1.0X */ parquetReaderBenchmark.run() } @@ -382,16 +385,17 @@ object DataSourceReadBenchmark { } /* - Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz + OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 4.14.33-51.37.amzn1.x86_64 + Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Int and String Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - SQL CSV 19172 / 19173 0.5 1828.4 1.0X - SQL Json 12799 / 12873 0.8 1220.6 1.5X - SQL Parquet Vectorized 2558 / 2564 4.1 244.0 7.5X - SQL Parquet MR 4514 / 4583 2.3 430.4 4.2X - SQL ORC Vectorized 2561 / 2697 4.1 244.3 7.5X - SQL ORC Vectorized with copy 3076 / 3110 3.4 293.4 6.2X - SQL ORC MR 4197 / 4283 2.5 400.2 4.6X + SQL CSV 27145 / 27158 0.4 2588.7 1.0X + SQL Json 12969 / 13337 0.8 1236.8 2.1X + SQL Parquet Vectorized 2419 / 2448 4.3 230.7 11.2X + SQL Parquet MR 4631 / 4633 2.3 441.7 5.9X + SQL ORC Vectorized 2412 / 2465 4.3 230.0 11.3X + SQL ORC Vectorized with copy 2633 / 2675 4.0 251.1 10.3X + SQL ORC MR 4280 / 4350 2.4 408.2 6.3X */ benchmark.run() } @@ -445,16 +449,17 @@ object DataSourceReadBenchmark { } /* - Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz + OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 4.14.33-51.37.amzn1.x86_64 + Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Repeated String: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - SQL CSV 10889 / 10924 1.0 1038.5 1.0X - SQL Json 7903 / 7931 1.3 753.7 1.4X - SQL Parquet Vectorized 777 / 799 13.5 74.1 14.0X - SQL Parquet MR 1682 / 1708 6.2 160.4 6.5X - SQL ORC Vectorized 532 / 534 19.7 50.7 20.5X - SQL ORC Vectorized with copy 742 / 743 14.1 70.7 14.7X - SQL ORC MR 1996 / 2002 5.3 190.4 5.5X + SQL CSV 17345 / 17424 0.6 1654.1 1.0X + SQL Json 8639 / 8664 1.2 823.9 2.0X + SQL 
Parquet Vectorized 839 / 854 12.5 80.0 20.7X + SQL Parquet MR 1771 / 1775 5.9 168.9 9.8X + SQL ORC Vectorized 550 / 569 19.1 52.4 31.6X + SQL ORC Vectorized with copy 785 / 849 13.4 74.9 22.1X + SQL ORC MR 2168 / 2202 4.8 206.7 8.0X */ benchmark.run() } @@ -574,30 +579,31 @@ object DataSourceReadBenchmark { } /* - Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz + OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 4.14.33-51.37.amzn1.x86_64 + Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Partitioned Table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - Data column - CSV 25428 / 25454 0.6 1616.7 1.0X - Data column - Json 12689 / 12774 1.2 806.7 2.0X - Data column - Parquet Vectorized 222 / 231 70.7 14.1 114.3X - Data column - Parquet MR 3355 / 3397 4.7 213.3 7.6X - Data column - ORC Vectorized 332 / 338 47.4 21.1 76.6X - Data column - ORC Vectorized with copy 338 / 341 46.5 21.5 75.2X - Data column - ORC MR 2329 / 2356 6.8 148.0 10.9X - Partition column - CSV 17465 / 17502 0.9 1110.4 1.5X - Partition column - Json 10865 / 10876 1.4 690.8 2.3X - Partition column - Parquet Vectorized 48 / 52 325.4 3.1 526.1X - Partition column - Parquet MR 1695 / 1696 9.3 107.8 15.0X - Partition column - ORC Vectorized 49 / 54 319.9 3.1 517.2X - Partition column - ORC Vectorized with copy 49 / 52 324.1 3.1 524.0X - Partition column - ORC MR 1548 / 1549 10.2 98.4 16.4X - Both columns - CSV 25568 / 25595 0.6 1625.6 1.0X - Both columns - Json 13658 / 13673 1.2 868.4 1.9X - Both columns - Parquet Vectorized 270 / 296 58.3 17.1 94.3X - Both columns - Parquet MR 3501 / 3521 4.5 222.6 7.3X - Both columns - ORC Vectorized 377 / 380 41.7 24.0 67.4X - Both column - ORC Vectorized with copy 447 / 448 35.2 28.4 56.9X - Both columns - ORC MR 2440 / 2446 6.4 155.2 10.4X + Data column - CSV 32613 / 32841 0.5 2073.4 1.0X + Data column - Json 13343 / 13469 1.2 848.3 2.4X + Data column - Parquet Vectorized 302 / 318 52.1 19.2 108.0X + Data column - Parquet MR 2908 / 2924 5.4 184.9 11.2X + Data column - ORC Vectorized 412 / 425 38.1 26.2 79.1X + Data column - ORC Vectorized with copy 442 / 446 35.6 28.1 73.8X + Data column - ORC MR 2390 / 2396 6.6 152.0 13.6X + Partition column - CSV 9626 / 9683 1.6 612.0 3.4X + Partition column - Json 10909 / 10923 1.4 693.6 3.0X + Partition column - Parquet Vectorized 69 / 76 228.4 4.4 473.6X + Partition column - Parquet MR 1898 / 1933 8.3 120.7 17.2X + Partition column - ORC Vectorized 67 / 74 236.0 4.2 489.4X + Partition column - ORC Vectorized with copy 65 / 72 241.9 4.1 501.6X + Partition column - ORC MR 1743 / 1749 9.0 110.8 18.7X + Both columns - CSV 35523 / 35552 0.4 2258.5 0.9X + Both columns - Json 13676 / 13681 1.2 869.5 2.4X + Both columns - Parquet Vectorized 317 / 326 49.5 20.2 102.7X + Both columns - Parquet MR 3333 / 3336 4.7 211.9 9.8X + Both columns - ORC Vectorized 441 / 446 35.6 28.1 73.9X + Both column - ORC Vectorized with copy 517 / 524 30.4 32.9 63.1X + Both columns - ORC MR 2574 / 2577 6.1 163.6 12.7X */ benchmark.run() } @@ -684,41 +690,42 @@ object DataSourceReadBenchmark { } /* - Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz + OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 4.14.33-51.37.amzn1.x86_64 + Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz String with Nulls Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - SQL CSV 13518 / 13529 0.8 1289.2 1.0X - SQL Json 10895 / 10926 1.0 1039.0 1.2X - 
SQL Parquet Vectorized 1539 / 1581 6.8 146.8 8.8X - SQL Parquet MR 3746 / 3811 2.8 357.3 3.6X - ParquetReader Vectorized 1070 / 1112 9.8 102.0 12.6X - SQL ORC Vectorized 1389 / 1408 7.6 132.4 9.7X - SQL ORC Vectorized with copy 1736 / 1750 6.0 165.6 7.8X - SQL ORC MR 3799 / 3892 2.8 362.3 3.6X + SQL CSV 14875 / 14920 0.7 1418.6 1.0X + SQL Json 10974 / 10992 1.0 1046.5 1.4X + SQL Parquet Vectorized 1711 / 1750 6.1 163.2 8.7X + SQL Parquet MR 3838 / 3884 2.7 366.0 3.9X + ParquetReader Vectorized 1155 / 1168 9.1 110.2 12.9X + SQL ORC Vectorized 1341 / 1380 7.8 127.9 11.1X + SQL ORC Vectorized with copy 1659 / 1716 6.3 158.2 9.0X + SQL ORC MR 3594 / 3634 2.9 342.7 4.1X String with Nulls Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - SQL CSV 10854 / 10892 1.0 1035.2 1.0X - SQL Json 8129 / 8138 1.3 775.3 1.3X - SQL Parquet Vectorized 1053 / 1104 10.0 100.4 10.3X - SQL Parquet MR 2840 / 2854 3.7 270.8 3.8X - ParquetReader Vectorized 978 / 1008 10.7 93.2 11.1X - SQL ORC Vectorized 1312 / 1387 8.0 125.1 8.3X - SQL ORC Vectorized with copy 1764 / 1772 5.9 168.2 6.2X - SQL ORC MR 3435 / 3445 3.1 327.6 3.2X + SQL CSV 17219 / 17264 0.6 1642.1 1.0X + SQL Json 8843 / 8864 1.2 843.3 1.9X + SQL Parquet Vectorized 1169 / 1178 9.0 111.4 14.7X + SQL Parquet MR 2676 / 2697 3.9 255.2 6.4X + ParquetReader Vectorized 1068 / 1071 9.8 101.8 16.1X + SQL ORC Vectorized 1319 / 1319 7.9 125.8 13.1X + SQL ORC Vectorized with copy 1638 / 1639 6.4 156.2 10.5X + SQL ORC MR 3230 / 3257 3.2 308.1 5.3X String with Nulls Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - SQL CSV 8043 / 8048 1.3 767.1 1.0X - SQL Json 4911 / 4923 2.1 468.4 1.6X - SQL Parquet Vectorized 206 / 209 51.0 19.6 39.1X - SQL Parquet MR 1528 / 1537 6.9 145.8 5.3X - ParquetReader Vectorized 216 / 219 48.6 20.6 37.2X - SQL ORC Vectorized 462 / 466 22.7 44.1 17.4X - SQL ORC Vectorized with copy 568 / 572 18.5 54.2 14.2X - SQL ORC MR 1647 / 1649 6.4 157.1 4.9X + SQL CSV 13976 / 14053 0.8 1332.8 1.0X + SQL Json 5166 / 5176 2.0 492.6 2.7X + SQL Parquet Vectorized 274 / 282 38.2 26.2 50.9X + SQL Parquet MR 1553 / 1555 6.8 148.1 9.0X + ParquetReader Vectorized 241 / 246 43.5 23.0 57.9X + SQL ORC Vectorized 476 / 479 22.0 45.4 29.3X + SQL ORC Vectorized with copy 584 / 588 17.9 55.7 23.9X + SQL ORC MR 1720 / 1734 6.1 164.1 8.1X */ benchmark.run() } @@ -773,38 +780,39 @@ object DataSourceReadBenchmark { } /* - Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz + OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 4.14.33-51.37.amzn1.x86_64 + Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Single Column Scan from 10 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - SQL CSV 3663 / 3665 0.3 3493.2 1.0X - SQL Json 3122 / 3160 0.3 2977.5 1.2X - SQL Parquet Vectorized 40 / 42 26.2 38.2 91.5X - SQL Parquet MR 189 / 192 5.5 180.2 19.4X - SQL ORC Vectorized 48 / 51 21.6 46.2 75.6X - SQL ORC Vectorized with copy 49 / 52 21.4 46.7 74.9X - SQL ORC MR 280 / 289 3.7 267.1 13.1X + SQL CSV 3478 / 3481 0.3 3316.4 1.0X + SQL Json 2646 / 2654 0.4 2523.6 1.3X + SQL Parquet Vectorized 67 / 72 15.8 63.5 52.2X + SQL Parquet MR 207 / 214 5.1 197.6 16.8X + SQL ORC Vectorized 69 / 76 15.2 66.0 50.3X + SQL ORC Vectorized with copy 70 / 76 15.0 66.5 49.9X + SQL ORC MR 299 / 303 3.5 285.1 11.6X Single Column 
Scan from 50 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - SQL CSV 11420 / 11505 0.1 10891.1 1.0X - SQL Json 11905 / 12120 0.1 11353.6 1.0X - SQL Parquet Vectorized 50 / 54 20.9 47.8 227.7X - SQL Parquet MR 195 / 199 5.4 185.8 58.6X - SQL ORC Vectorized 61 / 65 17.3 57.8 188.3X - SQL ORC Vectorized with copy 62 / 65 17.0 58.8 185.2X - SQL ORC MR 847 / 865 1.2 807.4 13.5X + SQL CSV 9214 / 9236 0.1 8786.7 1.0X + SQL Json 9943 / 9978 0.1 9482.7 0.9X + SQL Parquet Vectorized 77 / 86 13.6 73.3 119.8X + SQL Parquet MR 229 / 235 4.6 218.6 40.2X + SQL ORC Vectorized 84 / 96 12.5 80.0 109.9X + SQL ORC Vectorized with copy 83 / 91 12.6 79.4 110.7X + SQL ORC MR 843 / 854 1.2 804.0 10.9X - Single Column Scan from 100 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + Single Column Scan from 100 columns Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------- - SQL CSV 21278 / 21404 0.0 20292.4 1.0X - SQL Json 22455 / 22625 0.0 21414.7 0.9X - SQL Parquet Vectorized 73 / 75 14.4 69.3 292.8X - SQL Parquet MR 220 / 226 4.8 209.7 96.8X - SQL ORC Vectorized 82 / 86 12.8 78.2 259.4X - SQL ORC Vectorized with copy 82 / 90 12.7 78.7 258.0X - SQL ORC MR 1568 / 1582 0.7 1495.4 13.6X + SQL CSV 16503 / 16622 0.1 15738.9 1.0X + SQL Json 19109 / 19184 0.1 18224.2 0.9X + SQL Parquet Vectorized 99 / 108 10.6 94.3 166.8X + SQL Parquet MR 253 / 264 4.1 241.6 65.1X + SQL ORC Vectorized 107 / 114 9.8 101.6 154.8X + SQL ORC Vectorized with copy 107 / 118 9.8 102.1 154.1X + SQL ORC MR 1526 / 1529 0.7 1455.3 10.8X */ benchmark.run() } From 6a97e8eb31da76fe5af912a6304c07b63735062f Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Thu, 28 Jun 2018 09:59:00 +0800 Subject: [PATCH 26/79] [SPARK-24603][SQL] Fix findTightestCommonType reference in comments findTightestCommonTypeOfTwo has been renamed to findTightestCommonType ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Fokko Driesprong Closes #21597 from Fokko/fd-typo. --- .../sql/execution/datasources/json/JsonInferSchema.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonInferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonInferSchema.scala index f6edc7bfb3750..8e1b430f4eb33 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonInferSchema.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonInferSchema.scala @@ -334,8 +334,8 @@ private[sql] object JsonInferSchema { ArrayType(compatibleType(elementType1, elementType2), containsNull1 || containsNull2) // The case that given `DecimalType` is capable of given `IntegralType` is handled in - // `findTightestCommonTypeOfTwo`. Both cases below will be executed only when - // the given `DecimalType` is not capable of the given `IntegralType`. + // `findTightestCommonType`. 
Both cases below will be executed only when the given + // `DecimalType` is not capable of the given `IntegralType`. case (t1: IntegralType, t2: DecimalType) => compatibleType(DecimalType.forType(t1), t2) case (t1: DecimalType, t2: IntegralType) => From 5b0596648854c0c733b7c607661b78af7df18b89 Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Thu, 28 Jun 2018 14:19:50 +0800 Subject: [PATCH 27/79] [SPARK-24564][TEST] Add test suite for RecordBinaryComparator ## What changes were proposed in this pull request? Add a new test suite to test RecordBinaryComparator. ## How was this patch tested? New test suite. Author: Xingbo Jiang Closes #21570 from jiangxb1987/rbc-test. --- .../spark/memory/TestMemoryConsumer.java | 10 + .../sort/RecordBinaryComparatorSuite.java | 256 ++++++++++++++++++ 2 files changed, 266 insertions(+) create mode 100644 sql/core/src/test/java/test/org/apache/spark/sql/execution/sort/RecordBinaryComparatorSuite.java diff --git a/core/src/test/java/org/apache/spark/memory/TestMemoryConsumer.java b/core/src/test/java/org/apache/spark/memory/TestMemoryConsumer.java index db91329c94cb6..0bbaea6b834b8 100644 --- a/core/src/test/java/org/apache/spark/memory/TestMemoryConsumer.java +++ b/core/src/test/java/org/apache/spark/memory/TestMemoryConsumer.java @@ -17,6 +17,10 @@ package org.apache.spark.memory; +import com.google.common.annotations.VisibleForTesting; + +import org.apache.spark.unsafe.memory.MemoryBlock; + import java.io.IOException; public class TestMemoryConsumer extends MemoryConsumer { @@ -43,6 +47,12 @@ void free(long size) { used -= size; taskMemoryManager.releaseExecutionMemory(size, this); } + + @VisibleForTesting + public void freePage(MemoryBlock page) { + used -= page.size(); + taskMemoryManager.freePage(page, this); + } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/execution/sort/RecordBinaryComparatorSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/execution/sort/RecordBinaryComparatorSuite.java new file mode 100644 index 0000000000000..a19ddbdbadba2 --- /dev/null +++ b/sql/core/src/test/java/test/org/apache/spark/sql/execution/sort/RecordBinaryComparatorSuite.java @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package test.org.apache.spark.sql.execution.sort; + +import org.apache.spark.SparkConf; +import org.apache.spark.memory.TaskMemoryManager; +import org.apache.spark.memory.TestMemoryConsumer; +import org.apache.spark.memory.TestMemoryManager; +import org.apache.spark.sql.catalyst.expressions.UnsafeArrayData; +import org.apache.spark.sql.catalyst.expressions.UnsafeRow; +import org.apache.spark.sql.execution.RecordBinaryComparator; +import org.apache.spark.unsafe.Platform; +import org.apache.spark.unsafe.UnsafeAlignedOffset; +import org.apache.spark.unsafe.array.LongArray; +import org.apache.spark.unsafe.memory.MemoryBlock; +import org.apache.spark.unsafe.types.UTF8String; +import org.apache.spark.util.collection.unsafe.sort.*; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Test the RecordBinaryComparator, which compares two UnsafeRows by their binary form. + */ +public class RecordBinaryComparatorSuite { + + private final TaskMemoryManager memoryManager = new TaskMemoryManager( + new TestMemoryManager(new SparkConf().set("spark.memory.offHeap.enabled", "false")), 0); + private final TestMemoryConsumer consumer = new TestMemoryConsumer(memoryManager); + + private final int uaoSize = UnsafeAlignedOffset.getUaoSize(); + + private MemoryBlock dataPage; + private long pageCursor; + + private LongArray array; + private int pos; + + @Before + public void beforeEach() { + // Only compare between two input rows. + array = consumer.allocateArray(2); + pos = 0; + + dataPage = memoryManager.allocatePage(4096, consumer); + pageCursor = dataPage.getBaseOffset(); + } + + @After + public void afterEach() { + consumer.freePage(dataPage); + dataPage = null; + pageCursor = 0; + + consumer.freeArray(array); + array = null; + pos = 0; + } + + private void insertRow(UnsafeRow row) { + Object recordBase = row.getBaseObject(); + long recordOffset = row.getBaseOffset(); + int recordLength = row.getSizeInBytes(); + + Object baseObject = dataPage.getBaseObject(); + assert(pageCursor + recordLength <= dataPage.getBaseOffset() + dataPage.size()); + long recordAddress = memoryManager.encodePageNumberAndOffset(dataPage, pageCursor); + UnsafeAlignedOffset.putSize(baseObject, pageCursor, recordLength); + pageCursor += uaoSize; + Platform.copyMemory(recordBase, recordOffset, baseObject, pageCursor, recordLength); + pageCursor += recordLength; + + assert(pos < 2); + array.set(pos, recordAddress); + pos++; + } + + private int compare(int index1, int index2) { + Object baseObject = dataPage.getBaseObject(); + + long recordAddress1 = array.get(index1); + long baseOffset1 = memoryManager.getOffsetInPage(recordAddress1) + uaoSize; + int recordLength1 = UnsafeAlignedOffset.getSize(baseObject, baseOffset1 - uaoSize); + + long recordAddress2 = array.get(index2); + long baseOffset2 = memoryManager.getOffsetInPage(recordAddress2) + uaoSize; + int recordLength2 = UnsafeAlignedOffset.getSize(baseObject, baseOffset2 - uaoSize); + + return binaryComparator.compare(baseObject, baseOffset1, recordLength1, baseObject, + baseOffset2, recordLength2); + } + + private final RecordComparator binaryComparator = new RecordBinaryComparator(); + + // Compute the most compact size for UnsafeRow's backing data. + private int computeSizeInBytes(int originalSize) { + // All the UnsafeRows in this suite contains less than 64 columns, so the bitSetSize shall + // always be 8. + return 8 + (originalSize + 7) / 8 * 8; + } + + // Compute the relative offset of variable-length values. 
+ private long relativeOffset(int numFields) { + // All the UnsafeRows in this suite contains less than 64 columns, so the bitSetSize shall + // always be 8. + return 8 + numFields * 8L; + } + + @Test + public void testBinaryComparatorForSingleColumnRow() throws Exception { + int numFields = 1; + + UnsafeRow row1 = new UnsafeRow(numFields); + byte[] data1 = new byte[100]; + row1.pointTo(data1, computeSizeInBytes(numFields * 8)); + row1.setInt(0, 11); + + UnsafeRow row2 = new UnsafeRow(numFields); + byte[] data2 = new byte[100]; + row2.pointTo(data2, computeSizeInBytes(numFields * 8)); + row2.setInt(0, 42); + + insertRow(row1); + insertRow(row2); + + assert(compare(0, 0) == 0); + assert(compare(0, 1) < 0); + } + + @Test + public void testBinaryComparatorForMultipleColumnRow() throws Exception { + int numFields = 5; + + UnsafeRow row1 = new UnsafeRow(numFields); + byte[] data1 = new byte[100]; + row1.pointTo(data1, computeSizeInBytes(numFields * 8)); + for (int i = 0; i < numFields; i++) { + row1.setDouble(i, i * 3.14); + } + + UnsafeRow row2 = new UnsafeRow(numFields); + byte[] data2 = new byte[100]; + row2.pointTo(data2, computeSizeInBytes(numFields * 8)); + for (int i = 0; i < numFields; i++) { + row2.setDouble(i, 198.7 / (i + 1)); + } + + insertRow(row1); + insertRow(row2); + + assert(compare(0, 0) == 0); + assert(compare(0, 1) < 0); + } + + @Test + public void testBinaryComparatorForArrayColumn() throws Exception { + int numFields = 1; + + UnsafeRow row1 = new UnsafeRow(numFields); + byte[] data1 = new byte[100]; + UnsafeArrayData arrayData1 = UnsafeArrayData.fromPrimitiveArray(new int[]{11, 42, -1}); + row1.pointTo(data1, computeSizeInBytes(numFields * 8 + arrayData1.getSizeInBytes())); + row1.setLong(0, (relativeOffset(numFields) << 32) | (long) arrayData1.getSizeInBytes()); + Platform.copyMemory(arrayData1.getBaseObject(), arrayData1.getBaseOffset(), data1, + row1.getBaseOffset() + relativeOffset(numFields), arrayData1.getSizeInBytes()); + + UnsafeRow row2 = new UnsafeRow(numFields); + byte[] data2 = new byte[100]; + UnsafeArrayData arrayData2 = UnsafeArrayData.fromPrimitiveArray(new int[]{22}); + row2.pointTo(data2, computeSizeInBytes(numFields * 8 + arrayData2.getSizeInBytes())); + row2.setLong(0, (relativeOffset(numFields) << 32) | (long) arrayData2.getSizeInBytes()); + Platform.copyMemory(arrayData2.getBaseObject(), arrayData2.getBaseOffset(), data2, + row2.getBaseOffset() + relativeOffset(numFields), arrayData2.getSizeInBytes()); + + insertRow(row1); + insertRow(row2); + + assert(compare(0, 0) == 0); + assert(compare(0, 1) > 0); + } + + @Test + public void testBinaryComparatorForMixedColumns() throws Exception { + int numFields = 4; + + UnsafeRow row1 = new UnsafeRow(numFields); + byte[] data1 = new byte[100]; + UTF8String str1 = UTF8String.fromString("Milk tea"); + row1.pointTo(data1, computeSizeInBytes(numFields * 8 + str1.numBytes())); + row1.setInt(0, 11); + row1.setDouble(1, 3.14); + row1.setInt(2, -1); + row1.setLong(3, (relativeOffset(numFields) << 32) | (long) str1.numBytes()); + Platform.copyMemory(str1.getBaseObject(), str1.getBaseOffset(), data1, + row1.getBaseOffset() + relativeOffset(numFields), str1.numBytes()); + + UnsafeRow row2 = new UnsafeRow(numFields); + byte[] data2 = new byte[100]; + UTF8String str2 = UTF8String.fromString("Java"); + row2.pointTo(data2, computeSizeInBytes(numFields * 8 + str2.numBytes())); + row2.setInt(0, 11); + row2.setDouble(1, 3.14); + row2.setInt(2, -1); + row2.setLong(3, (relativeOffset(numFields) << 32) | (long) str2.numBytes()); + 
Platform.copyMemory(str2.getBaseObject(), str2.getBaseOffset(), data2, + row2.getBaseOffset() + relativeOffset(numFields), str2.numBytes()); + + insertRow(row1); + insertRow(row2); + + assert(compare(0, 0) == 0); + assert(compare(0, 1) > 0); + } + + @Test + public void testBinaryComparatorForNullColumns() throws Exception { + int numFields = 3; + + UnsafeRow row1 = new UnsafeRow(numFields); + byte[] data1 = new byte[100]; + row1.pointTo(data1, computeSizeInBytes(numFields * 8)); + for (int i = 0; i < numFields; i++) { + row1.setNullAt(i); + } + + UnsafeRow row2 = new UnsafeRow(numFields); + byte[] data2 = new byte[100]; + row2.pointTo(data2, computeSizeInBytes(numFields * 8)); + for (int i = 0; i < numFields - 1; i++) { + row2.setNullAt(i); + } + row2.setDouble(numFields - 1, 3.14); + + insertRow(row1); + insertRow(row2); + + assert(compare(0, 0) == 0); + assert(compare(0, 1) > 0); + } +} From 524827f0626281847582ec3056982db7eb83f8b1 Mon Sep 17 00:00:00 2001 From: bravo-zhang Date: Thu, 28 Jun 2018 12:40:39 -0700 Subject: [PATCH 28/79] [SPARK-14712][ML] LogisticRegressionModel.toString should summarize model ## What changes were proposed in this pull request? [SPARK-14712](https://issues.apache.org/jira/browse/SPARK-14712) spark.mllib LogisticRegressionModel overrides toString to print a little model info. We should do the same in spark.ml and override repr in pyspark. ## How was this patch tested? LogisticRegressionSuite.scala Python doctest in pyspark.ml.classification.py Author: bravo-zhang Closes #18826 from bravo-zhang/spark-14712. --- .../apache/spark/ml/classification/LogisticRegression.scala | 5 +++++ .../spark/ml/classification/LogisticRegressionSuite.scala | 6 ++++++ python/pyspark/ml/classification.py | 5 +++++ python/pyspark/mllib/classification.py | 3 +++ 4 files changed, 19 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 06ca37bc75146..92e342ed4a464 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1202,6 +1202,11 @@ class LogisticRegressionModel private[spark] ( */ @Since("1.6.0") override def write: MLWriter = new LogisticRegressionModel.LogisticRegressionModelWriter(this) + + override def toString: String = { + s"LogisticRegressionModel: " + + s"uid = ${super.toString}, numClasses = $numClasses, numFeatures = $numFeatures" + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 36b7e51f93d01..75c2aeb146786 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -2751,6 +2751,12 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { assert(model.getFamily === family) } } + + test("toString") { + val model = new LogisticRegressionModel("logReg", Vectors.dense(0.1, 0.2, 0.3), 0.0) + val expected = "LogisticRegressionModel: uid = logReg, numClasses = 2, numFeatures = 3" + assert(model.toString === expected) + } } object LogisticRegressionSuite { diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 1754c48937a62..d5963f4f7042c 100644 --- 
a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -239,6 +239,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti True >>> blorModel.intercept == model2.intercept True + >>> model2 + LogisticRegressionModel: uid = ..., numClasses = 2, numFeatures = 2 .. versionadded:: 1.3.0 """ @@ -562,6 +564,9 @@ def evaluate(self, dataset): java_blr_summary = self._call_java("evaluate", dataset) return BinaryLogisticRegressionSummary(java_blr_summary) + def __repr__(self): + return self._call_java("toString") + class LogisticRegressionSummary(JavaWrapper): """ diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index bb281981fd56b..e00ed95ef0701 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -258,6 +258,9 @@ def load(cls, sc, path): model.setThreshold(threshold) return model + def __repr__(self): + return self._call_java("toString") + class LogisticRegressionWithSGD(object): """ From a95a4af76459016b0d52df90adab68a49904da99 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 28 Jun 2018 13:20:08 -0700 Subject: [PATCH 29/79] [SPARK-23120][PYSPARK][ML] Add basic PMML export support to PySpark ## What changes were proposed in this pull request? Adds basic PMML export support for Spark ML stages to PySpark as was previously done in Scala. Includes LinearRegressionModel as the first stage to implement. ## How was this patch tested? Doctest, the main testing work for this is on the Scala side. (TODO holden add the unittest once I finish locally). Author: Holden Karau Closes #21172 from holdenk/SPARK-23120-add-pmml-export-support-to-pyspark. --- python/pyspark/ml/regression.py | 3 ++- python/pyspark/ml/tests.py | 17 ++++++++++++ python/pyspark/ml/util.py | 46 +++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index dba0e57b01a0b..83f0edb397271 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -95,6 +95,7 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction True >>> model.numFeatures 1 + >>> model.write().format("pmml").save(model_path + "_2") .. versionadded:: 1.4.0 """ @@ -161,7 +162,7 @@ def getEpsilon(self): return self.getOrDefault(self.epsilon) -class LinearRegressionModel(JavaModel, JavaPredictionModel, JavaMLWritable, JavaMLReadable): +class LinearRegressionModel(JavaModel, JavaPredictionModel, GeneralJavaMLWritable, JavaMLReadable): """ Model fitted by :class:`LinearRegression`. diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index ebd36cbb5f7a7..bc782138292bf 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -1362,6 +1362,23 @@ def test_linear_regression(self): except OSError: pass + def test_linear_regression_pmml_basic(self): + # Most of the validation is done in the Scala side, here we just check + # that we output text rather than parquet (e.g. that the format flag + # was respected). 
+ df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)), + (0.0, 2.0, Vectors.sparse(1, [], []))], + ["label", "weight", "features"]) + lr = LinearRegression(maxIter=1) + model = lr.fit(df) + path = tempfile.mkdtemp() + lr_path = path + "/lr-pmml" + model.write().format("pmml").save(lr_path) + pmml_text_list = self.sc.textFile(lr_path).collect() + pmml_text = "\n".join(pmml_text_list) + self.assertIn("Apache Spark", pmml_text) + self.assertIn("PMML", pmml_text) + def test_logistic_regression(self): lr = LogisticRegression(maxIter=1) path = tempfile.mkdtemp() diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py index 9fa85664939b8..080cd299f4fde 100644 --- a/python/pyspark/ml/util.py +++ b/python/pyspark/ml/util.py @@ -148,6 +148,23 @@ def overwrite(self): return self +@inherit_doc +class GeneralMLWriter(MLWriter): + """ + Utility class that can save ML instances in different formats. + + .. versionadded:: 2.4.0 + """ + + def format(self, source): + """ + Specifies the format of ML export (e.g. "pmml", "internal", or the fully qualified class + name for export). + """ + self.source = source + return self + + @inherit_doc class JavaMLWriter(MLWriter): """ @@ -192,6 +209,24 @@ def session(self, sparkSession): return self +@inherit_doc +class GeneralJavaMLWriter(JavaMLWriter): + """ + (Private) Specialization of :py:class:`GeneralMLWriter` for :py:class:`JavaParams` types + """ + + def __init__(self, instance): + super(GeneralJavaMLWriter, self).__init__(instance) + + def format(self, source): + """ + Specifies the format of ML export (e.g. "pmml", "internal", or the fully qualified class + name for export). + """ + self._jwrite.format(source) + return self + + @inherit_doc class MLWritable(object): """ @@ -220,6 +255,17 @@ def write(self): return JavaMLWriter(self) +@inherit_doc +class GeneralJavaMLWritable(JavaMLWritable): + """ + (Private) Mixin for ML instances that provide :py:class:`GeneralJavaMLWriter`. + """ + + def write(self): + """Returns an GeneralMLWriter instance for this ML instance.""" + return GeneralJavaMLWriter(self) + + @inherit_doc class MLReader(BaseReadWrite): """ From e1d3f80103f6df2eb8a962607dd5427df4b355dd Mon Sep 17 00:00:00 2001 From: Jacek Laskowski Date: Thu, 28 Jun 2018 13:22:52 -0700 Subject: [PATCH 30/79] [SPARK-24408][SQL][DOC] Move abs function to math_funcs group ## What changes were proposed in this pull request? A few math functions (`abs` , `bitwiseNOT`, `isnan`, `nanvl`) are not in **math_funcs** group. They should really be. ## How was this patch tested? Awaiting Jenkins Author: Jacek Laskowski Closes #21448 from jaceklaskowski/SPARK-24408-math-funcs-doc. --- .../scala/org/apache/spark/sql/functions.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 0b4f526799578..acca9572cb14c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -1031,14 +1031,6 @@ object functions { // Non-aggregate functions ////////////////////////////////////////////////////////////////////////////////////////////// - /** - * Computes the absolute value. - * - * @group normal_funcs - * @since 1.3.0 - */ - def abs(e: Column): Column = withExpr { Abs(e.expr) } - /** * Creates a new array column. The input columns must all have the same data type. 
* @@ -1336,7 +1328,7 @@ object functions { } /** - * Computes bitwise NOT. + * Computes bitwise NOT (~) of a number. * * @group normal_funcs * @since 1.4.0 @@ -1364,6 +1356,14 @@ object functions { // Math Functions ////////////////////////////////////////////////////////////////////////////////////////////// + /** + * Computes the absolute value of a numeric value. + * + * @group math_funcs + * @since 1.3.0 + */ + def abs(e: Column): Column = withExpr { Abs(e.expr) } + /** * @return inverse cosine of `e` in radians, as if computed by `java.lang.Math.acos` * From 2224861f2f93830d736b625c9a4cb72c918512b2 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 28 Jun 2018 14:07:28 -0700 Subject: [PATCH 31/79] [SPARK-24439][ML][PYTHON] Add distanceMeasure to BisectingKMeans in PySpark ## What changes were proposed in this pull request? add distanceMeasure to BisectingKMeans in Python. ## How was this patch tested? added doctest and also manually tested it. Author: Huaxin Gao Closes #21557 from huaxingao/spark-24439. --- python/pyspark/ml/clustering.py | 35 +++++++++++++------ .../ml/param/_shared_params_code_gen.py | 4 ++- python/pyspark/ml/param/shared.py | 24 +++++++++++++ 3 files changed, 51 insertions(+), 12 deletions(-) diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 4aa1cf84b5824..6d77baf7349e4 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -349,8 +349,8 @@ def summary(self): @inherit_doc -class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol, HasSeed, - JavaMLWritable, JavaMLReadable): +class KMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol, HasMaxIter, + HasTol, HasSeed, JavaMLWritable, JavaMLReadable): """ K-means clustering with a k-means++ like initialization mode (the k-means|| algorithm by Bahmani et al). @@ -406,9 +406,6 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol typeConverter=TypeConverters.toString) initSteps = Param(Params._dummy(), "initSteps", "The number of steps for k-means|| " + "initialization mode. Must be > 0.", typeConverter=TypeConverters.toInt) - distanceMeasure = Param(Params._dummy(), "distanceMeasure", "The distance measure. " + - "Supported options: 'euclidean' and 'cosine'.", - typeConverter=TypeConverters.toString) @keyword_only def __init__(self, featuresCol="features", predictionCol="prediction", k=2, @@ -544,8 +541,8 @@ def summary(self): @inherit_doc -class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasSeed, - JavaMLWritable, JavaMLReadable): +class BisectingKMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol, + HasMaxIter, HasSeed, JavaMLWritable, JavaMLReadable): """ A bisecting k-means algorithm based on the paper "A comparison of document clustering techniques" by Steinbach, Karypis, and Kumar, with modification to fit Spark. 
@@ -585,6 +582,8 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte >>> bkm2 = BisectingKMeans.load(bkm_path) >>> bkm2.getK() 2 + >>> bkm2.getDistanceMeasure() + 'euclidean' >>> model_path = temp_path + "/bkm_model" >>> model.save(model_path) >>> model2 = BisectingKMeansModel.load(model_path) @@ -607,10 +606,10 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte @keyword_only def __init__(self, featuresCol="features", predictionCol="prediction", maxIter=20, - seed=None, k=4, minDivisibleClusterSize=1.0): + seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure="euclidean"): """ __init__(self, featuresCol="features", predictionCol="prediction", maxIter=20, \ - seed=None, k=4, minDivisibleClusterSize=1.0) + seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure="euclidean") """ super(BisectingKMeans, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.BisectingKMeans", @@ -622,10 +621,10 @@ def __init__(self, featuresCol="features", predictionCol="prediction", maxIter=2 @keyword_only @since("2.0.0") def setParams(self, featuresCol="features", predictionCol="prediction", maxIter=20, - seed=None, k=4, minDivisibleClusterSize=1.0): + seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure="euclidean"): """ setParams(self, featuresCol="features", predictionCol="prediction", maxIter=20, \ - seed=None, k=4, minDivisibleClusterSize=1.0) + seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure="euclidean") Sets params for BisectingKMeans. """ kwargs = self._input_kwargs @@ -659,6 +658,20 @@ def getMinDivisibleClusterSize(self): """ return self.getOrDefault(self.minDivisibleClusterSize) + @since("2.4.0") + def setDistanceMeasure(self, value): + """ + Sets the value of :py:attr:`distanceMeasure`. + """ + return self._set(distanceMeasure=value) + + @since("2.4.0") + def getDistanceMeasure(self): + """ + Gets the value of `distanceMeasure` or its default value. + """ + return self.getOrDefault(self.distanceMeasure) + def _create_model(self, java_model): return BisectingKMeansModel(java_model) diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 6e9e0a34cdfde..e45ba840b412b 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -162,7 +162,9 @@ def get$Name(self): "fitting. If set to true, then all sub-models will be available. Warning: For large " + "models, collecting all sub-models can cause OOMs on the Spark driver.", "False", "TypeConverters.toBoolean"), - ("loss", "the loss function to be optimized.", None, "TypeConverters.toString")] + ("loss", "the loss function to be optimized.", None, "TypeConverters.toString"), + ("distanceMeasure", "the distance measure. Supported options: 'euclidean' and 'cosine'.", + "'euclidean'", "TypeConverters.toString")] code = [] for name, doc, defaultValueStr, typeConverter in shared: diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index 08408ee8fbfcc..618f5bf0a8103 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -790,3 +790,27 @@ def getCacheNodeIds(self): """ return self.getOrDefault(self.cacheNodeIds) + +class HasDistanceMeasure(Params): + """ + Mixin for param distanceMeasure: the distance measure. Supported options: 'euclidean' and 'cosine'. 
+ """ + + distanceMeasure = Param(Params._dummy(), "distanceMeasure", "the distance measure. Supported options: 'euclidean' and 'cosine'.", typeConverter=TypeConverters.toString) + + def __init__(self): + super(HasDistanceMeasure, self).__init__() + self._setDefault(distanceMeasure='euclidean') + + def setDistanceMeasure(self, value): + """ + Sets the value of :py:attr:`distanceMeasure`. + """ + return self._set(distanceMeasure=value) + + def getDistanceMeasure(self): + """ + Gets the value of distanceMeasure or its default value. + """ + return self.getOrDefault(self.distanceMeasure) + From f6e6899a8b8af99cd06e84cae7c69e0fc35bc60a Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Thu, 28 Jun 2018 16:25:40 -0700 Subject: [PATCH 32/79] [SPARK-24386][SS] coalesce(1) aggregates in continuous processing ## What changes were proposed in this pull request? Provide a continuous processing implementation of coalesce(1), as well as allowing aggregates on top of it. The changes in ContinuousQueuedDataReader and such are to use split.index (the ID of the partition within the RDD currently being compute()d) rather than context.partitionId() (the partition ID of the scheduled task within the Spark job - that is, the post coalesce writer). In the absence of a narrow dependency, these values were previously always the same, so there was no need to distinguish. ## How was this patch tested? new unit test Author: Jose Torres Closes #21560 from jose-torres/coalesce. --- .../UnsupportedOperationChecker.scala | 11 ++ .../datasources/v2/DataSourceV2Strategy.scala | 16 ++- .../continuous/ContinuousCoalesceExec.scala | 51 +++++++ .../continuous/ContinuousCoalesceRDD.scala | 136 ++++++++++++++++++ .../continuous/ContinuousDataSourceRDD.scala | 7 +- .../continuous/ContinuousExecution.scala | 4 + .../ContinuousQueuedDataReader.scala | 6 +- .../shuffle/ContinuousShuffleReadRDD.scala | 10 +- .../shuffle/RPCContinuousShuffleReader.scala | 4 +- .../sources/ContinuousMemoryStream.scala | 11 +- .../ContinuousAggregationSuite.scala | 63 +++++++- .../ContinuousQueuedDataReaderSuite.scala | 2 +- .../shuffle/ContinuousShuffleSuite.scala | 7 +- 13 files changed, 310 insertions(+), 18 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousCoalesceExec.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousCoalesceRDD.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala index 2bed41672fe33..5ced1ca200daa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala @@ -349,6 +349,17 @@ object UnsupportedOperationChecker { _: DeserializeToObject | _: SerializeFromObject | _: SubqueryAlias | _: TypedFilter) => case node if node.nodeName == "StreamingRelationV2" => + case Repartition(1, false, _) => + case node: Aggregate => + val aboveSinglePartitionCoalesce = node.find { + case Repartition(1, false, _) => true + case _ => false + }.isDefined + + if (!aboveSinglePartitionCoalesce) { + throwError(s"In continuous processing mode, coalesce(1) must be called before " + + s"aggregate operation ${node.nodeName}.") + } case node => throwError(s"Continuous processing does not support 
${node.nodeName} operations.") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index 182aa2906cf1e..2a7f1de2c7c19 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -22,11 +22,12 @@ import scala.collection.mutable import org.apache.spark.sql.{sources, Strategy} import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, AttributeSet, Expression} import org.apache.spark.sql.catalyst.planning.PhysicalOperation -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Repartition} import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SparkPlan} import org.apache.spark.sql.execution.datasources.DataSourceStrategy -import org.apache.spark.sql.execution.streaming.continuous.{WriteToContinuousDataSource, WriteToContinuousDataSourceExec} +import org.apache.spark.sql.execution.streaming.continuous.{ContinuousCoalesceExec, WriteToContinuousDataSource, WriteToContinuousDataSourceExec} import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, SupportsPushDownCatalystFilters, SupportsPushDownFilters, SupportsPushDownRequiredColumns} +import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader object DataSourceV2Strategy extends Strategy { @@ -141,6 +142,17 @@ object DataSourceV2Strategy extends Strategy { case WriteToContinuousDataSource(writer, query) => WriteToContinuousDataSourceExec(writer, planLater(query)) :: Nil + case Repartition(1, false, child) => + val isContinuous = child.collectFirst { + case StreamingDataSourceV2Relation(_, _, _, r: ContinuousReader) => r + }.isDefined + + if (isContinuous) { + ContinuousCoalesceExec(1, planLater(child)) :: Nil + } else { + Nil + } + case _ => Nil } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousCoalesceExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousCoalesceExec.scala new file mode 100644 index 0000000000000..5f60343bacfaa --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousCoalesceExec.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.streaming.continuous + +import java.util.UUID + +import org.apache.spark.{HashPartitioner, SparkEnv} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeRow} +import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition} +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.streaming.continuous.shuffle.{ContinuousShuffleReadPartition, ContinuousShuffleReadRDD} + +/** + * Physical plan for coalescing a continuous processing plan. + * + * Currently, only coalesces to a single partition are supported. `numPartitions` must be 1. + */ +case class ContinuousCoalesceExec(numPartitions: Int, child: SparkPlan) extends SparkPlan { + override def output: Seq[Attribute] = child.output + + override def children: Seq[SparkPlan] = child :: Nil + + override def outputPartitioning: Partitioning = SinglePartition + + override def doExecute(): RDD[InternalRow] = { + assert(numPartitions == 1) + new ContinuousCoalesceRDD( + sparkContext, + numPartitions, + conf.continuousStreamingExecutorQueueSize, + sparkContext.getLocalProperty(ContinuousExecution.EPOCH_INTERVAL_KEY).toLong, + child.execute()) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousCoalesceRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousCoalesceRDD.scala new file mode 100644 index 0000000000000..ba85b355f974f --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousCoalesceRDD.scala @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming.continuous + +import java.util.UUID + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.execution.streaming.continuous.shuffle._ +import org.apache.spark.util.ThreadUtils + +case class ContinuousCoalesceRDDPartition( + index: Int, + endpointName: String, + queueSize: Int, + numShuffleWriters: Int, + epochIntervalMs: Long) + extends Partition { + // Initialized only on the executor, and only once even as we call compute() multiple times. 
+ lazy val (reader: ContinuousShuffleReader, endpoint) = { + val env = SparkEnv.get.rpcEnv + val receiver = new RPCContinuousShuffleReader( + queueSize, numShuffleWriters, epochIntervalMs, env) + val endpoint = env.setupEndpoint(endpointName, receiver) + + TaskContext.get().addTaskCompletionListener { ctx => + env.stop(endpoint) + } + (receiver, endpoint) + } + // This flag will be flipped on the executors to indicate that the threads processing + // partitions of the write-side RDD have been started. These will run indefinitely + // asynchronously as epochs of the coalesce RDD complete on the read side. + private[continuous] var writersInitialized: Boolean = false +} + +/** + * RDD for continuous coalescing. Asynchronously writes all partitions of `prev` into a local + * continuous shuffle, and then reads them in the task thread using `reader`. + */ +class ContinuousCoalesceRDD( + context: SparkContext, + numPartitions: Int, + readerQueueSize: Int, + epochIntervalMs: Long, + prev: RDD[InternalRow]) + extends RDD[InternalRow](context, Nil) { + + // When we support more than 1 target partition, we'll need to figure out how to pass in the + // required partitioner. + private val outputPartitioner = new HashPartitioner(1) + + private val readerEndpointNames = (0 until numPartitions).map { i => + s"ContinuousCoalesceRDD-part$i-${UUID.randomUUID()}" + } + + override def getPartitions: Array[Partition] = { + (0 until numPartitions).map { partIndex => + ContinuousCoalesceRDDPartition( + partIndex, + readerEndpointNames(partIndex), + readerQueueSize, + prev.getNumPartitions, + epochIntervalMs) + }.toArray + } + + private lazy val threadPool = ThreadUtils.newDaemonFixedThreadPool( + prev.getNumPartitions, + this.name) + + override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { + val part = split.asInstanceOf[ContinuousCoalesceRDDPartition] + + if (!part.writersInitialized) { + val rpcEnv = SparkEnv.get.rpcEnv + + // trigger lazy initialization + part.endpoint + val endpointRefs = readerEndpointNames.map { endpointName => + rpcEnv.setupEndpointRef(rpcEnv.address, endpointName) + } + + val runnables = prev.partitions.map { prevSplit => + new Runnable() { + override def run(): Unit = { + TaskContext.setTaskContext(context) + + val writer: ContinuousShuffleWriter = new RPCContinuousShuffleWriter( + prevSplit.index, outputPartitioner, endpointRefs.toArray) + + EpochTracker.initializeCurrentEpoch( + context.getLocalProperty(ContinuousExecution.START_EPOCH_KEY).toLong) + while (!context.isInterrupted() && !context.isCompleted()) { + writer.write(prev.compute(prevSplit, context).asInstanceOf[Iterator[UnsafeRow]]) + // Note that current epoch is a non-inheritable thread local, so each writer thread + // can properly increment its own epoch without affecting the main task thread. 
+ EpochTracker.incrementCurrentEpoch() + } + } + } + } + + context.addTaskCompletionListener { ctx => + threadPool.shutdownNow() + } + + part.writersInitialized = true + + runnables.foreach(threadPool.execute) + } + + part.reader.read() + } + + override def clearDependencies(): Unit = { + throw new IllegalStateException("Continuous RDDs cannot be checkpointed") + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDD.scala index a7ccce10b0cee..73868d5967e90 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDD.scala @@ -51,11 +51,11 @@ class ContinuousDataSourceRDD( sc: SparkContext, dataQueueSize: Int, epochPollIntervalMs: Long, - @transient private val readerFactories: Seq[InputPartition[UnsafeRow]]) + private val readerInputPartitions: Seq[InputPartition[UnsafeRow]]) extends RDD[UnsafeRow](sc, Nil) { override protected def getPartitions: Array[Partition] = { - readerFactories.zipWithIndex.map { + readerInputPartitions.zipWithIndex.map { case (inputPartition, index) => new ContinuousDataSourceRDDPartition(index, inputPartition) }.toArray } @@ -74,8 +74,7 @@ class ContinuousDataSourceRDD( val partition = split.asInstanceOf[ContinuousDataSourceRDDPartition] if (partition.queueReader == null) { partition.queueReader = - new ContinuousQueuedDataReader( - partition.inputPartition, context, dataQueueSize, epochPollIntervalMs) + new ContinuousQueuedDataReader(partition, context, dataQueueSize, epochPollIntervalMs) } partition.queueReader diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index e3d0cea608b2a..a0bb8292d7766 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -216,6 +216,9 @@ class ContinuousExecution( currentEpochCoordinatorId = epochCoordinatorId sparkSessionForQuery.sparkContext.setLocalProperty( ContinuousExecution.EPOCH_COORDINATOR_ID_KEY, epochCoordinatorId) + sparkSessionForQuery.sparkContext.setLocalProperty( + ContinuousExecution.EPOCH_INTERVAL_KEY, + trigger.asInstanceOf[ContinuousTrigger].intervalMs.toString) // Use the parent Spark session for the endpoint since it's where this query ID is registered. 
val epochEndpoint = @@ -382,4 +385,5 @@ class ContinuousExecution( object ContinuousExecution { val START_EPOCH_KEY = "__continuous_start_epoch" val EPOCH_COORDINATOR_ID_KEY = "__epoch_coordinator_id" + val EPOCH_INTERVAL_KEY = "__continuous_epoch_interval" } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousQueuedDataReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousQueuedDataReader.scala index f38577b6a9f16..8c74b8244d096 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousQueuedDataReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousQueuedDataReader.scala @@ -37,11 +37,11 @@ import org.apache.spark.util.ThreadUtils * offsets across epochs. Each compute() should call the next() method here until null is returned. */ class ContinuousQueuedDataReader( - partition: InputPartition[UnsafeRow], + partition: ContinuousDataSourceRDDPartition, context: TaskContext, dataQueueSize: Int, epochPollIntervalMs: Long) extends Closeable { - private val reader = partition.createPartitionReader() + private val reader = partition.inputPartition.createPartitionReader() // Important sequencing - we must get our starting point before the provider threads start running private var currentOffset: PartitionOffset = @@ -113,7 +113,7 @@ class ContinuousQueuedDataReader( currentEntry match { case EpochMarker => epochCoordEndpoint.send(ReportPartitionOffset( - context.partitionId(), EpochTracker.getCurrentEpoch.get, currentOffset)) + partition.index, EpochTracker.getCurrentEpoch.get, currentOffset)) null case ContinuousRow(row, offset) => currentOffset = offset diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/shuffle/ContinuousShuffleReadRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/shuffle/ContinuousShuffleReadRDD.scala index cf6572d3de1f7..518223f3cd008 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/shuffle/ContinuousShuffleReadRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/shuffle/ContinuousShuffleReadRDD.scala @@ -21,12 +21,14 @@ import java.util.UUID import org.apache.spark.{Partition, SparkContext, SparkEnv, TaskContext} import org.apache.spark.rdd.RDD +import org.apache.spark.rpc.RpcAddress import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.NextIterator case class ContinuousShuffleReadPartition( index: Int, + endpointName: String, queueSize: Int, numShuffleWriters: Int, epochIntervalMs: Long) @@ -36,7 +38,7 @@ case class ContinuousShuffleReadPartition( val env = SparkEnv.get.rpcEnv val receiver = new RPCContinuousShuffleReader( queueSize, numShuffleWriters, epochIntervalMs, env) - val endpoint = env.setupEndpoint(s"RPCContinuousShuffleReader-${UUID.randomUUID()}", receiver) + val endpoint = env.setupEndpoint(endpointName, receiver) TaskContext.get().addTaskCompletionListener { ctx => env.stop(endpoint) @@ -61,12 +63,14 @@ class ContinuousShuffleReadRDD( numPartitions: Int, queueSize: Int = 1024, numShuffleWriters: Int = 1, - epochIntervalMs: Long = 1000) + epochIntervalMs: Long = 1000, + val endpointNames: Seq[String] = Seq(s"RPCContinuousShuffleReader-${UUID.randomUUID()}")) extends RDD[UnsafeRow](sc, Nil) { override protected def getPartitions: Array[Partition] = { 
(0 until numPartitions).map { partIndex => - ContinuousShuffleReadPartition(partIndex, queueSize, numShuffleWriters, epochIntervalMs) + ContinuousShuffleReadPartition( + partIndex, endpointNames(partIndex), queueSize, numShuffleWriters, epochIntervalMs) }.toArray } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/shuffle/RPCContinuousShuffleReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/shuffle/RPCContinuousShuffleReader.scala index 834e84675c7d5..502ae0d4822e8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/shuffle/RPCContinuousShuffleReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/shuffle/RPCContinuousShuffleReader.scala @@ -46,7 +46,7 @@ private[shuffle] case class ReceiverEpochMarker(writerId: Int) extends RPCContin * TODO: Support multiple source tasks. We need to output a single epoch marker once all * source tasks have sent one. */ -private[shuffle] class RPCContinuousShuffleReader( +private[continuous] class RPCContinuousShuffleReader( queueSize: Int, numShuffleWriters: Int, epochIntervalMs: Long, @@ -107,7 +107,7 @@ private[shuffle] class RPCContinuousShuffleReader( } logWarning( s"Completion service failed to make progress after $epochIntervalMs ms. Waiting " + - s"for writers $writerIdsUncommitted to send epoch markers.") + s"for writers ${writerIdsUncommitted.mkString(",")} to send epoch markers.") // The completion service guarantees this future will be available immediately. case future => future.get() match { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala index d1c3498450096..0bf90b8063326 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala @@ -23,12 +23,13 @@ import java.util.concurrent.atomic.AtomicInteger import javax.annotation.concurrent.GuardedBy import scala.collection.JavaConverters._ +import scala.collection.SortedMap import scala.collection.mutable.ListBuffer import org.json4s.NoTypeHints import org.json4s.jackson.Serialization -import org.apache.spark.SparkEnv +import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.sql.{Encoder, Row, SQLContext} import org.apache.spark.sql.execution.streaming._ @@ -184,6 +185,14 @@ class ContinuousMemoryStreamInputPartitionReader( private var currentOffset = startOffset private var current: Option[Row] = None + // Defense-in-depth against failing to propagate the task context. Since it's not inheritable, + // we have to do a bit of error prone work to get it into every thread used by continuous + // processing. We hope that some unit test will end up instantiating a continuous memory stream + // in such cases. 
+ if (TaskContext.get() == null) { + throw new IllegalStateException("Task context was not set!") + } + override def next(): Boolean = { current = getRecord while (current.isEmpty) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousAggregationSuite.scala index b7ef637f5270e..0223812600961 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousAggregationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousAggregationSuite.scala @@ -31,7 +31,8 @@ class ContinuousAggregationSuite extends ContinuousSuiteBase { testStream(input.toDF().agg(max('value)), OutputMode.Complete)() } - assert(ex.getMessage.contains("Continuous processing does not support Aggregate operations")) + assert(ex.getMessage.contains( + "In continuous processing mode, coalesce(1) must be called before aggregate operation")) } test("basic") { @@ -50,6 +51,66 @@ class ContinuousAggregationSuite extends ContinuousSuiteBase { } } + test("multiple partitions with coalesce") { + val input = ContinuousMemoryStream[Int] + + val df = input.toDF().coalesce(1).agg(max('value)) + + testStream(df, OutputMode.Complete)( + AddData(input, 0, 1, 2), + CheckAnswer(2), + StopStream, + AddData(input, 3, 4, 5), + StartStream(), + CheckAnswer(5), + AddData(input, -1, -2, -3), + CheckAnswer(5)) + } + + test("multiple partitions with coalesce - multiple transformations") { + val input = ContinuousMemoryStream[Int] + + // We use a barrier to make sure predicates both before and after coalesce work + val df = input.toDF() + .select('value as 'copy, 'value) + .where('copy =!= 1) + .planWithBarrier + .coalesce(1) + .where('copy =!= 2) + .agg(max('value)) + + testStream(df, OutputMode.Complete)( + AddData(input, 0, 1, 2), + CheckAnswer(0), + StopStream, + AddData(input, 3, 4, 5), + StartStream(), + CheckAnswer(5), + AddData(input, -1, -2, -3), + CheckAnswer(5)) + } + + test("multiple partitions with multiple coalesce") { + val input = ContinuousMemoryStream[Int] + + val df = input.toDF() + .coalesce(1) + .planWithBarrier + .coalesce(1) + .select('value as 'copy, 'value) + .agg(max('value)) + + testStream(df, OutputMode.Complete)( + AddData(input, 0, 1, 2), + CheckAnswer(2), + StopStream, + AddData(input, 3, 4, 5), + StartStream(), + CheckAnswer(5), + AddData(input, -1, -2, -3), + CheckAnswer(5)) + } + test("repeated restart") { withSQLConf(("spark.sql.streaming.unsupportedOperationCheck", "false")) { val input = ContinuousMemoryStream.singlePartition[Int] diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousQueuedDataReaderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousQueuedDataReaderSuite.scala index e663fa8312da4..0e7e6febb53df 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousQueuedDataReaderSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousQueuedDataReaderSuite.scala @@ -92,7 +92,7 @@ class ContinuousQueuedDataReaderSuite extends StreamTest with MockitoSugar { } } val reader = new ContinuousQueuedDataReader( - factory, + new ContinuousDataSourceRDDPartition(0, factory), mockContext, dataQueueSize = sqlContext.conf.continuousStreamingExecutorQueueSize, epochPollIntervalMs = sqlContext.conf.continuousStreamingExecutorPollIntervalMs) diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/shuffle/ContinuousShuffleSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/shuffle/ContinuousShuffleSuite.scala index a8e3611b585cf..f84f3d49707bf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/shuffle/ContinuousShuffleSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/shuffle/ContinuousShuffleSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.streaming.continuous.shuffle +import java.util.UUID + import org.apache.spark.{HashPartitioner, Partition, TaskContext, TaskContextImpl} import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection, UnsafeRow} @@ -124,7 +126,10 @@ class ContinuousShuffleSuite extends StreamTest { } test("reader - multiple partitions") { - val rdd = new ContinuousShuffleReadRDD(sparkContext, numPartitions = 5) + val rdd = new ContinuousShuffleReadRDD( + sparkContext, + numPartitions = 5, + endpointNames = Seq.fill(5)(s"endpt-${UUID.randomUUID()}")) // Send all data before processing to ensure there's no crossover. for (p <- rdd.partitions) { val part = p.asInstanceOf[ContinuousShuffleReadPartition] From f71e8da5efde96aacc89e59c6e27b71fffcbc25f Mon Sep 17 00:00:00 2001 From: xueyu <278006819@qq.com> Date: Fri, 29 Jun 2018 10:44:17 -0700 Subject: [PATCH 33/79] [SPARK-24566][CORE] Fix spark.storage.blockManagerSlaveTimeoutMs default config This PR use spark.network.timeout in place of spark.storage.blockManagerSlaveTimeoutMs when it is not configured, as configuration doc said manual test Author: xueyu <278006819@qq.com> Closes #21575 from xueyumusic/slaveTimeOutConfig. --- core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala | 5 ++--- .../cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala index ff960b396dbf1..bcbc8df0d5865 100644 --- a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala +++ b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala @@ -74,10 +74,9 @@ private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock) // "spark.network.timeout" uses "seconds", while `spark.storage.blockManagerSlaveTimeoutMs` uses // "milliseconds" - private val slaveTimeoutMs = - sc.conf.getTimeAsMs("spark.storage.blockManagerSlaveTimeoutMs", "120s") private val executorTimeoutMs = - sc.conf.getTimeAsSeconds("spark.network.timeout", s"${slaveTimeoutMs}ms") * 1000 + sc.conf.getTimeAsMs("spark.storage.blockManagerSlaveTimeoutMs", + s"${sc.conf.getTimeAsSeconds("spark.network.timeout", "120s")}s") // "spark.network.timeoutInterval" uses "seconds", while // "spark.storage.blockManagerTimeoutIntervalMs" uses "milliseconds" diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index d35bea4aca311..1ce2f816dffb2 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -634,7 +634,7 @@ private[spark] class 
MesosCoarseGrainedSchedulerBackend( slave.hostname, externalShufflePort, sc.conf.getTimeAsMs("spark.storage.blockManagerSlaveTimeoutMs", - s"${sc.conf.getTimeAsSeconds("spark.network.timeout", "120s") * 1000L}ms"), + s"${sc.conf.getTimeAsSeconds("spark.network.timeout", "120s")}s"), sc.conf.getTimeAsMs("spark.executor.heartbeatInterval", "10s")) slave.shuffleRegistered = true } From 03545ce6de08bd0ad685c5f59b73bc22dfc40887 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Sat, 30 Jun 2018 13:58:50 +0800 Subject: [PATCH 34/79] [SPARK-24638][SQL] StringStartsWith support push down ## What changes were proposed in this pull request? `StringStartsWith` support push down. About 50% savings in compute time. ## How was this patch tested? unit tests, manual tests and performance test: ```scala cat < SPARK-24638.scala def benchmark(func: () => Unit): Long = { val start = System.currentTimeMillis() for(i <- 0 until 100) { func() } val end = System.currentTimeMillis() end - start } val path = "/tmp/spark/parquet/string/" spark.range(10000000).selectExpr("concat(id, 'str', id) as id").coalesce(1).write.mode("overwrite").option("parquet.block.size", 1048576).parquet(path) val df = spark.read.parquet(path) spark.sql("set spark.sql.parquet.filterPushdown.string.startsWith=true") val pushdownEnable = benchmark(() => df.where("id like '999998%'").count()) spark.sql("set spark.sql.parquet.filterPushdown.string.startsWith=false") val pushdownDisable = benchmark(() => df.where("id like '999998%'").count()) val improvements = pushdownDisable - pushdownEnable println(s"improvements: $improvements") EOF bin/spark-shell -i SPARK-24638.scala ``` result: ```scala Loading SPARK-24638.scala... benchmark: (func: () => Unit)Long path: String = /tmp/spark/parquet/string/ df: org.apache.spark.sql.DataFrame = [id: string] res1: org.apache.spark.sql.DataFrame = [key: string, value: string] pushdownEnable: Long = 11608 res2: org.apache.spark.sql.DataFrame = [key: string, value: string] pushdownDisable: Long = 31981 improvements: Long = 20373 ``` Author: Yuming Wang Closes #21623 from wangyum/SPARK-24638. --- .../apache/spark/sql/internal/SQLConf.scala | 11 +++ .../parquet/ParquetFileFormat.scala | 4 +- .../datasources/parquet/ParquetFilters.scala | 35 +++++++- .../parquet/ParquetFilterSuite.scala | 84 ++++++++++++++++++- 4 files changed, 130 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index e1752ff997b69..da1c34cdc78f2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -378,6 +378,14 @@ object SQLConf { .booleanConf .createWithDefault(true) + val PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED = + buildConf("spark.sql.parquet.filterPushdown.string.startsWith") + .doc("If true, enables Parquet filter push-down optimization for string startsWith function. 
" + + "This configuration only has an effect when 'spark.sql.parquet.filterPushdown' is enabled.") + .internal() + .booleanConf + .createWithDefault(true) + val PARQUET_WRITE_LEGACY_FORMAT = buildConf("spark.sql.parquet.writeLegacyFormat") .doc("Whether to be compatible with the legacy Parquet format adopted by Spark 1.4 and prior " + "versions, when converting Parquet schema to Spark SQL schema and vice versa.") @@ -1459,6 +1467,9 @@ class SQLConf extends Serializable with Logging { def parquetFilterPushDownDate: Boolean = getConf(PARQUET_FILTER_PUSHDOWN_DATE_ENABLED) + def parquetFilterPushDownStringStartWith: Boolean = + getConf(PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED) + def orcFilterPushDown: Boolean = getConf(ORC_FILTER_PUSHDOWN_ENABLED) def verifyPartitionPath: Boolean = getConf(HIVE_VERIFY_PARTITION_PATH) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 9602a08911dea..93de1faef527a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -348,6 +348,7 @@ class ParquetFileFormat // Whole stage codegen (PhysicalRDD) is able to deal with batches directly val returningBatch = supportBatch(sparkSession, resultSchema) val pushDownDate = sqlConf.parquetFilterPushDownDate + val pushDownStringStartWith = sqlConf.parquetFilterPushDownStringStartWith (file: PartitionedFile) => { assert(file.partitionValues.numFields == partitionSchema.size) @@ -358,7 +359,8 @@ class ParquetFileFormat // Collects all converted Parquet filter predicates. Notice that not all predicates can be // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap` // is used here. - .flatMap(new ParquetFilters(pushDownDate).createFilter(requiredSchema, _)) + .flatMap(new ParquetFilters(pushDownDate, pushDownStringStartWith) + .createFilter(requiredSchema, _)) .reduceOption(FilterApi.and) } else { None diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index 310626197a763..21c9e2e4f82b4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -22,16 +22,18 @@ import java.sql.Date import org.apache.parquet.filter2.predicate._ import org.apache.parquet.filter2.predicate.FilterApi._ import org.apache.parquet.io.api.Binary +import org.apache.parquet.schema.PrimitiveComparator import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.catalyst.util.DateTimeUtils.SQLDate import org.apache.spark.sql.sources import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String /** * Some utility function to convert Spark data source filters to Parquet filters. 
*/ -private[parquet] class ParquetFilters(pushDownDate: Boolean) { +private[parquet] class ParquetFilters(pushDownDate: Boolean, pushDownStartWith: Boolean) { private def dateToDays(date: Date): SQLDate = { DateTimeUtils.fromJavaDate(date) @@ -270,6 +272,37 @@ private[parquet] class ParquetFilters(pushDownDate: Boolean) { case sources.Not(pred) => createFilter(schema, pred).map(FilterApi.not) + case sources.StringStartsWith(name, prefix) if pushDownStartWith && canMakeFilterOn(name) => + Option(prefix).map { v => + FilterApi.userDefined(binaryColumn(name), + new UserDefinedPredicate[Binary] with Serializable { + private val strToBinary = Binary.fromReusedByteArray(v.getBytes) + private val size = strToBinary.length + + override def canDrop(statistics: Statistics[Binary]): Boolean = { + val comparator = PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR + val max = statistics.getMax + val min = statistics.getMin + comparator.compare(max.slice(0, math.min(size, max.length)), strToBinary) < 0 || + comparator.compare(min.slice(0, math.min(size, min.length)), strToBinary) > 0 + } + + override def inverseCanDrop(statistics: Statistics[Binary]): Boolean = { + val comparator = PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR + val max = statistics.getMax + val min = statistics.getMin + comparator.compare(max.slice(0, math.min(size, max.length)), strToBinary) == 0 && + comparator.compare(min.slice(0, math.min(size, min.length)), strToBinary) == 0 + } + + override def keep(value: Binary): Boolean = { + UTF8String.fromBytes(value.getBytes).startsWith( + UTF8String.fromBytes(strToBinary.getBytes)) + } + } + ) + } + case _ => None } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index 90da7eb8c4fb5..d9ae5858e5ed0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -55,7 +55,8 @@ import org.apache.spark.util.{AccumulatorContext, AccumulatorV2} */ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContext { - private lazy val parquetFilters = new ParquetFilters(conf.parquetFilterPushDownDate) + private lazy val parquetFilters = + new ParquetFilters(conf.parquetFilterPushDownDate, conf.parquetFilterPushDownStringStartWith) override def beforeEach(): Unit = { super.beforeEach() @@ -82,6 +83,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex withSQLConf( SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true", SQLConf.PARQUET_FILTER_PUSHDOWN_DATE_ENABLED.key -> "true", + SQLConf.PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED.key -> "true", SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { val query = df .select(output.map(e => Column(e)): _*) @@ -140,6 +142,31 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex checkBinaryFilterPredicate(predicate, filterClass, Seq(Row(expected)))(df) } + // This function tests that exactly go through the `canDrop` and `inverseCanDrop`. 
+ private def testStringStartsWith(dataFrame: DataFrame, filter: String): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + dataFrame.write.option("parquet.block.size", 512).parquet(path) + Seq(true, false).foreach { pushDown => + withSQLConf( + SQLConf.PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED.key -> pushDown.toString) { + val accu = new NumRowGroupsAcc + sparkContext.register(accu) + + val df = spark.read.parquet(path).filter(filter) + df.foreachPartition((it: Iterator[Row]) => it.foreach(v => accu.add(0))) + if (pushDown) { + assert(accu.value == 0) + } else { + assert(accu.value > 0) + } + + AccumulatorContext.remove(accu.id) + } + } + } + } + test("filter pushdown - boolean") { withParquetDataFrame((true :: false :: Nil).map(b => Tuple1.apply(Option(b)))) { implicit df => checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) @@ -574,7 +601,6 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex val df = spark.read.parquet(path).filter("a < 100") df.foreachPartition((it: Iterator[Row]) => it.foreach(v => accu.add(0))) - df.collect if (enablePushDown) { assert(accu.value == 0) @@ -660,6 +686,60 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex assert(df.where("col > 0").count() === 2) } } + + test("filter pushdown - StringStartsWith") { + withParquetDataFrame((1 to 4).map(i => Tuple1(i + "str" + i))) { implicit df => + checkFilterPredicate( + '_1.startsWith("").asInstanceOf[Predicate], + classOf[UserDefinedByInstance[_, _]], + Seq("1str1", "2str2", "3str3", "4str4").map(Row(_))) + + Seq("2", "2s", "2st", "2str", "2str2").foreach { prefix => + checkFilterPredicate( + '_1.startsWith(prefix).asInstanceOf[Predicate], + classOf[UserDefinedByInstance[_, _]], + "2str2") + } + + Seq("2S", "null", "2str22").foreach { prefix => + checkFilterPredicate( + '_1.startsWith(prefix).asInstanceOf[Predicate], + classOf[UserDefinedByInstance[_, _]], + Seq.empty[Row]) + } + + checkFilterPredicate( + !'_1.startsWith("").asInstanceOf[Predicate], + classOf[UserDefinedByInstance[_, _]], + Seq().map(Row(_))) + + Seq("2", "2s", "2st", "2str", "2str2").foreach { prefix => + checkFilterPredicate( + !'_1.startsWith(prefix).asInstanceOf[Predicate], + classOf[UserDefinedByInstance[_, _]], + Seq("1str1", "3str3", "4str4").map(Row(_))) + } + + Seq("2S", "null", "2str22").foreach { prefix => + checkFilterPredicate( + !'_1.startsWith(prefix).asInstanceOf[Predicate], + classOf[UserDefinedByInstance[_, _]], + Seq("1str1", "2str2", "3str3", "4str4").map(Row(_))) + } + + assertResult(None) { + parquetFilters.createFilter( + df.schema, + sources.StringStartsWith("_1", null)) + } + } + + import testImplicits._ + // Test canDrop() has taken effect + testStringStartsWith(spark.range(1024).map(_.toString).toDF(), "value like 'a%'") + // Test inverseCanDrop() has taken effect + testStringStartsWith(spark.range(1024).map(c => "100").toDF(), "value not like '10%'") + } } class NumRowGroupsAcc extends AccumulatorV2[Integer, Integer] { From 797971ed42cab41cbc3d039c0af4b26199bff783 Mon Sep 17 00:00:00 2001 From: maryannxue Date: Fri, 29 Jun 2018 23:46:12 -0700 Subject: [PATCH 35/79] [SPARK-24696][SQL] ColumnPruning rule fails to remove extra Project ## What changes were proposed in this pull request? 
The ColumnPruning rule tries adding an extra Project if an input node produces more fields than needed, but as a post-processing step it needs to remove the lower Project in the "Project - Filter - Project" pattern; otherwise it would conflict with PushPredicatesThroughProject and thus cause an infinite optimization loop. The current post-processing method is defined as:
```
private def removeProjectBeforeFilter(plan: LogicalPlan): LogicalPlan = plan transform {
  case p1 @ Project(_, f @ Filter(_, p2 @ Project(_, child)))
    if p2.outputSet.subsetOf(child.outputSet) =>
    p1.copy(child = f.copy(child = child))
}
```
This method works well when there is only one Filter, but not when there are two or more Filters. In this case, there is a deterministic filter and a non-deterministic filter, so they stay as separate filter nodes and cannot be combined together. A simplified illustration of the optimization process that forms the infinite loop is shown below (F1 stands for the 1st filter, F2 for the 2nd filter, P for project, S for scan of relation, PredicatePushDown as an abbreviation of PushPredicatesThroughProject):
```
F1 - F2 - P - S
PredicatePushDown => F1 - P - F2 - S
ColumnPruning     => F1 - P - F2 - P - S
                  => F1 - P - F2 - S (Project removed)
PredicatePushDown => P - F1 - F2 - S
ColumnPruning     => P - F1 - P - F2 - S
                  => P - F1 - P - F2 - P - S
                  => P - F1 - F2 - P - S (only one Project removed)
RemoveRedundantProject => F1 - F2 - P - S (goes back to the loop start)
```
So the problem is that the ColumnPruning rule adds a Project under a Filter (and fails to remove it in the end), and that new Project triggers PushPredicateThroughProject. Once the filters have been pushed through the Project, a new Project will be added by the ColumnPruning rule, and this goes on and on. The fix is that Projects are still added top-down, but later, when removing the extra Projects, the process should go bottom-up to ensure all extra Projects can be matched.

## How was this patch tested?

Added an optimization rule test in ColumnPruningSuite and an end-to-end test in SQLQuerySuite.

Author: maryannxue

Closes #21674 from maryannxue/spark-24696.
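For illustration only (not part of the patch): a rough PySpark sketch of a query with the problematic shape, mirroring the new unit test; the DataFrame names and the active `spark` session are assumptions.

```python
from pyspark.sql.functions import col, rand

# Build the "F1 - F2 - P - S" shape from the illustration above: a projection over the scan,
# a non-deterministic filter (rand), and a deterministic filter on top.
input_df = spark.range(100).selectExpr("id AS key", "CAST(id AS STRING) AS value")
query = input_df.select("key").where(rand(0) > 0.5).where(col("key") < 10)
query.explain(True)  # before this fix, optimizing a plan of this shape could cycle as described above
```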
--- .../spark/sql/catalyst/dsl/package.scala | 1 + .../sql/catalyst/optimizer/Optimizer.scala | 5 +++-- .../optimizer/ColumnPruningSuite.scala | 9 +++++++- .../org/apache/spark/sql/SQLQuerySuite.scala | 21 +++++++++++++++++++ 4 files changed, 33 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index efb2eba655e15..8cf69c6f3c922 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -149,6 +149,7 @@ package object dsl { } } + def rand(e: Long): Expression = Rand(Literal.create(e, LongType)) def sum(e: Expression): Expression = Sum(e).toAggregateExpression() def sumDistinct(e: Expression): Expression = Sum(e).toAggregateExpression(isDistinct = true) def count(e: Expression): Expression = Count(e).toAggregateExpression() diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index aa992def1ce6c..2cc27d82f7d20 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -526,9 +526,10 @@ object ColumnPruning extends Rule[LogicalPlan] { /** * The Project before Filter is not necessary but conflict with PushPredicatesThroughProject, - * so remove it. + * so remove it. Since the Projects have been added top-down, we need to remove in bottom-up + * order, otherwise lower Projects can be missed. */ - private def removeProjectBeforeFilter(plan: LogicalPlan): LogicalPlan = plan transform { + private def removeProjectBeforeFilter(plan: LogicalPlan): LogicalPlan = plan transformUp { case p1 @ Project(_, f @ Filter(_, p2 @ Project(_, child))) if p2.outputSet.subsetOf(child.outputSet) => p1.copy(child = f.copy(child = child)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala index 3f41f4b144096..8b05ba32e6eef 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.optimizer import scala.reflect.runtime.universe.TypeTag -import org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder @@ -370,5 +369,13 @@ class ColumnPruningSuite extends PlanTest { comparePlans(optimized2, expected2.analyze) } + test("SPARK-24696 ColumnPruning rule fails to remove extra Project") { + val input = LocalRelation('key.int, 'value.string) + val query = input.select('key).where(rand(0L) > 0.5).where('key < 10).analyze + val optimized = Optimize.execute(query) + val expected = input.where(rand(0L) > 0.5).where('key < 10).select('key).analyze + comparePlans(optimized, expected) + } + // todo: add more tests for column pruning } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 640affc10ee58..dfb9c137b74f0 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2792,4 +2792,25 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } } + + test("SPARK-24696 ColumnPruning rule fails to remove extra Project") { + withTable("fact_stats", "dim_stats") { + val factData = Seq((1, 1, 99, 1), (2, 2, 99, 2), (3, 1, 99, 3), (4, 2, 99, 4)) + val storeData = Seq((1, "BW", "DE"), (2, "AZ", "US")) + spark.udf.register("filterND", udf((value: Int) => value > 2).asNondeterministic) + factData.toDF("date_id", "store_id", "product_id", "units_sold") + .write.mode("overwrite").partitionBy("store_id").format("parquet").saveAsTable("fact_stats") + storeData.toDF("store_id", "state_province", "country") + .write.mode("overwrite").format("parquet").saveAsTable("dim_stats") + val df = sql( + """ + |SELECT f.date_id, f.product_id, f.store_id FROM + |(SELECT date_id, product_id, store_id + | FROM fact_stats WHERE filterND(date_id)) AS f + |JOIN dim_stats s + |ON f.store_id = s.store_id WHERE s.country = 'DE' + """.stripMargin) + checkAnswer(df, Seq(Row(3, 99, 1))) + } + } } From d54d8b86301581142293341af25fd78b3278a2e8 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Fri, 29 Jun 2018 23:51:13 -0700 Subject: [PATCH 36/79] simplify rand in dsl/package.scala --- .../main/scala/org/apache/spark/sql/catalyst/dsl/package.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 8cf69c6f3c922..89e8c998f740d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -149,7 +149,7 @@ package object dsl { } } - def rand(e: Long): Expression = Rand(Literal.create(e, LongType)) + def rand(e: Long): Expression = Rand(e) def sum(e: Expression): Expression = Sum(e).toAggregateExpression() def sumDistinct(e: Expression): Expression = Sum(e).toAggregateExpression(isDistinct = true) def count(e: Expression): Expression = Count(e).toAggregateExpression() From f825847c82042a9eee7bd5cfab106310d279fc32 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sat, 30 Jun 2018 19:27:16 -0500 Subject: [PATCH 37/79] [SPARK-24654][BUILD] Update, fix LICENSE and NOTICE, and specialize for source vs binary Whew, lots of work to track down again all the license requirements, but this ought to be a pretty good pass. Below, find a writeup on how I approached it for future reference. - LICENSE and NOTICE and licenses/ now reflect the *source* release - LICENSE-binary and NOTICE-binary and licenses-binary now reflect the binary release - Recreated all the license info from scratch - Added notes about how this was constructed for next time - License-oriented info was moved from NOTICE to LICENSE, esp. for Cat B deps - Some seemingly superfluous or stale license info was removed, especially for test-scope deps - Updated release script to put binary-oriented versions in binary releases ---- # Principles ASF projects distribute source and binary code under the Apache License 2.0. However these project distributions frequently include copies of source or binary code from third parties, under possibly other license terms. 
This triggers conditions of those licenses, which essentially amount to including license information in a LICENSE and/or NOTICE file, and including copies of license texts (here, in a directory called `license/`). See http://www.apache.org/dev/licensing-howto.html and https://www.apache.org/legal/resolved.html#required-third-party-notices # In Spark Spark produces source releases, and also binary releases of that code. Spark source code may contain source from third parties, possibly modified. This is true in Scala, Java, Python and R, and in the UI's JavaScript and CSS files. These must be handled appropriately per above in a LICENSE and NOTICE file created for the source release. Separately, the binary releases may contain binary code from third parties. This is very much true for Scala and Java, as Spark produces an 'assembly' binary release which includes all transitive binary dependencies of this part of Spark. With perhaps the exception of py4j, this doesn't occur in the same way for Python or R because of the way these ecosystems work. (Note that the JS and CSS for the UI will be in both 'source' and 'binary' releases.) These must also be handled in a separate LICENSE and NOTICE file for the binary release. # Binary Release License ## Transitive Maven Dependencies We'll first tackle the binary release, and that almost entirely means assessing the transitive dependencies of the Scala/Java backbone of Spark. Run `project-info-reports:dependencies` with essentially all profiles: a set that would bring in all different possible transitive dependencies. However, don't activate any of the '-lgpl' profiles as these would bring in LGPL-licensed dependencies that are explicitly excluded from Spark binary releases. ``` mvn -Phadoop-2.7 -Pyarn -Phive -Pmesos -Pkubernetes -Pflume -Pkinesis-asl -Pdocker-integration-tests -Phive-thriftserver -Pkafka-0-8 -Ddependency.locations.enabled=false project-info-reports:dependencies ``` Open `assembly/target/site/dependencies.html`. Find "Project Transitive Dependencies", and find "compile" and "runtime" (if exists). This is a list of all the dependencies that Spark is going to ship in its binary "assembly" distro and therefore whose licenses need to be appropriately considered in LICENSE and NOTICE. Copy this table into a spreadsheet for easy management. Next job is to fill in some blanks, as a few projects will not have clearly declared their licenses in a POM. Sort by license. This is a good time to verify all the dependencies are at least Cat A/B licenses, and not Cat X! http://www.apache.org/legal/resolved.html ### Apache License 2 The Apache License 2 variants are typically easiest to deal with as they will not require you to modify LICENSE, nor add to license/. It's still good form to list the ALv2 dependencies in LICENSE for completeness, but optional. They may require you to propagate bits from NOTICE. It's tedious to track down all the NOTICE files and evaluate what if anything needs to be copied to NOTICE. Fortunately, this can be made easier as the assembly module can be temporarily modified to produce a NOTICE file that concatenates all NOTICE files bundled with transitive dependencies. First change the packaging of `assembly/spark-assembly_2.11/pom.xml` to `jar`. Next add this stanza somewhere in the body of the same POM file: ``` org.apache.maven.plugins maven-shade-plugin false *:* package shade ``` Finally execute `mvn ... package` with all of the same `-P` profile flags as above. 
In the JAR file at `assembly/target/spark-assembly_2.11....jar` you'll find a file `META-INF/NOTICE` that concatenates all NOTICE files bundled with transitive dependencies. This should be the starting point for the binary release's NOTICE file.

Some elements in the file are from Spark itself, like:

```
Spark Project Assembly
Copyright 2018 The Apache Software Foundation

Spark Project Core
Copyright 2018 The Apache Software Foundation
```

These can be removed.

Remove elements of the combined NOTICE file that aren't relevant to Spark. It's actually rare that we are sure that some element is completely irrelevant to Spark, because each transitive dependency includes all its transitive dependencies. So there may be nothing that can be done here.

Of course, some projects may not publish NOTICE in their Maven artifacts. Ideally, search for the NOTICE file of projects that don't seem to have produced any text in NOTICE, but there is some argument that projects that don't produce a NOTICE in their Maven artifacts don't entail an obligation on projects that depend solely on their Maven artifacts.

### Other Licenses

Next are "Cat A" permissively licensed (BSD 2-Clause, BSD 3-Clause, MIT) components. List the components grouped by their license type in LICENSE. Then add the text of the license to licenses/. For example if you list "foo bar" as a BSD-licensed dependency, add its license text as licenses/LICENSE-foo-bar.txt.

Public domain and similar works are treated like permissively licensed dependencies. And the same goes for all Cat B licenses too, like CDDL. However these additionally require at least a URL pointer to the project's page. Use the artifact hyperlink in your spreadsheet if possible; if non-existent or doesn't resolve, do your best to determine a URL for the project's source.

### Shaded third-party dependencies

Some third party dependencies actually copy in other dependencies rather than depend on them as Maven artifacts. This means they don't show up in the process above. These can be quite hard to track down, but are rare. A key example is reflectasm, embedded in kryo.

### Examples module

The above _almost_ considers everything bundled in a Spark binary release. The main assembly won't include examples. The same must be done for dependencies marked as 'compile' for the examples module. See `examples/target/site/dependencies.html`. At the time of this writing however this just adds one dependency: `scopt`.

### provided scope

Above we considered just compile and runtime scope dependencies, which makes sense as they are the ones that are packaged. However, for complicated reasons (shading), a few components that Spark does bundle are not marked as compile dependencies in the assembly. Therefore it's also necessary to consider 'provided' dependencies from `assembly/target/site/dependencies.html` actually! Right now that's just Jetty and JPMML artifacts.

## Python, R

Don't forget that Py4J is also distributed in the binary release, actually. There should be no other R, Python code in the binary release. That's it.

## Sense checking

Compare the contents of `jars/`, `examples/jars/` and `python/lib` from a recent binary release to see if anything appears there that doesn't seem to have been covered above. These additional components will have to be handled manually, but should be few or none of this type.

# Source Release License

While there are relatively fewer third-party source artifacts included as source code, there is no automated way to detect it, really.
It requires some degree of manual auditing. Most third party source comes from included JS and CSS files. At the time of this writing, some places to look or consider: `build/sbt-launch-lib.bash`, `python/lib`, third party source in `python/pyspark` like `heapq3.py`, `docs/js/vendor`, and `core/src/main/resources/org/apache/spark/ui/static`. The principles are the same as above. Remember some JS files copy in other JS files! Look out for Modernizr. # One More Thing: JS and CSS in Binary Release Now that you've got a handle on source licenses, recall that all the JS and CSS source code will *also* be part of the binary release. Copy that info from source to binary license files accordingly. Author: Sean Owen Closes #21640 from srowen/SPARK-24654. --- LICENSE | 158 +-- LICENSE-binary | 520 ++++++++ NOTICE | 661 ---------- NOTICE-binary | 1170 +++++++++++++++++ dev/.rat-excludes | 4 + dev/make-distribution.sh | 7 +- .../LICENSE-AnchorJS.txt | 0 licenses-binary/LICENSE-CC0.txt | 121 ++ .../LICENSE-antlr.txt | 0 licenses-binary/LICENSE-arpack.txt | 8 + licenses-binary/LICENSE-automaton.txt | 24 + licenses-binary/LICENSE-bootstrap.txt | 13 + .../LICENSE-bouncycastle-bcprov.txt | 7 + licenses-binary/LICENSE-cloudpickle.txt | 28 + licenses-binary/LICENSE-d3.min.js.txt | 26 + .../LICENSE-dagre-d3.txt | 4 +- licenses-binary/LICENSE-datatables.txt | 7 + {licenses => licenses-binary}/LICENSE-f2j.txt | 0 licenses-binary/LICENSE-graphlib-dot.txt | 19 + licenses-binary/LICENSE-heapq.txt | 280 ++++ licenses-binary/LICENSE-janino.txt | 31 + licenses-binary/LICENSE-javassist.html | 373 ++++++ .../LICENSE-javolution.txt | 0 .../LICENSE-jline.txt | 0 .../LICENSE-jodd.txt | 12 +- .../LICENSE-join.txt | 0 licenses-binary/LICENSE-jquery.txt | 20 + licenses-binary/LICENSE-json-formatter.txt | 6 + licenses-binary/LICENSE-jtransforms.html | 388 ++++++ .../LICENSE-kryo.txt | 0 licenses-binary/LICENSE-leveldbjni.txt | 27 + licenses-binary/LICENSE-machinist.txt | 19 + .../LICENSE-matchMedia-polyfill.txt | 1 + .../LICENSE-minlog.txt | 0 licenses-binary/LICENSE-modernizr.txt | 21 + .../LICENSE-netlib.txt | 0 .../LICENSE-paranamer.txt | 0 .../LICENSE-pmml-model.txt | 0 .../LICENSE-protobuf.txt | 0 licenses-binary/LICENSE-py4j.txt | 27 + .../LICENSE-pyrolite.txt | 0 .../LICENSE-reflectasm.txt | 0 licenses-binary/LICENSE-respond.txt | 22 + licenses-binary/LICENSE-sbt-launch-lib.txt | 26 + .../LICENSE-scala.txt | 0 licenses-binary/LICENSE-scopt.txt | 9 + .../LICENSE-slf4j.txt | 0 licenses-binary/LICENSE-sorttable.js.txt | 16 + .../LICENSE-spire.txt | 0 licenses-binary/LICENSE-vis.txt | 22 + .../LICENSE-xmlenc.txt | 0 .../LICENSE-zstd-jni.txt | 0 .../LICENSE-zstd.txt | 0 licenses/LICENSE-CC0.txt | 121 ++ licenses/LICENSE-SnapTree.txt | 35 - licenses/LICENSE-bootstrap.txt | 13 + licenses/LICENSE-boto.txt | 20 - licenses/LICENSE-datatables.txt | 7 + licenses/LICENSE-graphlib-dot.txt | 2 +- licenses/LICENSE-jbcrypt.txt | 17 - .../{LICENSE-jmock.txt => LICENSE-join.txt} | 22 +- licenses/LICENSE-jquery.txt | 23 +- licenses/LICENSE-json-formatter.txt | 6 + licenses/LICENSE-matchMedia-polyfill.txt | 1 + licenses/LICENSE-postgresql.txt | 24 - licenses/LICENSE-respond.txt | 22 + licenses/LICENSE-scalacheck.txt | 32 - licenses/LICENSE-vis.txt | 22 + 68 files changed, 3526 insertions(+), 918 deletions(-) create mode 100644 LICENSE-binary create mode 100644 NOTICE-binary rename licenses/LICENSE-scopt.txt => licenses-binary/LICENSE-AnchorJS.txt (100%) create mode 100644 licenses-binary/LICENSE-CC0.txt rename {licenses => 
licenses-binary}/LICENSE-antlr.txt (100%) create mode 100644 licenses-binary/LICENSE-arpack.txt create mode 100644 licenses-binary/LICENSE-automaton.txt create mode 100644 licenses-binary/LICENSE-bootstrap.txt create mode 100644 licenses-binary/LICENSE-bouncycastle-bcprov.txt create mode 100644 licenses-binary/LICENSE-cloudpickle.txt create mode 100644 licenses-binary/LICENSE-d3.min.js.txt rename licenses/LICENSE-Mockito.txt => licenses-binary/LICENSE-dagre-d3.txt (93%) create mode 100644 licenses-binary/LICENSE-datatables.txt rename {licenses => licenses-binary}/LICENSE-f2j.txt (100%) create mode 100644 licenses-binary/LICENSE-graphlib-dot.txt create mode 100644 licenses-binary/LICENSE-heapq.txt create mode 100644 licenses-binary/LICENSE-janino.txt create mode 100644 licenses-binary/LICENSE-javassist.html rename {licenses => licenses-binary}/LICENSE-javolution.txt (100%) rename {licenses => licenses-binary}/LICENSE-jline.txt (100%) rename licenses/LICENSE-junit-interface.txt => licenses-binary/LICENSE-jodd.txt (67%) rename licenses/LICENSE-DPark.txt => licenses-binary/LICENSE-join.txt (100%) create mode 100644 licenses-binary/LICENSE-jquery.txt create mode 100644 licenses-binary/LICENSE-json-formatter.txt create mode 100644 licenses-binary/LICENSE-jtransforms.html rename {licenses => licenses-binary}/LICENSE-kryo.txt (100%) create mode 100644 licenses-binary/LICENSE-leveldbjni.txt create mode 100644 licenses-binary/LICENSE-machinist.txt create mode 100644 licenses-binary/LICENSE-matchMedia-polyfill.txt rename {licenses => licenses-binary}/LICENSE-minlog.txt (100%) create mode 100644 licenses-binary/LICENSE-modernizr.txt rename {licenses => licenses-binary}/LICENSE-netlib.txt (100%) rename {licenses => licenses-binary}/LICENSE-paranamer.txt (100%) rename licenses/LICENSE-jpmml-model.txt => licenses-binary/LICENSE-pmml-model.txt (100%) rename {licenses => licenses-binary}/LICENSE-protobuf.txt (100%) create mode 100644 licenses-binary/LICENSE-py4j.txt rename {licenses => licenses-binary}/LICENSE-pyrolite.txt (100%) rename {licenses => licenses-binary}/LICENSE-reflectasm.txt (100%) create mode 100644 licenses-binary/LICENSE-respond.txt create mode 100644 licenses-binary/LICENSE-sbt-launch-lib.txt rename {licenses => licenses-binary}/LICENSE-scala.txt (100%) create mode 100644 licenses-binary/LICENSE-scopt.txt rename {licenses => licenses-binary}/LICENSE-slf4j.txt (100%) create mode 100644 licenses-binary/LICENSE-sorttable.js.txt rename {licenses => licenses-binary}/LICENSE-spire.txt (100%) create mode 100644 licenses-binary/LICENSE-vis.txt rename {licenses => licenses-binary}/LICENSE-xmlenc.txt (100%) rename {licenses => licenses-binary}/LICENSE-zstd-jni.txt (100%) rename {licenses => licenses-binary}/LICENSE-zstd.txt (100%) create mode 100644 licenses/LICENSE-CC0.txt delete mode 100644 licenses/LICENSE-SnapTree.txt create mode 100644 licenses/LICENSE-bootstrap.txt delete mode 100644 licenses/LICENSE-boto.txt create mode 100644 licenses/LICENSE-datatables.txt delete mode 100644 licenses/LICENSE-jbcrypt.txt rename licenses/{LICENSE-jmock.txt => LICENSE-join.txt} (60%) create mode 100644 licenses/LICENSE-json-formatter.txt create mode 100644 licenses/LICENSE-matchMedia-polyfill.txt delete mode 100644 licenses/LICENSE-postgresql.txt create mode 100644 licenses/LICENSE-respond.txt delete mode 100644 licenses/LICENSE-scalacheck.txt create mode 100644 licenses/LICENSE-vis.txt diff --git a/LICENSE b/LICENSE index 6f5d9452e800d..b771bd552b762 100644 --- a/LICENSE +++ b/LICENSE @@ -201,103 +201,61 @@ 
limitations under the License. -======================================================================= -Apache Spark Subcomponents: - -The Apache Spark project contains subcomponents with separate copyright -notices and license terms. Your use of the source code for the these -subcomponents is subject to the terms and conditions of the following -licenses. - - -======================================================================== -For heapq (pyspark/heapq3.py): -======================================================================== - -See license/LICENSE-heapq.txt - -======================================================================== -For SnapTree: -======================================================================== - -See license/LICENSE-SnapTree.txt - -======================================================================== -For jbcrypt: -======================================================================== - -See license/LICENSE-jbcrypt.txt - -======================================================================== -BSD-style licenses -======================================================================== - -The following components are provided under a BSD-style license. See project link for details. -The text of each license is also included at licenses/LICENSE-[project].txt. - - (BSD 3 Clause) netlib core (com.github.fommil.netlib:core:1.1.2 - https://github.com/fommil/netlib-java/core) - (BSD 3 Clause) JPMML-Model (org.jpmml:pmml-model:1.2.7 - https://github.com/jpmml/jpmml-model) - (BSD 3 Clause) jmock (org.jmock:jmock-junit4:2.8.4 - http://jmock.org/) - (BSD License) AntLR Parser Generator (antlr:antlr:2.7.7 - http://www.antlr.org/) - (BSD License) ANTLR 4.5.2-1 (org.antlr:antlr4:4.5.2-1 - http://wwww.antlr.org/) - (BSD licence) ANTLR ST4 4.0.4 (org.antlr:ST4:4.0.4 - http://www.stringtemplate.org) - (BSD licence) ANTLR StringTemplate (org.antlr:stringtemplate:3.2.1 - http://www.stringtemplate.org) - (BSD License) Javolution (javolution:javolution:5.5.1 - http://javolution.org) - (BSD) JLine (jline:jline:2.14.3 - https://github.com/jline/jline2) - (BSD) ParaNamer Core (com.thoughtworks.paranamer:paranamer:2.3 - http://paranamer.codehaus.org/paranamer) - (BSD) ParaNamer Core (com.thoughtworks.paranamer:paranamer:2.6 - http://paranamer.codehaus.org/paranamer) - (BSD 3 Clause) Scala (http://www.scala-lang.org/download/#License) - (Interpreter classes (all .scala files in repl/src/main/scala - except for Main.Scala, SparkHelper.scala and ExecutorClassLoader.scala), - and for SerializableMapWrapper in JavaUtils.scala) - (BSD-like) Scala Actors library (org.scala-lang:scala-actors:2.11.12 - http://www.scala-lang.org/) - (BSD-like) Scala Compiler (org.scala-lang:scala-compiler:2.11.12 - http://www.scala-lang.org/) - (BSD-like) Scala Compiler (org.scala-lang:scala-reflect:2.11.12 - http://www.scala-lang.org/) - (BSD-like) Scala Library (org.scala-lang:scala-library:2.11.12 - http://www.scala-lang.org/) - (BSD-like) Scalap (org.scala-lang:scalap:2.11.12 - http://www.scala-lang.org/) - (BSD-style) scalacheck (org.scalacheck:scalacheck_2.11:1.10.0 - http://www.scalacheck.org) - (BSD-style) spire (org.spire-math:spire_2.11:0.7.1 - http://spire-math.org) - (BSD-style) spire-macros (org.spire-math:spire-macros_2.11:0.7.1 - http://spire-math.org) - (New BSD License) Kryo (com.esotericsoftware:kryo:3.0.3 - https://github.com/EsotericSoftware/kryo) - (New BSD License) MinLog (com.esotericsoftware:minlog:1.3.0 - https://github.com/EsotericSoftware/minlog) - (New BSD license) 
Protocol Buffer Java API (com.google.protobuf:protobuf-java:2.5.0 - http://code.google.com/p/protobuf) - (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf) - (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net) - (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net) - (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.7 - http://py4j.sourceforge.net/) - (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/) - (BSD licence) sbt and sbt-launch-lib.bash - (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE) - (BSD 3 Clause) DPark (https://github.com/douban/dpark/blob/master/LICENSE) - (BSD 3 Clause) CloudPickle (https://github.com/cloudpipe/cloudpickle/blob/master/LICENSE) - (BSD 2 Clause) Zstd-jni (https://github.com/luben/zstd-jni/blob/master/LICENSE) - (BSD license) Zstd (https://github.com/facebook/zstd/blob/v1.3.1/LICENSE) - -======================================================================== -MIT licenses -======================================================================== - -The following components are provided under the MIT License. See project link for details. -The text of each license is also included at licenses/LICENSE-[project].txt. - - (MIT License) JCL 1.1.1 implemented over SLF4J (org.slf4j:jcl-over-slf4j:1.7.5 - http://www.slf4j.org) - (MIT License) JUL to SLF4J bridge (org.slf4j:jul-to-slf4j:1.7.5 - http://www.slf4j.org) - (MIT License) SLF4J API Module (org.slf4j:slf4j-api:1.7.5 - http://www.slf4j.org) - (MIT License) SLF4J LOG4J-12 Binding (org.slf4j:slf4j-log4j12:1.7.5 - http://www.slf4j.org) - (MIT License) pyrolite (org.spark-project:pyrolite:2.0.1 - http://pythonhosted.org/Pyro4/) - (MIT License) scopt (com.github.scopt:scopt_2.11:3.2.0 - https://github.com/scopt/scopt) - (The MIT License) Mockito (org.mockito:mockito-core:1.9.5 - http://www.mockito.org) - (MIT License) jquery (https://jquery.org/license/) - (MIT License) AnchorJS (https://github.com/bryanbraun/anchorjs) - (MIT License) graphlib-dot (https://github.com/cpettitt/graphlib-dot) - (MIT License) dagre-d3 (https://github.com/cpettitt/dagre-d3) - (MIT License) sorttable (https://github.com/stuartlangridge/sorttable) - (MIT License) boto (https://github.com/boto/boto/blob/develop/LICENSE) - (MIT License) datatables (http://datatables.net/license) - (MIT License) mustache (https://github.com/mustache/mustache/blob/master/LICENSE) - (MIT License) cookies (http://code.google.com/p/cookies/wiki/License) - (MIT License) blockUI (http://jquery.malsup.com/block/) - (MIT License) RowsGroup (http://datatables.net/license/mit) - (MIT License) jsonFormatter (http://www.jqueryscript.net/other/jQuery-Plugin-For-Pretty-JSON-Formatting-jsonFormatter.html) - (MIT License) modernizr (https://github.com/Modernizr/Modernizr/blob/master/LICENSE) - (MIT License) machinist (https://github.com/typelevel/machinist) +------------------------------------------------------------------------------------ +This product bundles various third-party components under other open source licenses. +This section summarizes those components and their licenses. See licenses/ +for text of these licenses. 
+ + +Apache Software Foundation License 2.0 +-------------------------------------- + +common/network-common/src/main/java/org/apache/spark/network/util/LimitedInputStream.java +core/src/main/java/org/apache/spark/util/collection/TimSort.java +core/src/main/resources/org/apache/spark/ui/static/bootstrap* +core/src/main/resources/org/apache/spark/ui/static/jsonFormatter* +core/src/main/resources/org/apache/spark/ui/static/vis* +docs/js/vendor/bootstrap.js + + +Python Software Foundation License +---------------------------------- + +pyspark/heapq3.py + + +BSD 3-Clause +------------ + +python/lib/py4j-*-src.zip +python/pyspark/cloudpickle.py +python/pyspark/join.py +core/src/main/resources/org/apache/spark/ui/static/d3.min.js + +The CSS style for the navigation sidebar of the documentation was originally +submitted by Óscar Nájera for the scikit-learn project. The scikit-learn project +is distributed under the 3-Clause BSD license. + + +MIT License +----------- + +core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js +core/src/main/resources/org/apache/spark/ui/static/*dataTables* +core/src/main/resources/org/apache/spark/ui/static/graphlib-dot.min.js +ore/src/main/resources/org/apache/spark/ui/static/jquery* +core/src/main/resources/org/apache/spark/ui/static/sorttable.js +docs/js/vendor/anchor.min.js +docs/js/vendor/jquery* +docs/js/vendor/modernizer* + + +Creative Commons CC0 1.0 Universal Public Domain Dedication +----------------------------------------------------------- +(see LICENSE-CC0.txt) + +data/mllib/images/kittens/29.5.a_b_EGDP022204.jpg +data/mllib/images/kittens/54893.jpg +data/mllib/images/kittens/DP153539.jpg +data/mllib/images/kittens/DP802813.jpg +data/mllib/images/multi-channel/chr30.4.184.jpg \ No newline at end of file diff --git a/LICENSE-binary b/LICENSE-binary new file mode 100644 index 0000000000000..c033dd8ad2e6a --- /dev/null +++ b/LICENSE-binary @@ -0,0 +1,520 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ +------------------------------------------------------------------------------------ +This project bundles some components that are also licensed under the Apache +License Version 2.0: + +commons-beanutils:commons-beanutils +org.apache.zookeeper:zookeeper +oro:oro +commons-configuration:commons-configuration +commons-digester:commons-digester +com.chuusai:shapeless_2.11 +com.googlecode.javaewah:JavaEWAH +com.twitter:chill-java +com.twitter:chill_2.11 +com.univocity:univocity-parsers +javax.jdo:jdo-api +joda-time:joda-time +net.sf.opencsv:opencsv +org.apache.derby:derby +org.objenesis:objenesis +org.roaringbitmap:RoaringBitmap +org.scalanlp:breeze-macros_2.11 +org.scalanlp:breeze_2.11 +org.typelevel:macro-compat_2.11 +org.yaml:snakeyaml +org.apache.xbean:xbean-asm5-shaded +com.squareup.okhttp3:logging-interceptor +com.squareup.okhttp3:okhttp +com.squareup.okio:okio +net.java.dev.jets3t:jets3t +org.apache.spark:spark-catalyst_2.11 +org.apache.spark:spark-kvstore_2.11 +org.apache.spark:spark-launcher_2.11 +org.apache.spark:spark-mllib-local_2.11 +org.apache.spark:spark-network-common_2.11 +org.apache.spark:spark-network-shuffle_2.11 +org.apache.spark:spark-sketch_2.11 +org.apache.spark:spark-tags_2.11 +org.apache.spark:spark-unsafe_2.11 +commons-httpclient:commons-httpclient +com.vlkan:flatbuffers +com.ning:compress-lzf +io.airlift:aircompressor +io.dropwizard.metrics:metrics-core +io.dropwizard.metrics:metrics-ganglia +io.dropwizard.metrics:metrics-graphite +io.dropwizard.metrics:metrics-json +io.dropwizard.metrics:metrics-jvm +org.iq80.snappy:snappy +com.clearspring.analytics:stream +com.jamesmurty.utils:java-xmlbuilder +commons-codec:commons-codec +commons-collections:commons-collections +io.fabric8:kubernetes-client +io.fabric8:kubernetes-model +io.netty:netty +io.netty:netty-all +net.hydromatic:eigenbase-properties +net.sf.supercsv:super-csv +org.apache.arrow:arrow-format +org.apache.arrow:arrow-memory +org.apache.arrow:arrow-vector +org.apache.calcite:calcite-avatica +org.apache.calcite:calcite-core +org.apache.calcite:calcite-linq4j +org.apache.commons:commons-crypto +org.apache.commons:commons-lang3 +org.apache.hadoop:hadoop-annotations +org.apache.hadoop:hadoop-auth +org.apache.hadoop:hadoop-client +org.apache.hadoop:hadoop-common +org.apache.hadoop:hadoop-hdfs +org.apache.hadoop:hadoop-mapreduce-client-app +org.apache.hadoop:hadoop-mapreduce-client-common +org.apache.hadoop:hadoop-mapreduce-client-core +org.apache.hadoop:hadoop-mapreduce-client-jobclient +org.apache.hadoop:hadoop-mapreduce-client-shuffle +org.apache.hadoop:hadoop-yarn-api +org.apache.hadoop:hadoop-yarn-client +org.apache.hadoop:hadoop-yarn-common +org.apache.hadoop:hadoop-yarn-server-common +org.apache.hadoop:hadoop-yarn-server-web-proxy +org.apache.httpcomponents:httpclient +org.apache.httpcomponents:httpcore +org.apache.orc:orc-core +org.apache.orc:orc-mapreduce +org.mortbay.jetty:jetty +org.mortbay.jetty:jetty-util +com.jolbox:bonecp +org.json4s:json4s-ast_2.11 +org.json4s:json4s-core_2.11 +org.json4s:json4s-jackson_2.11 +org.json4s:json4s-scalap_2.11 +com.carrotsearch:hppc +com.fasterxml.jackson.core:jackson-annotations +com.fasterxml.jackson.core:jackson-core +com.fasterxml.jackson.core:jackson-databind +com.fasterxml.jackson.dataformat:jackson-dataformat-yaml +com.fasterxml.jackson.module:jackson-module-jaxb-annotations +com.fasterxml.jackson.module:jackson-module-paranamer +com.fasterxml.jackson.module:jackson-module-scala_2.11 +com.github.mifmif:generex +com.google.code.findbugs:jsr305 
+com.google.code.gson:gson +com.google.inject:guice +com.google.inject.extensions:guice-servlet +com.twitter:parquet-hadoop-bundle +commons-beanutils:commons-beanutils-core +commons-cli:commons-cli +commons-dbcp:commons-dbcp +commons-io:commons-io +commons-lang:commons-lang +commons-logging:commons-logging +commons-net:commons-net +commons-pool:commons-pool +io.fabric8:zjsonpatch +javax.inject:javax.inject +javax.validation:validation-api +log4j:apache-log4j-extras +log4j:log4j +net.sf.jpam:jpam +org.apache.avro:avro +org.apache.avro:avro-ipc +org.apache.avro:avro-mapred +org.apache.commons:commons-compress +org.apache.commons:commons-math3 +org.apache.curator:curator-client +org.apache.curator:curator-framework +org.apache.curator:curator-recipes +org.apache.directory.api:api-asn1-api +org.apache.directory.api:api-util +org.apache.directory.server:apacheds-i18n +org.apache.directory.server:apacheds-kerberos-codec +org.apache.htrace:htrace-core +org.apache.ivy:ivy +org.apache.mesos:mesos +org.apache.parquet:parquet-column +org.apache.parquet:parquet-common +org.apache.parquet:parquet-encoding +org.apache.parquet:parquet-format +org.apache.parquet:parquet-hadoop +org.apache.parquet:parquet-jackson +org.apache.thrift:libfb303 +org.apache.thrift:libthrift +org.codehaus.jackson:jackson-core-asl +org.codehaus.jackson:jackson-mapper-asl +org.datanucleus:datanucleus-api-jdo +org.datanucleus:datanucleus-core +org.datanucleus:datanucleus-rdbms +org.lz4:lz4-java +org.spark-project.hive:hive-beeline +org.spark-project.hive:hive-cli +org.spark-project.hive:hive-exec +org.spark-project.hive:hive-jdbc +org.spark-project.hive:hive-metastore +org.xerial.snappy:snappy-java +stax:stax-api +xerces:xercesImpl +org.codehaus.jackson:jackson-jaxrs +org.codehaus.jackson:jackson-xc +org.eclipse.jetty:jetty-client +org.eclipse.jetty:jetty-continuation +org.eclipse.jetty:jetty-http +org.eclipse.jetty:jetty-io +org.eclipse.jetty:jetty-jndi +org.eclipse.jetty:jetty-plus +org.eclipse.jetty:jetty-proxy +org.eclipse.jetty:jetty-security +org.eclipse.jetty:jetty-server +org.eclipse.jetty:jetty-servlet +org.eclipse.jetty:jetty-servlets +org.eclipse.jetty:jetty-util +org.eclipse.jetty:jetty-webapp +org.eclipse.jetty:jetty-xml + +core/src/main/java/org/apache/spark/util/collection/TimSort.java +core/src/main/resources/org/apache/spark/ui/static/bootstrap* +core/src/main/resources/org/apache/spark/ui/static/jsonFormatter* +core/src/main/resources/org/apache/spark/ui/static/vis* +docs/js/vendor/bootstrap.js + + +------------------------------------------------------------------------------------ +This product bundles various third-party components under other open source licenses. +This section summarizes those components and their licenses. See licenses-binary/ +for text of these licenses. 
+ + +BSD 2-Clause +------------ + +com.github.luben:zstd-jni +javolution:javolution +com.esotericsoftware:kryo-shaded +com.esotericsoftware:minlog +com.esotericsoftware:reflectasm +com.google.protobuf:protobuf-java +org.codehaus.janino:commons-compiler +org.codehaus.janino:janino +jline:jline +org.jodd:jodd-core + + +BSD 3-Clause +------------ + +dk.brics.automaton:automaton +org.antlr:antlr-runtime +org.antlr:ST4 +org.antlr:stringtemplate +org.antlr:antlr4-runtime +antlr:antlr +com.github.fommil.netlib:core +com.thoughtworks.paranamer:paranamer +org.scala-lang:scala-compiler +org.scala-lang:scala-library +org.scala-lang:scala-reflect +org.scala-lang.modules:scala-parser-combinators_2.11 +org.scala-lang.modules:scala-xml_2.11 +org.fusesource.leveldbjni:leveldbjni-all +net.sourceforge.f2j:arpack_combined_all +xmlenc:xmlenc +net.sf.py4j:py4j +org.jpmml:pmml-model +org.jpmml:pmml-schema + +python/lib/py4j-*-src.zip +python/pyspark/cloudpickle.py +python/pyspark/join.py +core/src/main/resources/org/apache/spark/ui/static/d3.min.js + +The CSS style for the navigation sidebar of the documentation was originally +submitted by Óscar Nájera for the scikit-learn project. The scikit-learn project +is distributed under the 3-Clause BSD license. + + +MIT License +----------- + +org.spire-math:spire-macros_2.11 +org.spire-math:spire_2.11 +org.typelevel:machinist_2.11 +net.razorvine:pyrolite +org.slf4j:jcl-over-slf4j +org.slf4j:jul-to-slf4j +org.slf4j:slf4j-api +org.slf4j:slf4j-log4j12 +com.github.scopt:scopt_2.11 +org.bouncycastle:bcprov-jdk15on + +core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js +core/src/main/resources/org/apache/spark/ui/static/*dataTables* +core/src/main/resources/org/apache/spark/ui/static/graphlib-dot.min.js +ore/src/main/resources/org/apache/spark/ui/static/jquery* +core/src/main/resources/org/apache/spark/ui/static/sorttable.js +docs/js/vendor/anchor.min.js +docs/js/vendor/jquery* +docs/js/vendor/modernizer* + + +Common Development and Distribution License (CDDL) 1.0 +------------------------------------------------------ + +javax.activation:activation http://www.oracle.com/technetwork/java/javase/tech/index-jsp-138795.html +javax.xml.stream:stax-api https://jcp.org/en/jsr/detail?id=173 + + +Common Development and Distribution License (CDDL) 1.1 +------------------------------------------------------ + +javax.annotation:javax.annotation-api https://jcp.org/en/jsr/detail?id=250 +javax.servlet:javax.servlet-api https://javaee.github.io/servlet-spec/ +javax.transaction:jta http://www.oracle.com/technetwork/java/index.html +javax.ws.rs:javax.ws.rs-api https://github.com/jax-rs +javax.xml.bind:jaxb-api https://github.com/javaee/jaxb-v2 +org.glassfish.hk2:hk2-api https://github.com/javaee/glassfish +org.glassfish.hk2:hk2-locator (same) +org.glassfish.hk2:hk2-utils +org.glassfish.hk2:osgi-resource-locator +org.glassfish.hk2.external:aopalliance-repackaged +org.glassfish.hk2.external:javax.inject +org.glassfish.jersey.bundles.repackaged:jersey-guava +org.glassfish.jersey.containers:jersey-container-servlet +org.glassfish.jersey.containers:jersey-container-servlet-core +org.glassfish.jersey.core:jersey-client +org.glassfish.jersey.core:jersey-common +org.glassfish.jersey.core:jersey-server +org.glassfish.jersey.media:jersey-media-jaxb + + +Mozilla Public License (MPL) 1.1 +-------------------------------- + +com.github.rwl:jtransforms https://sourceforge.net/projects/jtransforms/ + + +Python Software Foundation License +---------------------------------- + 
+pyspark/heapq3.py + + +Public Domain +------------- + +aopalliance:aopalliance +net.iharder:base64 +org.tukaani:xz + + +Creative Commons CC0 1.0 Universal Public Domain Dedication +----------------------------------------------------------- +(see LICENSE-CC0.txt) + +data/mllib/images/kittens/29.5.a_b_EGDP022204.jpg +data/mllib/images/kittens/54893.jpg +data/mllib/images/kittens/DP153539.jpg +data/mllib/images/kittens/DP802813.jpg +data/mllib/images/multi-channel/chr30.4.184.jpg diff --git a/NOTICE b/NOTICE index 6ec240efbf12e..9246cc54caa3a 100644 --- a/NOTICE +++ b/NOTICE @@ -4,664 +4,3 @@ Copyright 2014 and onwards The Apache Software Foundation. This product includes software developed at The Apache Software Foundation (http://www.apache.org/). - -======================================================================== -Common Development and Distribution License 1.0 -======================================================================== - -The following components are provided under the Common Development and Distribution License 1.0. See project link for details. - - (CDDL 1.0) Glassfish Jasper (org.mortbay.jetty:jsp-2.1:6.1.14 - http://jetty.mortbay.org/project/modules/jsp-2.1) - (CDDL 1.0) JAX-RS (https://jax-rs-spec.java.net/) - (CDDL 1.0) Servlet Specification 2.5 API (org.mortbay.jetty:servlet-api-2.5:6.1.14 - http://jetty.mortbay.org/project/modules/servlet-api-2.5) - (CDDL 1.0) (GPL2 w/ CPE) javax.annotation API (https://glassfish.java.net/nonav/public/CDDL+GPL.html) - (COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0) (GNU General Public Library) Streaming API for XML (javax.xml.stream:stax-api:1.0-2 - no url defined) - (Common Development and Distribution License (CDDL) v1.0) JavaBeans Activation Framework (JAF) (javax.activation:activation:1.1 - http://java.sun.com/products/javabeans/jaf/index.jsp) - -======================================================================== -Common Development and Distribution License 1.1 -======================================================================== - -The following components are provided under the Common Development and Distribution License 1.1. See project link for details. - - (CDDL 1.1) (GPL2 w/ CPE) org.glassfish.hk2 (https://hk2.java.net) - (CDDL 1.1) (GPL2 w/ CPE) JAXB API bundle for GlassFish V3 (javax.xml.bind:jaxb-api:2.2.2 - https://jaxb.dev.java.net/) - (CDDL 1.1) (GPL2 w/ CPE) JAXB RI (com.sun.xml.bind:jaxb-impl:2.2.3-1 - http://jaxb.java.net/) - (CDDL 1.1) (GPL2 w/ CPE) Jersey 2 (https://jersey.java.net) - -======================================================================== -Common Public License 1.0 -======================================================================== - -The following components are provided under the Common Public 1.0 License. See project link for details. - - (Common Public License Version 1.0) JUnit (junit:junit-dep:4.10 - http://junit.org) - (Common Public License Version 1.0) JUnit (junit:junit:3.8.1 - http://junit.org) - (Common Public License Version 1.0) JUnit (junit:junit:4.8.2 - http://junit.org) - -======================================================================== -Eclipse Public License 1.0 -======================================================================== - -The following components are provided under the Eclipse Public License 1.0. See project link for details. 
- - (Eclipse Public License v1.0) Eclipse JDT Core (org.eclipse.jdt:core:3.1.1 - http://www.eclipse.org/jdt/) - -======================================================================== -Mozilla Public License 1.0 -======================================================================== - -The following components are provided under the Mozilla Public License 1.0. See project link for details. - - (GPL) (LGPL) (MPL) JTransforms (com.github.rwl:jtransforms:2.4.0 - http://sourceforge.net/projects/jtransforms/) - (Mozilla Public License Version 1.1) jamon-runtime (org.jamon:jamon-runtime:2.3.1 - http://www.jamon.org/jamon-runtime/) - - - -======================================================================== -NOTICE files -======================================================================== - -The following NOTICEs are pertain to software distributed with this project. - - -// ------------------------------------------------------------------ -// NOTICE file corresponding to the section 4d of The Apache License, -// Version 2.0, in this case for -// ------------------------------------------------------------------ - -Apache Avro -Copyright 2009-2013 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - -Apache Commons Codec -Copyright 2002-2009 The Apache Software Foundation - -This product includes software developed by -The Apache Software Foundation (http://www.apache.org/). - --------------------------------------------------------------------------------- -src/test/org/apache/commons/codec/language/DoubleMetaphoneTest.java contains -test data from http://aspell.sourceforge.net/test/batch0.tab. - -Copyright (C) 2002 Kevin Atkinson (kevina@gnu.org). Verbatim copying -and distribution of this entire article is permitted in any medium, -provided this notice is preserved. --------------------------------------------------------------------------------- - -Apache HttpComponents HttpClient -Copyright 1999-2011 The Apache Software Foundation - -This project contains annotations derived from JCIP-ANNOTATIONS -Copyright (c) 2005 Brian Goetz and Tim Peierls. See http://www.jcip.net - -Apache HttpComponents HttpCore -Copyright 2005-2011 The Apache Software Foundation - -Curator Recipes -Copyright 2011-2014 The Apache Software Foundation - -Curator Framework -Copyright 2011-2014 The Apache Software Foundation - -Curator Client -Copyright 2011-2014 The Apache Software Foundation - -Apache Geronimo -Copyright 2003-2008 The Apache Software Foundation - -Activation 1.1 -Copyright 2003-2007 The Apache Software Foundation - -Apache Commons Lang -Copyright 2001-2014 The Apache Software Foundation - -This product includes software from the Spring Framework, -under the Apache License 2.0 (see: StringUtils.containsWhitespace()) - -Apache log4j -Copyright 2007 The Apache Software Foundation - -# Compress LZF - -This library contains efficient implementation of LZF compression format, -as well as additional helper classes that build on JDK-provided gzip (deflat) -codec. - -## Licensing - -Library is licensed under Apache License 2.0, as per accompanying LICENSE file. - -## Credit - -Library has been written by Tatu Saloranta (tatu.saloranta@iki.fi). -It was started at Ning, inc., as an official Open Source process used by -platform backend, but after initial versions has been developed outside of -Ning by supporting community. 
- -Other contributors include: - -* Jon Hartlaub (first versions of streaming reader/writer; unit tests) -* Cedrik Lime: parallel LZF implementation - -Various community members have contributed bug reports, and suggested minor -fixes; these can be found from file "VERSION.txt" in SCM. - -Objenesis -Copyright 2006-2009 Joe Walnes, Henri Tremblay, Leonardo Mesquita - -Apache Commons Net -Copyright 2001-2010 The Apache Software Foundation - - The Netty Project - ================= - -Please visit the Netty web site for more information: - - * http://netty.io/ - -Copyright 2011 The Netty Project - -The Netty Project licenses this file to you under the Apache License, -version 2.0 (the "License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License at: - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -License for the specific language governing permissions and limitations -under the License. - -Also, please refer to each LICENSE..txt file, which is located in -the 'license' directory of the distribution file, for the license terms of the -components that this product depends on. - -------------------------------------------------------------------------------- -This product contains the extensions to Java Collections Framework which has -been derived from the works by JSR-166 EG, Doug Lea, and Jason T. Greene: - - * LICENSE: - * license/LICENSE.jsr166y.txt (Public Domain) - * HOMEPAGE: - * http://gee.cs.oswego.edu/cgi-bin/viewcvs.cgi/jsr166/ - * http://viewvc.jboss.org/cgi-bin/viewvc.cgi/jbosscache/experimental/jsr166/ - -This product contains a modified version of Robert Harder's Public Domain -Base64 Encoder and Decoder, which can be obtained at: - - * LICENSE: - * license/LICENSE.base64.txt (Public Domain) - * HOMEPAGE: - * http://iharder.sourceforge.net/current/java/base64/ - -This product contains a modified version of 'JZlib', a re-implementation of -zlib in pure Java, which can be obtained at: - - * LICENSE: - * license/LICENSE.jzlib.txt (BSD Style License) - * HOMEPAGE: - * http://www.jcraft.com/jzlib/ - -This product optionally depends on 'Protocol Buffers', Google's data -interchange format, which can be obtained at: - - * LICENSE: - * license/LICENSE.protobuf.txt (New BSD License) - * HOMEPAGE: - * http://code.google.com/p/protobuf/ - -This product optionally depends on 'SLF4J', a simple logging facade for Java, -which can be obtained at: - - * LICENSE: - * license/LICENSE.slf4j.txt (MIT License) - * HOMEPAGE: - * http://www.slf4j.org/ - -This product optionally depends on 'Apache Commons Logging', a logging -framework, which can be obtained at: - - * LICENSE: - * license/LICENSE.commons-logging.txt (Apache License 2.0) - * HOMEPAGE: - * http://commons.apache.org/logging/ - -This product optionally depends on 'Apache Log4J', a logging framework, -which can be obtained at: - - * LICENSE: - * license/LICENSE.log4j.txt (Apache License 2.0) - * HOMEPAGE: - * http://logging.apache.org/log4j/ - -This product optionally depends on 'JBoss Logging', a logging framework, -which can be obtained at: - - * LICENSE: - * license/LICENSE.jboss-logging.txt (GNU LGPL 2.1) - * HOMEPAGE: - * http://anonsvn.jboss.org/repos/common/common-logging-spi/ - -This product optionally depends on 'Apache Felix', an open source OSGi -framework 
implementation, which can be obtained at: - - * LICENSE: - * license/LICENSE.felix.txt (Apache License 2.0) - * HOMEPAGE: - * http://felix.apache.org/ - -This product optionally depends on 'Webbit', a Java event based -WebSocket and HTTP server: - - * LICENSE: - * license/LICENSE.webbit.txt (BSD License) - * HOMEPAGE: - * https://github.com/joewalnes/webbit - -# Jackson JSON processor - -Jackson is a high-performance, Free/Open Source JSON processing library. -It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has -been in development since 2007. -It is currently developed by a community of developers, as well as supported -commercially by FasterXML.com. - -Jackson core and extension components may be licensed under different licenses. -To find the details that apply to this artifact see the accompanying LICENSE file. -For more information, including possible other licensing options, contact -FasterXML.com (http://fasterxml.com). - -## Credits - -A list of contributors may be found from CREDITS file, which is included -in some artifacts (usually source distributions); but is always available -from the source code management (SCM) system project uses. - -Jackson core and extension components may licensed under different licenses. -To find the details that apply to this artifact see the accompanying LICENSE file. -For more information, including possible other licensing options, contact -FasterXML.com (http://fasterxml.com). - -mesos -Copyright 2014 The Apache Software Foundation - -Apache Thrift -Copyright 2006-2010 The Apache Software Foundation. - - Apache Ant - Copyright 1999-2013 The Apache Software Foundation - - The task is based on code Copyright (c) 2002, Landmark - Graphics Corp that has been kindly donated to the Apache Software - Foundation. - -Apache Commons IO -Copyright 2002-2012 The Apache Software Foundation - -Apache Commons Math -Copyright 2001-2013 The Apache Software Foundation - -=============================================================================== - -The inverse error function implementation in the Erf class is based on CUDA -code developed by Mike Giles, Oxford-Man Institute of Quantitative Finance, -and published in GPU Computing Gems, volume 2, 2010. -=============================================================================== - -The BracketFinder (package org.apache.commons.math3.optimization.univariate) -and PowellOptimizer (package org.apache.commons.math3.optimization.general) -classes are based on the Python code in module "optimize.py" (version 0.5) -developed by Travis E. Oliphant for the SciPy library (http://www.scipy.org/) -Copyright © 2003-2009 SciPy Developers. -=============================================================================== - -The LinearConstraint, LinearObjectiveFunction, LinearOptimizer, -RelationShip, SimplexSolver and SimplexTableau classes in package -org.apache.commons.math3.optimization.linear include software developed by -Benjamin McCann (http://www.benmccann.com) and distributed with -the following copyright: Copyright 2009 Google Inc. -=============================================================================== - -This product includes software developed by the -University of Chicago, as Operator of Argonne National -Laboratory. -The LevenbergMarquardtOptimizer class in package -org.apache.commons.math3.optimization.general includes software -translated from the lmder, lmpar and qrsolv Fortran routines -from the Minpack package -Minpack Copyright Notice (1999) University of Chicago. 
All rights reserved -=============================================================================== - -The GraggBulirschStoerIntegrator class in package -org.apache.commons.math3.ode.nonstiff includes software translated -from the odex Fortran routine developed by E. Hairer and G. Wanner. -Original source copyright: -Copyright (c) 2004, Ernst Hairer -=============================================================================== - -The EigenDecompositionImpl class in package -org.apache.commons.math3.linear includes software translated -from some LAPACK Fortran routines. Original source copyright: -Copyright (c) 1992-2008 The University of Tennessee. All rights reserved. -=============================================================================== - -The MersenneTwister class in package org.apache.commons.math3.random -includes software translated from the 2002-01-26 version of -the Mersenne-Twister generator written in C by Makoto Matsumoto and Takuji -Nishimura. Original source copyright: -Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, -All rights reserved -=============================================================================== - -The LocalizedFormatsTest class in the unit tests is an adapted version of -the OrekitMessagesTest class from the orekit library distributed under the -terms of the Apache 2 licence. Original source copyright: -Copyright 2010 CS Systèmes d'Information -=============================================================================== - -The HermiteInterpolator class and its corresponding test have been imported from -the orekit library distributed under the terms of the Apache 2 licence. Original -source copyright: -Copyright 2010-2012 CS Systèmes d'Information -=============================================================================== - -The creation of the package "o.a.c.m.analysis.integration.gauss" was inspired -by an original code donated by Sébastien Brisard. -=============================================================================== - -The complete text of licenses and disclaimers associated with the the original -sources enumerated above at the time of code translation are in the LICENSE.txt -file. - -This product currently only contains code developed by authors -of specific components, as identified by the source code files; -if such notes are missing files have been created by -Tatu Saloranta. - -For additional credits (generally to people who reported problems) -see CREDITS file. - -Apache Commons Lang -Copyright 2001-2011 The Apache Software Foundation - -Apache Commons Compress -Copyright 2002-2012 The Apache Software Foundation - -Apache Commons CLI -Copyright 2001-2009 The Apache Software Foundation - -Google Guice - Extensions - Servlet -Copyright 2006-2011 Google, Inc. - -Google Guice - Core Library -Copyright 2006-2011 Google, Inc. - -Apache Jakarta HttpClient -Copyright 1999-2007 The Apache Software Foundation - -Apache Hive -Copyright 2008-2013 The Apache Software Foundation - -This product includes software developed by The Apache Software -Foundation (http://www.apache.org/). - -This product includes software developed by The JDBM Project -(http://jdbm.sourceforge.net/). - -This product includes/uses ANTLR (http://www.antlr.org/), -Copyright (c) 2003-2011, Terrence Parr. - -This product includes/uses StringTemplate (http://www.stringtemplate.org/), -Copyright (c) 2011, Terrence Parr. - -This product includes/uses ASM (http://asm.ow2.org/), -Copyright (c) 2000-2007 INRIA, France Telecom. 
- -This product includes/uses JLine (http://jline.sourceforge.net/), -Copyright (c) 2002-2006, Marc Prud'hommeaux . - -This product includes/uses SQLLine (http://sqlline.sourceforge.net), -Copyright (c) 2002, 2003, 2004, 2005 Marc Prud'hommeaux . - -This product includes/uses SLF4J (http://www.slf4j.org/), -Copyright (c) 2004-2010 QOS.ch - -This product includes/uses Bootstrap (http://twitter.github.com/bootstrap/), -Copyright (c) 2012 Twitter, Inc. - -This product includes/uses Glyphicons (http://glyphicons.com/), -Copyright (c) 2010 - 2012 Jan Kovarík - -This product includes DataNucleus (http://www.datanucleus.org/) -Copyright 2008-2008 DataNucleus - -This product includes Guava (http://code.google.com/p/guava-libraries/) -Copyright (C) 2006 Google Inc. - -This product includes JavaEWAH (http://code.google.com/p/javaewah/) -Copyright (C) 2011 Google Inc. - -Apache Commons Pool -Copyright 1999-2009 The Apache Software Foundation - -This product includes/uses Kubernetes & OpenShift 3 Java Client (https://github.com/fabric8io/kubernetes-client) -Copyright (C) 2015 Red Hat, Inc. - -This product includes/uses OkHttp (https://github.com/square/okhttp) -Copyright (C) 2012 The Android Open Source Project - -========================================================================= -== NOTICE file corresponding to section 4(d) of the Apache License, == -== Version 2.0, in this case for the DataNucleus distribution. == -========================================================================= - -=================================================================== -This product includes software developed by many individuals, -including the following: -=================================================================== -Erik Bengtson -Andy Jefferson - -=================================================================== -This product has included contributions from some individuals, -including the following: -=================================================================== - -=================================================================== -This product has included contributions from some individuals, -including the following: -=================================================================== -Joerg von Frantzius -Thomas Marti -Barry Haddow -Marco Schulze -Ralph Ullrich -David Ezzio -Brendan de Beer -David Eaves -Martin Taal -Tony Lai -Roland Szabo -Marcus Mennemeier -Xuan Baldauf -Eric Sultan - -=================================================================== -This product also includes software developed by the TJDO project -(http://tjdo.sourceforge.net/). -=================================================================== - -=================================================================== -This product includes software developed by many individuals, -including the following: -=================================================================== -Andy Jefferson -Erik Bengtson -Joerg von Frantzius -Marco Schulze - -=================================================================== -This product has included contributions from some individuals, -including the following: -=================================================================== -Barry Haddow -Ralph Ullrich -David Ezzio -Brendan de Beer -David Eaves -Martin Taal -Tony Lai -Roland Szabo -Anton Troshin (Timesten) - -=================================================================== -This product also includes software developed by the Apache Commons project -(http://commons.apache.org/). 
-=================================================================== - -Apache Java Data Objects (JDO) -Copyright 2005-2006 The Apache Software Foundation - -========================================================================= -== NOTICE file corresponding to section 4(d) of the Apache License, == -== Version 2.0, in this case for the Apache Derby distribution. == -========================================================================= - -Apache Derby -Copyright 2004-2008 The Apache Software Foundation - -Portions of Derby were originally developed by -International Business Machines Corporation and are -licensed to the Apache Software Foundation under the -"Software Grant and Corporate Contribution License Agreement", -informally known as the "Derby CLA". -The following copyright notice(s) were affixed to portions of the code -with which this file is now or was at one time distributed -and are placed here unaltered. - -(C) Copyright 1997,2004 International Business Machines Corporation. All rights reserved. - -(C) Copyright IBM Corp. 2003. - -The portion of the functionTests under 'nist' was originally -developed by the National Institute of Standards and Technology (NIST), -an agency of the United States Department of Commerce, and adapted by -International Business Machines Corporation in accordance with the NIST -Software Acknowledgment and Redistribution document at -http://www.itl.nist.gov/div897/ctg/sql_form.htm - -Apache Commons Collections -Copyright 2001-2008 The Apache Software Foundation - -Apache Commons Configuration -Copyright 2001-2008 The Apache Software Foundation - -Apache Jakarta Commons Digester -Copyright 2001-2006 The Apache Software Foundation - -Apache Commons BeanUtils -Copyright 2000-2008 The Apache Software Foundation - -Apache Avro Mapred API -Copyright 2009-2013 The Apache Software Foundation - -Apache Avro IPC -Copyright 2009-2013 The Apache Software Foundation - - -Vis.js -Copyright 2010-2015 Almende B.V. - -Vis.js is dual licensed under both - - * The Apache 2.0 License - http://www.apache.org/licenses/LICENSE-2.0 - - and - - * The MIT License - http://opensource.org/licenses/MIT - -Vis.js may be distributed under either license. - - -Vis.js uses and redistributes the following third-party libraries: - -- component-emitter - https://github.com/component/emitter - The MIT License - -- hammer.js - http://hammerjs.github.io/ - The MIT License - -- moment.js - http://momentjs.com/ - The MIT License - -- keycharm - https://github.com/AlexDM0/keycharm - The MIT License - -=============================================================================== - -The CSS style for the navigation sidebar of the documentation was originally -submitted by Óscar Nájera for the scikit-learn project. The scikit-learn project -is distributed under the 3-Clause BSD license. -=============================================================================== - -For CSV functionality: - -/* - * Copyright 2014 Databricks - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * Copyright 2015 Ayasdi Inc - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -=============================================================================== -For dev/sparktestsupport/toposort.py: - -Copyright 2014 True Blade Systems, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. diff --git a/NOTICE-binary b/NOTICE-binary new file mode 100644 index 0000000000000..d56f99bdb55a6 --- /dev/null +++ b/NOTICE-binary @@ -0,0 +1,1170 @@ +Apache Spark +Copyright 2014 and onwards The Apache Software Foundation. + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + + +// ------------------------------------------------------------------ +// NOTICE file corresponding to the section 4d of The Apache License, +// Version 2.0, in this case for +// ------------------------------------------------------------------ + +Hive Beeline +Copyright 2016 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Apache Avro +Copyright 2009-2014 The Apache Software Foundation + +This product currently only contains code developed by authors +of specific components, as identified by the source code files; +if such notes are missing files have been created by +Tatu Saloranta. + +For additional credits (generally to people who reported problems) +see CREDITS file. + +Apache Commons Compress +Copyright 2002-2012 The Apache Software Foundation + +This product includes software developed by +The Apache Software Foundation (http://www.apache.org/). + +Apache Avro Mapred API +Copyright 2009-2014 The Apache Software Foundation + +Apache Avro IPC +Copyright 2009-2014 The Apache Software Foundation + +Objenesis +Copyright 2006-2013 Joe Walnes, Henri Tremblay, Leonardo Mesquita + +Apache XBean :: ASM 5 shaded (repackaged) +Copyright 2005-2015 The Apache Software Foundation + +-------------------------------------- + +This product includes software developed at +OW2 Consortium (http://asm.ow2.org/) + +This product includes software developed by The Apache Software +Foundation (http://www.apache.org/). + +The binary distribution of this product bundles binaries of +org.iq80.leveldb:leveldb-api (https://github.com/dain/leveldb), which has the +following notices: +* Copyright 2011 Dain Sundstrom +* Copyright 2011 FuseSource Corp. 
http://fusesource.com + +The binary distribution of this product bundles binaries of +org.fusesource.hawtjni:hawtjni-runtime (https://github.com/fusesource/hawtjni), +which has the following notices: +* This product includes software developed by FuseSource Corp. + http://fusesource.com +* This product includes software developed at + Progress Software Corporation and/or its subsidiaries or affiliates. +* This product includes software developed by IBM Corporation and others. + +The binary distribution of this product bundles binaries of +Gson 2.2.4, +which has the following notices: + + The Netty Project + ================= + +Please visit the Netty web site for more information: + + * http://netty.io/ + +Copyright 2014 The Netty Project + +The Netty Project licenses this file to you under the Apache License, +version 2.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at: + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +License for the specific language governing permissions and limitations +under the License. + +Also, please refer to each LICENSE..txt file, which is located in +the 'license' directory of the distribution file, for the license terms of the +components that this product depends on. + +------------------------------------------------------------------------------- +This product contains the extensions to Java Collections Framework which has +been derived from the works by JSR-166 EG, Doug Lea, and Jason T. Greene: + + * LICENSE: + * license/LICENSE.jsr166y.txt (Public Domain) + * HOMEPAGE: + * http://gee.cs.oswego.edu/cgi-bin/viewcvs.cgi/jsr166/ + * http://viewvc.jboss.org/cgi-bin/viewvc.cgi/jbosscache/experimental/jsr166/ + +This product contains a modified version of Robert Harder's Public Domain +Base64 Encoder and Decoder, which can be obtained at: + + * LICENSE: + * license/LICENSE.base64.txt (Public Domain) + * HOMEPAGE: + * http://iharder.sourceforge.net/current/java/base64/ + +This product contains a modified portion of 'Webbit', an event based +WebSocket and HTTP server, which can be obtained at: + + * LICENSE: + * license/LICENSE.webbit.txt (BSD License) + * HOMEPAGE: + * https://github.com/joewalnes/webbit + +This product contains a modified portion of 'SLF4J', a simple logging +facade for Java, which can be obtained at: + + * LICENSE: + * license/LICENSE.slf4j.txt (MIT License) + * HOMEPAGE: + * http://www.slf4j.org/ + +This product contains a modified portion of 'ArrayDeque', written by Josh +Bloch of Google, Inc: + + * LICENSE: + * license/LICENSE.deque.txt (Public Domain) + +This product contains a modified portion of 'Apache Harmony', an open source +Java SE, which can be obtained at: + + * LICENSE: + * license/LICENSE.harmony.txt (Apache License 2.0) + * HOMEPAGE: + * http://archive.apache.org/dist/harmony/ + +This product contains a modified version of Roland Kuhn's ASL2 +AbstractNodeQueue, which is based on Dmitriy Vyukov's non-intrusive MPSC queue. 
+It can be obtained at: + + * LICENSE: + * license/LICENSE.abstractnodequeue.txt (Public Domain) + * HOMEPAGE: + * https://github.com/akka/akka/blob/wip-2.2.3-for-scala-2.11/akka-actor/src/main/java/akka/dispatch/AbstractNodeQueue.java + +This product contains a modified portion of 'jbzip2', a Java bzip2 compression +and decompression library written by Matthew J. Francis. It can be obtained at: + + * LICENSE: + * license/LICENSE.jbzip2.txt (MIT License) + * HOMEPAGE: + * https://code.google.com/p/jbzip2/ + +This product contains a modified portion of 'libdivsufsort', a C API library to construct +the suffix array and the Burrows-Wheeler transformed string for any input string of +a constant-size alphabet written by Yuta Mori. It can be obtained at: + + * LICENSE: + * license/LICENSE.libdivsufsort.txt (MIT License) + * HOMEPAGE: + * https://code.google.com/p/libdivsufsort/ + +This product contains a modified portion of Nitsan Wakart's 'JCTools', Java Concurrency Tools for the JVM, + which can be obtained at: + + * LICENSE: + * license/LICENSE.jctools.txt (ASL2 License) + * HOMEPAGE: + * https://github.com/JCTools/JCTools + +This product optionally depends on 'JZlib', a re-implementation of zlib in +pure Java, which can be obtained at: + + * LICENSE: + * license/LICENSE.jzlib.txt (BSD style License) + * HOMEPAGE: + * http://www.jcraft.com/jzlib/ + +This product optionally depends on 'Compress-LZF', a Java library for encoding and +decoding data in LZF format, written by Tatu Saloranta. It can be obtained at: + + * LICENSE: + * license/LICENSE.compress-lzf.txt (Apache License 2.0) + * HOMEPAGE: + * https://github.com/ning/compress + +This product optionally depends on 'lz4', a LZ4 Java compression +and decompression library written by Adrien Grand. It can be obtained at: + + * LICENSE: + * license/LICENSE.lz4.txt (Apache License 2.0) + * HOMEPAGE: + * https://github.com/jpountz/lz4-java + +This product optionally depends on 'lzma-java', a LZMA Java compression +and decompression library, which can be obtained at: + + * LICENSE: + * license/LICENSE.lzma-java.txt (Apache License 2.0) + * HOMEPAGE: + * https://github.com/jponge/lzma-java + +This product contains a modified portion of 'jfastlz', a Java port of FastLZ compression +and decompression library written by William Kinney. It can be obtained at: + + * LICENSE: + * license/LICENSE.jfastlz.txt (MIT License) + * HOMEPAGE: + * https://code.google.com/p/jfastlz/ + +This product contains a modified portion of and optionally depends on 'Protocol Buffers', Google's data +interchange format, which can be obtained at: + + * LICENSE: + * license/LICENSE.protobuf.txt (New BSD License) + * HOMEPAGE: + * http://code.google.com/p/protobuf/ + +This product optionally depends on 'Bouncy Castle Crypto APIs' to generate +a temporary self-signed X.509 certificate when the JVM does not provide the +equivalent functionality. 
It can be obtained at: + + * LICENSE: + * license/LICENSE.bouncycastle.txt (MIT License) + * HOMEPAGE: + * http://www.bouncycastle.org/ + +This product optionally depends on 'Snappy', a compression library produced +by Google Inc, which can be obtained at: + + * LICENSE: + * license/LICENSE.snappy.txt (New BSD License) + * HOMEPAGE: + * http://code.google.com/p/snappy/ + +This product optionally depends on 'JBoss Marshalling', an alternative Java +serialization API, which can be obtained at: + + * LICENSE: + * license/LICENSE.jboss-marshalling.txt (GNU LGPL 2.1) + * HOMEPAGE: + * http://www.jboss.org/jbossmarshalling + +This product optionally depends on 'Caliper', Google's micro- +benchmarking framework, which can be obtained at: + + * LICENSE: + * license/LICENSE.caliper.txt (Apache License 2.0) + * HOMEPAGE: + * http://code.google.com/p/caliper/ + +This product optionally depends on 'Apache Commons Logging', a logging +framework, which can be obtained at: + + * LICENSE: + * license/LICENSE.commons-logging.txt (Apache License 2.0) + * HOMEPAGE: + * http://commons.apache.org/logging/ + +This product optionally depends on 'Apache Log4J', a logging framework, which +can be obtained at: + + * LICENSE: + * license/LICENSE.log4j.txt (Apache License 2.0) + * HOMEPAGE: + * http://logging.apache.org/log4j/ + +This product optionally depends on 'Aalto XML', an ultra-high performance +non-blocking XML processor, which can be obtained at: + + * LICENSE: + * license/LICENSE.aalto-xml.txt (Apache License 2.0) + * HOMEPAGE: + * http://wiki.fasterxml.com/AaltoHome + +This product contains a modified version of 'HPACK', a Java implementation of +the HTTP/2 HPACK algorithm written by Twitter. It can be obtained at: + + * LICENSE: + * license/LICENSE.hpack.txt (Apache License 2.0) + * HOMEPAGE: + * https://github.com/twitter/hpack + +This product contains a modified portion of 'Apache Commons Lang', a Java library +provides utilities for the java.lang API, which can be obtained at: + + * LICENSE: + * license/LICENSE.commons-lang.txt (Apache License 2.0) + * HOMEPAGE: + * https://commons.apache.org/proper/commons-lang/ + +The binary distribution of this product bundles binaries of +Commons Codec 1.4, +which has the following notices: + * src/test/org/apache/commons/codec/language/DoubleMetaphoneTest.javacontains test data from http://aspell.net/test/orig/batch0.tab.Copyright (C) 2002 Kevin Atkinson (kevina@gnu.org) + =============================================================================== + The content of package org.apache.commons.codec.language.bm has been translated + from the original php source code available at http://stevemorse.org/phoneticinfo.htm + with permission from the original authors. + Original source copyright:Copyright (c) 2008 Alexander Beider & Stephen P. Morse. 
+ +The binary distribution of this product bundles binaries of +Commons Lang 2.6, +which has the following notices: + * This product includes software from the Spring Framework,under the Apache License 2.0 (see: StringUtils.containsWhitespace()) + +The binary distribution of this product bundles binaries of +Apache Log4j 1.2.17, +which has the following notices: + * ResolverUtil.java + Copyright 2005-2006 Tim Fennell + Dumbster SMTP test server + Copyright 2004 Jason Paul Kitchen + TypeUtil.java + Copyright 2002-2012 Ramnivas Laddad, Juergen Hoeller, Chris Beams + +The binary distribution of this product bundles binaries of +Jetty 6.1.26, +which has the following notices: + * ============================================================== + Jetty Web Container + Copyright 1995-2016 Mort Bay Consulting Pty Ltd. + ============================================================== + + The Jetty Web Container is Copyright Mort Bay Consulting Pty Ltd + unless otherwise noted. + + Jetty is dual licensed under both + + * The Apache 2.0 License + http://www.apache.org/licenses/LICENSE-2.0.html + + and + + * The Eclipse Public 1.0 License + http://www.eclipse.org/legal/epl-v10.html + + Jetty may be distributed under either license. + + ------ + Eclipse + + The following artifacts are EPL. + * org.eclipse.jetty.orbit:org.eclipse.jdt.core + + The following artifacts are EPL and ASL2. + * org.eclipse.jetty.orbit:javax.security.auth.message + + The following artifacts are EPL and CDDL 1.0. + * org.eclipse.jetty.orbit:javax.mail.glassfish + + ------ + Oracle + + The following artifacts are CDDL + GPLv2 with classpath exception. + https://glassfish.dev.java.net/nonav/public/CDDL+GPL.html + + * javax.servlet:javax.servlet-api + * javax.annotation:javax.annotation-api + * javax.transaction:javax.transaction-api + * javax.websocket:javax.websocket-api + + ------ + Oracle OpenJDK + + If ALPN is used to negotiate HTTP/2 connections, then the following + artifacts may be included in the distribution or downloaded when ALPN + module is selected. + + * java.sun.security.ssl + + These artifacts replace/modify OpenJDK classes. The modififications + are hosted at github and both modified and original are under GPL v2 with + classpath exceptions. + http://openjdk.java.net/legal/gplv2+ce.html + + ------ + OW2 + + The following artifacts are licensed by the OW2 Foundation according to the + terms of http://asm.ow2.org/license.html + + org.ow2.asm:asm-commons + org.ow2.asm:asm + + ------ + Apache + + The following artifacts are ASL2 licensed. + + org.apache.taglibs:taglibs-standard-spec + org.apache.taglibs:taglibs-standard-impl + + ------ + MortBay + + The following artifacts are ASL2 licensed. Based on selected classes from + following Apache Tomcat jars, all ASL2 licensed. + + org.mortbay.jasper:apache-jsp + org.apache.tomcat:tomcat-jasper + org.apache.tomcat:tomcat-juli + org.apache.tomcat:tomcat-jsp-api + org.apache.tomcat:tomcat-el-api + org.apache.tomcat:tomcat-jasper-el + org.apache.tomcat:tomcat-api + org.apache.tomcat:tomcat-util-scan + org.apache.tomcat:tomcat-util + + org.mortbay.jasper:apache-el + org.apache.tomcat:tomcat-jasper-el + org.apache.tomcat:tomcat-el-api + + ------ + Mortbay + + The following artifacts are CDDL + GPLv2 with classpath exception. + + https://glassfish.dev.java.net/nonav/public/CDDL+GPL.html + + org.eclipse.jetty.toolchain:jetty-schemas + + ------ + Assorted + + The UnixCrypt.java code implements the one way cryptography used by + Unix systems for simple password protection. 
Copyright 1996 Aki Yoshida, + modified April 2001 by Iris Van den Broeke, Daniel Deville. + Permission to use, copy, modify and distribute UnixCrypt + for non-commercial or commercial purposes and without fee is + granted provided that the copyright notice appears in all copies./ + +The binary distribution of this product bundles binaries of +Snappy for Java 1.0.4.1, +which has the following notices: + * This product includes software developed by Google + Snappy: http://code.google.com/p/snappy/ (New BSD License) + + This product includes software developed by Apache + PureJavaCrc32C from apache-hadoop-common http://hadoop.apache.org/ + (Apache 2.0 license) + + This library containd statically linked libstdc++. This inclusion is allowed by + "GCC RUntime Library Exception" + http://gcc.gnu.org/onlinedocs/libstdc++/manual/license.html + + == Contributors == + * Tatu Saloranta + * Providing benchmark suite + * Alec Wysoker + * Performance and memory usage improvement + +The binary distribution of this product bundles binaries of +Xerces2 Java Parser 2.9.1, +which has the following notices: + * ========================================================================= + == NOTICE file corresponding to section 4(d) of the Apache License, == + == Version 2.0, in this case for the Apache Xerces Java distribution. == + ========================================================================= + + Apache Xerces Java + Copyright 1999-2007 The Apache Software Foundation + + This product includes software developed at + The Apache Software Foundation (http://www.apache.org/). + + Portions of this software were originally based on the following: + - software copyright (c) 1999, IBM Corporation., http://www.ibm.com. + - software copyright (c) 1999, Sun Microsystems., http://www.sun.com. + - voluntary contributions made by Paul Eng on behalf of the + Apache Software Foundation that were originally developed at iClick, Inc., + software copyright (c) 1999. + +Apache Commons Collections +Copyright 2001-2015 The Apache Software Foundation + +Apache Commons Configuration +Copyright 2001-2008 The Apache Software Foundation + +Apache Jakarta Commons Digester +Copyright 2001-2006 The Apache Software Foundation + +Apache Commons BeanUtils +Copyright 2000-2008 The Apache Software Foundation + +ApacheDS Protocol Kerberos Codec +Copyright 2003-2013 The Apache Software Foundation + +ApacheDS I18n +Copyright 2003-2013 The Apache Software Foundation + +Apache Directory API ASN.1 API +Copyright 2003-2013 The Apache Software Foundation + +Apache Directory LDAP API Utilities +Copyright 2003-2013 The Apache Software Foundation + +Curator Client +Copyright 2011-2015 The Apache Software Foundation + +htrace-core +Copyright 2015 The Apache Software Foundation + + ========================================================================= + == NOTICE file corresponding to section 4(d) of the Apache License, == + == Version 2.0, in this case for the Apache Xerces Java distribution. == + ========================================================================= + + Portions of this software were originally based on the following: + - software copyright (c) 1999, IBM Corporation., http://www.ibm.com. + - software copyright (c) 1999, Sun Microsystems., http://www.sun.com. + - voluntary contributions made by Paul Eng on behalf of the + Apache Software Foundation that were originally developed at iClick, Inc., + software copyright (c) 1999. 
+ +# Jackson JSON processor + +Jackson is a high-performance, Free/Open Source JSON processing library. +It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has +been in development since 2007. +It is currently developed by a community of developers, as well as supported +commercially by FasterXML.com. + +## Licensing + +Jackson core and extension components may licensed under different licenses. +To find the details that apply to this artifact see the accompanying LICENSE file. +For more information, including possible other licensing options, contact +FasterXML.com (http://fasterxml.com). + +## Credits + +A list of contributors may be found from CREDITS file, which is included +in some artifacts (usually source distributions); but is always available +from the source code management (SCM) system project uses. + +Apache HttpCore +Copyright 2005-2017 The Apache Software Foundation + +Curator Recipes +Copyright 2011-2015 The Apache Software Foundation + +Curator Framework +Copyright 2011-2015 The Apache Software Foundation + +Apache Commons Lang +Copyright 2001-2016 The Apache Software Foundation + +This product includes software from the Spring Framework, +under the Apache License 2.0 (see: StringUtils.containsWhitespace()) + +Apache Commons Math +Copyright 2001-2015 The Apache Software Foundation + +This product includes software developed for Orekit by +CS Systèmes d'Information (http://www.c-s.fr/) +Copyright 2010-2012 CS Systèmes d'Information + +Apache log4j +Copyright 2007 The Apache Software Foundation + +# Compress LZF + +This library contains efficient implementation of LZF compression format, +as well as additional helper classes that build on JDK-provided gzip (deflat) +codec. + +Library is licensed under Apache License 2.0, as per accompanying LICENSE file. + +## Credit + +Library has been written by Tatu Saloranta (tatu.saloranta@iki.fi). +It was started at Ning, inc., as an official Open Source process used by +platform backend, but after initial versions has been developed outside of +Ning by supporting community. + +Other contributors include: + +* Jon Hartlaub (first versions of streaming reader/writer; unit tests) +* Cedrik Lime: parallel LZF implementation + +Various community members have contributed bug reports, and suggested minor +fixes; these can be found from file "VERSION.txt" in SCM. 
+ +Apache Commons Net +Copyright 2001-2012 The Apache Software Foundation + +Copyright 2011 The Netty Project + +http://www.apache.org/licenses/LICENSE-2.0 + +This product contains a modified version of 'JZlib', a re-implementation of +zlib in pure Java, which can be obtained at: + + * LICENSE: + * license/LICENSE.jzlib.txt (BSD Style License) + * HOMEPAGE: + * http://www.jcraft.com/jzlib/ + +This product contains a modified version of 'Webbit', a Java event based +WebSocket and HTTP server: + +This product optionally depends on 'Protocol Buffers', Google's data +interchange format, which can be obtained at: + +This product optionally depends on 'SLF4J', a simple logging facade for Java, +which can be obtained at: + +This product optionally depends on 'Apache Log4J', a logging framework, +which can be obtained at: + +This product optionally depends on 'JBoss Logging', a logging framework, +which can be obtained at: + + * LICENSE: + * license/LICENSE.jboss-logging.txt (GNU LGPL 2.1) + * HOMEPAGE: + * http://anonsvn.jboss.org/repos/common/common-logging-spi/ + +This product optionally depends on 'Apache Felix', an open source OSGi +framework implementation, which can be obtained at: + + * LICENSE: + * license/LICENSE.felix.txt (Apache License 2.0) + * HOMEPAGE: + * http://felix.apache.org/ + +Jackson core and extension components may be licensed under different licenses. +To find the details that apply to this artifact see the accompanying LICENSE file. +For more information, including possible other licensing options, contact +FasterXML.com (http://fasterxml.com). + +Apache Ivy (TM) +Copyright 2007-2014 The Apache Software Foundation + +Portions of Ivy were originally developed at +Jayasoft SARL (http://www.jayasoft.fr/) +and are licensed to the Apache Software Foundation under the +"Software Grant License Agreement" + +SSH and SFTP support is provided by the JCraft JSch package, +which is open source software, available under +the terms of a BSD style license. +The original software and related information is available +at http://www.jcraft.com/jsch/. + + +ORC Core +Copyright 2013-2018 The Apache Software Foundation + +Apache Commons Lang +Copyright 2001-2011 The Apache Software Foundation + +ORC MapReduce +Copyright 2013-2018 The Apache Software Foundation + +Apache Parquet Format +Copyright 2017 The Apache Software Foundation + +Arrow Vectors +Copyright 2017 The Apache Software Foundation + +Arrow Format +Copyright 2017 The Apache Software Foundation + +Arrow Memory +Copyright 2017 The Apache Software Foundation + +Apache Commons CLI +Copyright 2001-2009 The Apache Software Foundation + +Google Guice - Extensions - Servlet +Copyright 2006-2011 Google, Inc. + +Apache Commons IO +Copyright 2002-2012 The Apache Software Foundation + +Google Guice - Core Library +Copyright 2006-2011 Google, Inc. + +mesos +Copyright 2017 The Apache Software Foundation + +Apache Parquet Hadoop Bundle (Incubating) +Copyright 2015 The Apache Software Foundation + +Hive Query Language +Copyright 2016 The Apache Software Foundation + +Apache Extras Companion for log4j 1.2. +Copyright 2007 The Apache Software Foundation + +Hive Metastore +Copyright 2016 The Apache Software Foundation + +Apache Commons Logging +Copyright 2003-2013 The Apache Software Foundation + +========================================================================= +== NOTICE file corresponding to section 4(d) of the Apache License, == +== Version 2.0, in this case for the DataNucleus distribution. 
== +========================================================================= + +=================================================================== +This product includes software developed by many individuals, +including the following: +=================================================================== +Erik Bengtson +Andy Jefferson + +=================================================================== +This product has included contributions from some individuals, +including the following: +=================================================================== + +=================================================================== +This product includes software developed by many individuals, +including the following: +=================================================================== +Andy Jefferson +Erik Bengtson +Joerg von Frantzius +Marco Schulze + +=================================================================== +This product has included contributions from some individuals, +including the following: +=================================================================== +Barry Haddow +Ralph Ullrich +David Ezzio +Brendan de Beer +David Eaves +Martin Taal +Tony Lai +Roland Szabo +Anton Troshin (Timesten) + +=================================================================== +This product also includes software developed by the TJDO project +(http://tjdo.sourceforge.net/). +=================================================================== + +=================================================================== +This product also includes software developed by the Apache Commons project +(http://commons.apache.org/). +=================================================================== + +Apache Commons Pool +Copyright 1999-2009 The Apache Software Foundation + +Apache Commons DBCP +Copyright 2001-2010 The Apache Software Foundation + +Apache Java Data Objects (JDO) +Copyright 2005-2006 The Apache Software Foundation + +Apache Jakarta HttpClient +Copyright 1999-2007 The Apache Software Foundation + +Calcite Avatica +Copyright 2012-2015 The Apache Software Foundation + +Calcite Core +Copyright 2012-2015 The Apache Software Foundation + +Calcite Linq4j +Copyright 2012-2015 The Apache Software Foundation + +Apache HttpClient +Copyright 1999-2017 The Apache Software Foundation + +Apache Commons Codec +Copyright 2002-2014 The Apache Software Foundation + +src/test/org/apache/commons/codec/language/DoubleMetaphoneTest.java +contains test data from http://aspell.net/test/orig/batch0.tab. +Copyright (C) 2002 Kevin Atkinson (kevina@gnu.org) + +=============================================================================== + +The content of package org.apache.commons.codec.language.bm has been translated +from the original php source code available at http://stevemorse.org/phoneticinfo.htm +with permission from the original authors. +Original source copyright: +Copyright (c) 2008 Alexander Beider & Stephen P. Morse. + +============================================================================= += NOTICE file corresponding to section 4d of the Apache License Version 2.0 = +============================================================================= +This product includes software developed by +Joda.org (http://www.joda.org/). 
+ +=================================================================== +This product has included contributions from some individuals, +including the following: +=================================================================== +Joerg von Frantzius +Thomas Marti +Barry Haddow +Marco Schulze +Ralph Ullrich +David Ezzio +Brendan de Beer +David Eaves +Martin Taal +Tony Lai +Roland Szabo +Marcus Mennemeier +Xuan Baldauf +Eric Sultan + +Apache Thrift +Copyright 2006-2010 The Apache Software Foundation. + +========================================================================= +== NOTICE file corresponding to section 4(d) of the Apache License, +== Version 2.0, in this case for the Apache Derby distribution. +== +== DO NOT EDIT THIS FILE DIRECTLY. IT IS GENERATED +== BY THE buildnotice TARGET IN THE TOP LEVEL build.xml FILE. +== +========================================================================= + +Apache Derby +Copyright 2004-2015 The Apache Software Foundation + +========================================================================= + +Portions of Derby were originally developed by +International Business Machines Corporation and are +licensed to the Apache Software Foundation under the +"Software Grant and Corporate Contribution License Agreement", +informally known as the "Derby CLA". +The following copyright notice(s) were affixed to portions of the code +with which this file is now or was at one time distributed +and are placed here unaltered. + +(C) Copyright 1997,2004 International Business Machines Corporation. All rights reserved. + +(C) Copyright IBM Corp. 2003. + +The portion of the functionTests under 'nist' was originally +developed by the National Institute of Standards and Technology (NIST), +an agency of the United States Department of Commerce, and adapted by +International Business Machines Corporation in accordance with the NIST +Software Acknowledgment and Redistribution document at +http://www.itl.nist.gov/div897/ctg/sql_form.htm + +The JDBC apis for small devices and JDBC3 (under java/stubs/jsr169 and +java/stubs/jdbc3) were produced by trimming sources supplied by the +Apache Harmony project. In addition, the Harmony SerialBlob and +SerialClob implementations are used. The following notice covers the Harmony sources: + +Portions of Harmony were originally developed by +Intel Corporation and are licensed to the Apache Software +Foundation under the "Software Grant and Corporate Contribution +License Agreement", informally known as the "Intel Harmony CLA". + +The Derby build relies on source files supplied by the Apache Felix +project. The following notice covers the Felix files: + + Apache Felix Main + Copyright 2008 The Apache Software Foundation + + I. Included Software + + This product includes software developed at + The Apache Software Foundation (http://www.apache.org/). + Licensed under the Apache License 2.0. + + This product includes software developed at + The OSGi Alliance (http://www.osgi.org/). + Copyright (c) OSGi Alliance (2000, 2007). + Licensed under the Apache License 2.0. + + This product includes software from http://kxml.sourceforge.net. + Copyright (c) 2002,2003, Stefan Haustein, Oberhausen, Rhld., Germany. + Licensed under BSD License. + + II. Used Software + + This product uses software developed at + The OSGi Alliance (http://www.osgi.org/). + Copyright (c) OSGi Alliance (2000, 2007). + Licensed under the Apache License 2.0. + + III. 
License Summary + - Apache License 2.0 + - BSD License + +The Derby build relies on jar files supplied by the Apache Lucene +project. The following notice covers the Lucene files: + +Apache Lucene +Copyright 2013 The Apache Software Foundation + +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Ant + - Apache Jakarta Regexp + - Apache Commons + - Apache Xerces + +ICU4J, (under analysis/icu) is licensed under an MIT styles license +and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Some data files (under analysis/icu/src/data) are derived from Unicode data such +as the Unicode Character Database. See http://unicode.org/copyright.html for more +details. + +Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ + +The class org.apache.lucene.util.WeakIdentityMap was derived from +the Apache CXF project and is Apache License 2.0. + +The Google Code Prettify is Apache License 2.0. +See http://code.google.com/p/google-code-prettify/ + +JUnit (junit-4.10) is licensed under the Common Public License v. 1.0 +See http://junit.sourceforge.net/cpl-v10.html + +This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin +g Package (jaspell): http://jaspell.sourceforge.net/ +License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) + +The snowball stemmers in + analysis/common/src/java/net/sf/snowball +were developed by Martin Porter and Richard Boulton. +The snowball stopword lists in + analysis/common/src/resources/org/apache/lucene/analysis/snowball +were developed by Martin Porter and Richard Boulton. +The full snowball package is available from + http://snowball.tartarus.org/ + +The KStem stemmer in + analysis/common/src/org/apache/lucene/analysis/en +was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) +under the BSD-license. + +The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default +stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: +analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt +See http://members.unine.ch/jacques.savoy/clef/index.html. + +The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. 
These files reside in: +analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java + +The Stempel analyzer (stempel) includes BSD-licensed software developed +by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, +and Edmond Nolan. + +The Polish analyzer (stempel) comes with a default +stopword list that is BSD-licensed created by the Carrot2 project. The file resides +in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. +See http://project.carrot2.org/license.html. + +The SmartChineseAnalyzer source code (smartcn) was +provided by Xiaoping Gao and copyright 2009 by www.imdict.net. + +WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) +is derived from Unicode data such as the Unicode Character Database. +See http://unicode.org/copyright.html for more details. + +The Morfologik analyzer (morfologik) includes BSD-licensed software +developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/). + +Morfologik uses data from Polish ispell/myspell dictionary +(http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia) +LGPL and Creative Commons ShareAlike. + +Morfologic includes data from BSD-licensed dictionary of Polish (SGJP) +(http://sgjp.pl/morfeusz/) + +Servlet-api.jar and javax.servlet-*.jar are under the CDDL license, the original +source code for this can be found at http://www.eclipse.org/jetty/downloads.php + +=========================================================================== +Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ipadic-2.7.0-20070801 + +which can be obtained from + + http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz + +or + + http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz + +=========================================================================== +mecab-ipadic-2.7.0-20070801 Notice +=========================================================================== + +Nara Institute of Science and Technology (NAIST), +the copyright holders, disclaims all warranties with regard to this +software, including all implied warranties of merchantability and +fitness, in no event shall NAIST be liable for +any special, indirect or consequential damages or any damages +whatsoever resulting from loss of use, data or profits, whether in an +action of contract, negligence or other tortuous action, arising out +of or in connection with the use or performance of this software. 
+ +A large portion of the dictionary entries +originate from ICOT Free Software. The following conditions for ICOT +Free Software applies to the current dictionary as well. + +Each User may also freely distribute the Program, whether in its +original form or modified, to any third party or parties, PROVIDED +that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear +on, or be attached to, the Program, which is distributed substantially +in the same form as set out herein and that such intended +distribution, if actually made, will neither violate or otherwise +contravene any of the laws and regulations of the countries having +jurisdiction over the User or the intended distribution itself. + +NO WARRANTY + +The program was produced on an experimental basis in the course of the +research and development conducted during the project and is provided +to users as so produced on an experimental basis. Accordingly, the +program is provided without any warranty whatsoever, whether express, +implied, statutory or otherwise. The term "warranty" used herein +includes, but is not limited to, any warranty of the quality, +performance, merchantability and fitness for a particular purpose of +the program and the nonexistence of any infringement or violation of +any right of any third party. + +Each user of the program will agree and understand, and be deemed to +have agreed and understood, that there is no warranty whatsoever for +the program and, accordingly, the entire risk arising from or +otherwise connected with the program is assumed by the user. + +Therefore, neither ICOT, the copyright holder, or any other +organization that participated in or was otherwise related to the +development of the program and their respective officials, directors, +officers and other employees shall be held liable for any and all +damages, including, without limitation, general, special, incidental +and consequential damages, arising out of or otherwise in connection +with the use or inability to use the program or any product, material +or result produced or otherwise obtained by using the program, +regardless of whether they have been advised of, or otherwise had +knowledge of, the possibility of such damages at any time during the +project or thereafter. Each user will be deemed to have agreed to the +foregoing by his or her commencement of use of the program. The term +"use" as used herein includes, but is not limited to, the use, +modification, copying and distribution of the program and the +production of secondary products from the program. + +In the case where the program, whether in its original form or +modified, was distributed or delivered to or received by a user from +any person, organization or entity other than ICOT, unless it makes or +grants independently of ICOT any specific warranty to the user in +writing, such person, organization or entity, will also be exempted +from and not be held liable to the user for any such damages as noted +above as far as the program is concerned. + +The Derby build relies on a jar file supplied by the JSON Simple +project, hosted at https://code.google.com/p/json-simple/. +The JSON simple jar file is licensed under the Apache 2.0 License. + +Hive CLI +Copyright 2016 The Apache Software Foundation + +Hive JDBC +Copyright 2016 The Apache Software Foundation + + +Chill is a set of Scala extensions for Kryo. +Copyright 2012 Twitter, Inc. 
+ +Third Party Dependencies: + +Kryo 2.17 +BSD 3-Clause License +http://code.google.com/p/kryo + +Commons-Codec 1.7 +Apache Public License 2.0 +http://hadoop.apache.org + + + +Breeze is distributed under an Apache License V2.0 (See LICENSE) + +=============================================================================== + +Proximal algorithms outlined in Proximal.scala (package breeze.optimize.proximal) +are based on https://github.com/cvxgrp/proximal (see LICENSE for details) and distributed with +Copyright (c) 2014 by Debasish Das (Verizon), all rights reserved. + +=============================================================================== + +QuadraticMinimizer class in package breeze.optimize.proximal is distributed with Copyright (c) +2014, Debasish Das (Verizon), all rights reserved. + +=============================================================================== + +NonlinearMinimizer class in package breeze.optimize.proximal is distributed with Copyright (c) +2015, Debasish Das (Verizon), all rights reserved. + + + ========================================================================= + == NOTICE file corresponding to section 4(d) of the Apache License, == + == Version 2.0, in this case for the distribution of jets3t. == + ========================================================================= + + This product includes software developed by: + + The Apache Software Foundation (http://www.apache.org/). + + The ExoLab Project (http://www.exolab.org/) + + Sun Microsystems (http://www.sun.com/) + + Codehaus (http://castor.codehaus.org) + + Tatu Saloranta (http://wiki.fasterxml.com/TatuSaloranta) + + + +stream-lib +Copyright 2016 AddThis + +This product includes software developed by AddThis. + +This product also includes code adapted from: + +Apache Solr (http://lucene.apache.org/solr/) +Copyright 2014 The Apache Software Foundation + +Apache Mahout (http://mahout.apache.org/) +Copyright 2014 The Apache Software Foundation \ No newline at end of file diff --git a/dev/.rat-excludes b/dev/.rat-excludes index 23b24212b4d29..466135e72233a 100644 --- a/dev/.rat-excludes +++ b/dev/.rat-excludes @@ -11,6 +11,10 @@ cache .rat-excludes .*md derby.log +licenses/* +licenses-binary/* +LICENSE +NOTICE TAGS RELEASE control diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index 84233c64caa9c..ad99ce55806af 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -211,9 +211,10 @@ mkdir -p "$DISTDIR/examples/src/main" cp -r "$SPARK_HOME/examples/src/main" "$DISTDIR/examples/src/" # Copy license and ASF files -cp "$SPARK_HOME/LICENSE" "$DISTDIR" -cp -r "$SPARK_HOME/licenses" "$DISTDIR" -cp "$SPARK_HOME/NOTICE" "$DISTDIR" +cp "$SPARK_HOME/LICENSE-binary" "$DISTDIR/LICENSE" +mkdir -p "$DISTDIR/licenses" +cp -r "$SPARK_HOME/licenses-binary" "$DISTDIR/licenses" +cp "$SPARK_HOME/NOTICE-binary" "$DISTDIR/NOTICE" if [ -e "$SPARK_HOME/CHANGES.txt" ]; then cp "$SPARK_HOME/CHANGES.txt" "$DISTDIR" diff --git a/licenses/LICENSE-scopt.txt b/licenses-binary/LICENSE-AnchorJS.txt similarity index 100% rename from licenses/LICENSE-scopt.txt rename to licenses-binary/LICENSE-AnchorJS.txt diff --git a/licenses-binary/LICENSE-CC0.txt b/licenses-binary/LICENSE-CC0.txt new file mode 100644 index 0000000000000..1625c17936079 --- /dev/null +++ b/licenses-binary/LICENSE-CC0.txt @@ -0,0 +1,121 @@ +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. 
DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. 
To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. 
+ Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. \ No newline at end of file diff --git a/licenses/LICENSE-antlr.txt b/licenses-binary/LICENSE-antlr.txt similarity index 100% rename from licenses/LICENSE-antlr.txt rename to licenses-binary/LICENSE-antlr.txt diff --git a/licenses-binary/LICENSE-arpack.txt b/licenses-binary/LICENSE-arpack.txt new file mode 100644 index 0000000000000..a3ad80087bb63 --- /dev/null +++ b/licenses-binary/LICENSE-arpack.txt @@ -0,0 +1,8 @@ +Copyright © 2018 The University of Tennessee. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: +· Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +· Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer listed in this license in the documentation and/or other materials provided with the distribution. +· Neither the name of the copyright holders nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +This software is provided by the copyright holders and contributors "as is" and any express or implied warranties, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose are disclaimed. in no event shall the copyright owner or contributors be liable for any direct, indirect, incidental, special, exemplary, or consequential damages (including, but not limited to, procurement of substitute goods or services; loss of use, data, or profits; or business interruption) however caused and on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) arising in any way out of the use of this software, even if advised of the possibility of such damage. \ No newline at end of file diff --git a/licenses-binary/LICENSE-automaton.txt b/licenses-binary/LICENSE-automaton.txt new file mode 100644 index 0000000000000..2fc6e8c3432f0 --- /dev/null +++ b/licenses-binary/LICENSE-automaton.txt @@ -0,0 +1,24 @@ +Copyright (c) 2001-2017 Anders Moeller +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses-binary/LICENSE-bootstrap.txt b/licenses-binary/LICENSE-bootstrap.txt new file mode 100644 index 0000000000000..6c711832fbc85 --- /dev/null +++ b/licenses-binary/LICENSE-bootstrap.txt @@ -0,0 +1,13 @@ +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/licenses-binary/LICENSE-bouncycastle-bcprov.txt b/licenses-binary/LICENSE-bouncycastle-bcprov.txt new file mode 100644 index 0000000000000..c445a93a06dd4 --- /dev/null +++ b/licenses-binary/LICENSE-bouncycastle-bcprov.txt @@ -0,0 +1,7 @@ +Copyright (c) 2000 - 2018 The Legion of the Bouncy Castle Inc. (https://www.bouncycastle.org) + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/licenses-binary/LICENSE-cloudpickle.txt b/licenses-binary/LICENSE-cloudpickle.txt new file mode 100644 index 0000000000000..b1e20fa1eda88 --- /dev/null +++ b/licenses-binary/LICENSE-cloudpickle.txt @@ -0,0 +1,28 @@ +Copyright (c) 2012, Regents of the University of California. +Copyright (c) 2009 `PiCloud, Inc. `_. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of the University of California, Berkeley nor the + names of its contributors may be used to endorse or promote + products derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses-binary/LICENSE-d3.min.js.txt b/licenses-binary/LICENSE-d3.min.js.txt new file mode 100644 index 0000000000000..c71e3f254c068 --- /dev/null +++ b/licenses-binary/LICENSE-d3.min.js.txt @@ -0,0 +1,26 @@ +Copyright (c) 2010-2015, Michael Bostock +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* The name Michael Bostock may not be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-Mockito.txt b/licenses-binary/LICENSE-dagre-d3.txt similarity index 93% rename from licenses/LICENSE-Mockito.txt rename to licenses-binary/LICENSE-dagre-d3.txt index e0840a446caf5..4864fe05e9803 100644 --- a/licenses/LICENSE-Mockito.txt +++ b/licenses-binary/LICENSE-dagre-d3.txt @@ -1,6 +1,4 @@ -The MIT License - -Copyright (c) 2007 Mockito contributors +Copyright (c) 2013 Chris Pettitt Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/licenses-binary/LICENSE-datatables.txt b/licenses-binary/LICENSE-datatables.txt new file mode 100644 index 0000000000000..bb7708b5b5a49 --- /dev/null +++ b/licenses-binary/LICENSE-datatables.txt @@ -0,0 +1,7 @@ +Copyright (C) 2008-2018, SpryMedia Ltd. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/licenses/LICENSE-f2j.txt b/licenses-binary/LICENSE-f2j.txt similarity index 100% rename from licenses/LICENSE-f2j.txt rename to licenses-binary/LICENSE-f2j.txt diff --git a/licenses-binary/LICENSE-graphlib-dot.txt b/licenses-binary/LICENSE-graphlib-dot.txt new file mode 100644 index 0000000000000..4864fe05e9803 --- /dev/null +++ b/licenses-binary/LICENSE-graphlib-dot.txt @@ -0,0 +1,19 @@ +Copyright (c) 2013 Chris Pettitt + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff --git a/licenses-binary/LICENSE-heapq.txt b/licenses-binary/LICENSE-heapq.txt new file mode 100644 index 0000000000000..0c4c4b954bea4 --- /dev/null +++ b/licenses-binary/LICENSE-heapq.txt @@ -0,0 +1,280 @@ + +# A. HISTORY OF THE SOFTWARE +# ========================== +# +# Python was created in the early 1990s by Guido van Rossum at Stichting +# Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands +# as a successor of a language called ABC. Guido remains Python's +# principal author, although it includes many contributions from others. +# +# In 1995, Guido continued his work on Python at the Corporation for +# National Research Initiatives (CNRI, see http://www.cnri.reston.va.us) +# in Reston, Virginia where he released several versions of the +# software. +# +# In May 2000, Guido and the Python core development team moved to +# BeOpen.com to form the BeOpen PythonLabs team. 
In October of the same +# year, the PythonLabs team moved to Digital Creations (now Zope +# Corporation, see http://www.zope.com). In 2001, the Python Software +# Foundation (PSF, see http://www.python.org/psf/) was formed, a +# non-profit organization created specifically to own Python-related +# Intellectual Property. Zope Corporation is a sponsoring member of +# the PSF. +# +# All Python releases are Open Source (see http://www.opensource.org for +# the Open Source Definition). Historically, most, but not all, Python +# releases have also been GPL-compatible; the table below summarizes +# the various releases. +# +# Release Derived Year Owner GPL- +# from compatible? (1) +# +# 0.9.0 thru 1.2 1991-1995 CWI yes +# 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes +# 1.6 1.5.2 2000 CNRI no +# 2.0 1.6 2000 BeOpen.com no +# 1.6.1 1.6 2001 CNRI yes (2) +# 2.1 2.0+1.6.1 2001 PSF no +# 2.0.1 2.0+1.6.1 2001 PSF yes +# 2.1.1 2.1+2.0.1 2001 PSF yes +# 2.2 2.1.1 2001 PSF yes +# 2.1.2 2.1.1 2002 PSF yes +# 2.1.3 2.1.2 2002 PSF yes +# 2.2.1 2.2 2002 PSF yes +# 2.2.2 2.2.1 2002 PSF yes +# 2.2.3 2.2.2 2003 PSF yes +# 2.3 2.2.2 2002-2003 PSF yes +# 2.3.1 2.3 2002-2003 PSF yes +# 2.3.2 2.3.1 2002-2003 PSF yes +# 2.3.3 2.3.2 2002-2003 PSF yes +# 2.3.4 2.3.3 2004 PSF yes +# 2.3.5 2.3.4 2005 PSF yes +# 2.4 2.3 2004 PSF yes +# 2.4.1 2.4 2005 PSF yes +# 2.4.2 2.4.1 2005 PSF yes +# 2.4.3 2.4.2 2006 PSF yes +# 2.4.4 2.4.3 2006 PSF yes +# 2.5 2.4 2006 PSF yes +# 2.5.1 2.5 2007 PSF yes +# 2.5.2 2.5.1 2008 PSF yes +# 2.5.3 2.5.2 2008 PSF yes +# 2.6 2.5 2008 PSF yes +# 2.6.1 2.6 2008 PSF yes +# 2.6.2 2.6.1 2009 PSF yes +# 2.6.3 2.6.2 2009 PSF yes +# 2.6.4 2.6.3 2009 PSF yes +# 2.6.5 2.6.4 2010 PSF yes +# 2.7 2.6 2010 PSF yes +# +# Footnotes: +# +# (1) GPL-compatible doesn't mean that we're distributing Python under +# the GPL. All Python licenses, unlike the GPL, let you distribute +# a modified version without making your changes open source. The +# GPL-compatible licenses make it possible to combine Python with +# other software that is released under the GPL; the others don't. +# +# (2) According to Richard Stallman, 1.6.1 is not GPL-compatible, +# because its license has a choice of law clause. According to +# CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1 +# is "not incompatible" with the GPL. +# +# Thanks to the many outside volunteers who have worked under Guido's +# direction to make these releases possible. +# +# +# B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON +# =============================================================== +# +# PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 +# -------------------------------------------- +# +# 1. This LICENSE AGREEMENT is between the Python Software Foundation +# ("PSF"), and the Individual or Organization ("Licensee") accessing and +# otherwise using this software ("Python") in source or binary form and +# its associated documentation. +# +# 2. 
Subject to the terms and conditions of this License Agreement, PSF hereby +# grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, +# analyze, test, perform and/or display publicly, prepare derivative works, +# distribute, and otherwise use Python alone or in any derivative version, +# provided, however, that PSF's License Agreement and PSF's notice of copyright, +# i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, +# 2011, 2012, 2013 Python Software Foundation; All Rights Reserved" are retained +# in Python alone or in any derivative version prepared by Licensee. +# +# 3. In the event Licensee prepares a derivative work that is based on +# or incorporates Python or any part thereof, and wants to make +# the derivative work available to others as provided herein, then +# Licensee hereby agrees to include in any such work a brief summary of +# the changes made to Python. +# +# 4. PSF is making Python available to Licensee on an "AS IS" +# basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +# IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND +# DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT +# INFRINGE ANY THIRD PARTY RIGHTS. +# +# 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +# FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +# A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, +# OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. +# +# 6. This License Agreement will automatically terminate upon a material +# breach of its terms and conditions. +# +# 7. Nothing in this License Agreement shall be deemed to create any +# relationship of agency, partnership, or joint venture between PSF and +# Licensee. This License Agreement does not grant permission to use PSF +# trademarks or trade name in a trademark sense to endorse or promote +# products or services of Licensee, or any third party. +# +# 8. By copying, installing or otherwise using Python, Licensee +# agrees to be bound by the terms and conditions of this License +# Agreement. +# +# +# BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0 +# ------------------------------------------- +# +# BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1 +# +# 1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an +# office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the +# Individual or Organization ("Licensee") accessing and otherwise using +# this software in source or binary form and its associated +# documentation ("the Software"). +# +# 2. Subject to the terms and conditions of this BeOpen Python License +# Agreement, BeOpen hereby grants Licensee a non-exclusive, +# royalty-free, world-wide license to reproduce, analyze, test, perform +# and/or display publicly, prepare derivative works, distribute, and +# otherwise use the Software alone or in any derivative version, +# provided, however, that the BeOpen Python License is retained in the +# Software, alone or in any derivative version prepared by Licensee. +# +# 3. BeOpen is making the Software available to Licensee on an "AS IS" +# basis. BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +# IMPLIED. 
BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND +# DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT +# INFRINGE ANY THIRD PARTY RIGHTS. +# +# 4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE +# SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS +# AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY +# DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. +# +# 5. This License Agreement will automatically terminate upon a material +# breach of its terms and conditions. +# +# 6. This License Agreement shall be governed by and interpreted in all +# respects by the law of the State of California, excluding conflict of +# law provisions. Nothing in this License Agreement shall be deemed to +# create any relationship of agency, partnership, or joint venture +# between BeOpen and Licensee. This License Agreement does not grant +# permission to use BeOpen trademarks or trade names in a trademark +# sense to endorse or promote products or services of Licensee, or any +# third party. As an exception, the "BeOpen Python" logos available at +# http://www.pythonlabs.com/logos.html may be used according to the +# permissions granted on that web page. +# +# 7. By copying, installing or otherwise using the software, Licensee +# agrees to be bound by the terms and conditions of this License +# Agreement. +# +# +# CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1 +# --------------------------------------- +# +# 1. This LICENSE AGREEMENT is between the Corporation for National +# Research Initiatives, having an office at 1895 Preston White Drive, +# Reston, VA 20191 ("CNRI"), and the Individual or Organization +# ("Licensee") accessing and otherwise using Python 1.6.1 software in +# source or binary form and its associated documentation. +# +# 2. Subject to the terms and conditions of this License Agreement, CNRI +# hereby grants Licensee a nonexclusive, royalty-free, world-wide +# license to reproduce, analyze, test, perform and/or display publicly, +# prepare derivative works, distribute, and otherwise use Python 1.6.1 +# alone or in any derivative version, provided, however, that CNRI's +# License Agreement and CNRI's notice of copyright, i.e., "Copyright (c) +# 1995-2001 Corporation for National Research Initiatives; All Rights +# Reserved" are retained in Python 1.6.1 alone or in any derivative +# version prepared by Licensee. Alternately, in lieu of CNRI's License +# Agreement, Licensee may substitute the following text (omitting the +# quotes): "Python 1.6.1 is made available subject to the terms and +# conditions in CNRI's License Agreement. This Agreement together with +# Python 1.6.1 may be located on the Internet using the following +# unique, persistent identifier (known as a handle): 1895.22/1013. This +# Agreement may also be obtained from a proxy server on the Internet +# using the following URL: http://hdl.handle.net/1895.22/1013". +# +# 3. In the event Licensee prepares a derivative work that is based on +# or incorporates Python 1.6.1 or any part thereof, and wants to make +# the derivative work available to others as provided herein, then +# Licensee hereby agrees to include in any such work a brief summary of +# the changes made to Python 1.6.1. +# +# 4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS" +# basis. CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +# IMPLIED. 
BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND +# DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT +# INFRINGE ANY THIRD PARTY RIGHTS. +# +# 5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +# 1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +# A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1, +# OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. +# +# 6. This License Agreement will automatically terminate upon a material +# breach of its terms and conditions. +# +# 7. This License Agreement shall be governed by the federal +# intellectual property law of the United States, including without +# limitation the federal copyright law, and, to the extent such +# U.S. federal law does not apply, by the law of the Commonwealth of +# Virginia, excluding Virginia's conflict of law provisions. +# Notwithstanding the foregoing, with regard to derivative works based +# on Python 1.6.1 that incorporate non-separable material that was +# previously distributed under the GNU General Public License (GPL), the +# law of the Commonwealth of Virginia shall govern this License +# Agreement only as to issues arising under or with respect to +# Paragraphs 4, 5, and 7 of this License Agreement. Nothing in this +# License Agreement shall be deemed to create any relationship of +# agency, partnership, or joint venture between CNRI and Licensee. This +# License Agreement does not grant permission to use CNRI trademarks or +# trade name in a trademark sense to endorse or promote products or +# services of Licensee, or any third party. +# +# 8. By clicking on the "ACCEPT" button where indicated, or by copying, +# installing or otherwise using Python 1.6.1, Licensee agrees to be +# bound by the terms and conditions of this License Agreement. +# +# ACCEPT +# +# +# CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2 +# -------------------------------------------------- +# +# Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam, +# The Netherlands. All rights reserved. +# +# Permission to use, copy, modify, and distribute this software and its +# documentation for any purpose and without fee is hereby granted, +# provided that the above copyright notice appear in all copies and that +# both that copyright notice and this permission notice appear in +# supporting documentation, and that the name of Stichting Mathematisch +# Centrum or CWI not be used in advertising or publicity pertaining to +# distribution of the software without specific, written prior +# permission. +# +# STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO +# THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE +# FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. \ No newline at end of file diff --git a/licenses-binary/LICENSE-janino.txt b/licenses-binary/LICENSE-janino.txt new file mode 100644 index 0000000000000..d1e1f237c4641 --- /dev/null +++ b/licenses-binary/LICENSE-janino.txt @@ -0,0 +1,31 @@ +Janino - An embedded Java[TM] compiler + +Copyright (c) 2001-2016, Arno Unkrig +Copyright (c) 2015-2016 TIBCO Software Inc. 
+All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + 3. Neither the name of JANINO nor the names of its contributors + may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses-binary/LICENSE-javassist.html b/licenses-binary/LICENSE-javassist.html new file mode 100644 index 0000000000000..5abd563a0c4d9 --- /dev/null +++ b/licenses-binary/LICENSE-javassist.html @@ -0,0 +1,373 @@ + + + Javassist License + + + + +
MOZILLA PUBLIC LICENSE
Version 1.1

1. Definitions. +

    1.0.1. "Commercial Use" means distribution or otherwise making the + Covered Code available to a third party. +

    1.1. ''Contributor'' means each entity that creates or contributes + to the creation of Modifications. +

    1.2. ''Contributor Version'' means the combination of the Original + Code, prior Modifications used by a Contributor, and the Modifications made by + that particular Contributor. +

    1.3. ''Covered Code'' means the Original Code or Modifications or + the combination of the Original Code and Modifications, in each case including + portions thereof. +

    1.4. ''Electronic Distribution Mechanism'' means a mechanism + generally accepted in the software development community for the electronic + transfer of data. +

    1.5. ''Executable'' means Covered Code in any form other than Source + Code. +

    1.6. ''Initial Developer'' means the individual or entity identified + as the Initial Developer in the Source Code notice required by Exhibit + A. +

    1.7. ''Larger Work'' means a work which combines Covered Code or + portions thereof with code not governed by the terms of this License. +

    1.8. ''License'' means this document. +

    1.8.1. "Licensable" means having the right to grant, to the maximum + extent possible, whether at the time of the initial grant or subsequently + acquired, any and all of the rights conveyed herein. +

    1.9. ''Modifications'' means any addition to or deletion from the + substance or structure of either the Original Code or any previous + Modifications. When Covered Code is released as a series of files, a + Modification is: +

      A. Any addition to or deletion from the contents of a file + containing Original Code or previous Modifications. +

      B. Any new file that contains any part of the Original Code or + previous Modifications.
       

    1.10. ''Original Code'' +means Source Code of computer software code which is described in the Source +Code notice required by Exhibit A as Original Code, and which, at the +time of its release under this License is not already Covered Code governed by +this License. +

    1.10.1. "Patent Claims" means any patent claim(s), now owned or + hereafter acquired, including without limitation,  method, process, and + apparatus claims, in any patent Licensable by grantor. +

    1.11. ''Source Code'' means the preferred form of the Covered Code + for making modifications to it, including all modules it contains, plus any + associated interface definition files, scripts used to control compilation and + installation of an Executable, or source code differential comparisons against + either the Original Code or another well known, available Covered Code of the + Contributor's choice. The Source Code can be in a compressed or archival form, + provided the appropriate decompression or de-archiving software is widely + available for no charge. +

    1.12. "You'' (or "Your")  means an individual or a legal entity + exercising rights under, and complying with all of the terms of, this License + or a future version of this License issued under Section 6.1. For legal + entities, "You'' includes any entity which controls, is controlled by, or is + under common control with You. For purposes of this definition, "control'' + means (a) the power, direct or indirect, to cause the direction or management + of such entity, whether by contract or otherwise, or (b) ownership of more + than fifty percent (50%) of the outstanding shares or beneficial ownership of + such entity.

2. Source Code License. +
    2.1. The Initial Developer Grant.
    The Initial Developer hereby + grants You a world-wide, royalty-free, non-exclusive license, subject to third + party intellectual property claims: +
      (a)  under intellectual property rights (other than + patent or trademark) Licensable by Initial Developer to use, reproduce, + modify, display, perform, sublicense and distribute the Original Code (or + portions thereof) with or without Modifications, and/or as part of a Larger + Work; and +

      (b) under Patents Claims infringed by the making, using or selling + of Original Code, to make, have made, use, practice, sell, and offer for + sale, and/or otherwise dispose of the Original Code (or portions thereof). +

        +
        (c) the licenses granted in this Section 2.1(a) and (b) + are effective on the date Initial Developer first distributes Original Code + under the terms of this License. +

        (d) Notwithstanding Section 2.1(b) above, no patent license is + granted: 1) for code that You delete from the Original Code; 2) separate + from the Original Code;  or 3) for infringements caused by: i) the + modification of the Original Code or ii) the combination of the Original + Code with other software or devices.
         

      2.2. Contributor + Grant.
      Subject to third party intellectual property claims, each + Contributor hereby grants You a world-wide, royalty-free, non-exclusive + license +

        (a)  under intellectual property rights (other + than patent or trademark) Licensable by Contributor, to use, reproduce, + modify, display, perform, sublicense and distribute the Modifications + created by such Contributor (or portions thereof) either on an unmodified + basis, with other Modifications, as Covered Code and/or as part of a Larger + Work; and +

        (b) under Patent Claims infringed by the making, using, or selling + of  Modifications made by that Contributor either alone and/or in combination with its Contributor Version (or portions of such + combination), to make, use, sell, offer for sale, have made, and/or + otherwise dispose of: 1) Modifications made by that Contributor (or portions + thereof); and 2) the combination of  Modifications made by that + Contributor with its Contributor Version (or portions of such + combination). +

        (c) the licenses granted in Sections 2.2(a) and 2.2(b) are + effective on the date Contributor first makes Commercial Use of the Covered + Code. +

        (d)    Notwithstanding Section 2.2(b) above, no + patent license is granted: 1) for any code that Contributor has deleted from + the Contributor Version; 2)  separate from the Contributor + Version;  3)  for infringements caused by: i) third party + modifications of Contributor Version or ii)  the combination of + Modifications made by that Contributor with other software  (except as + part of the Contributor Version) or other devices; or 4) under Patent Claims + infringed by Covered Code in the absence of Modifications made by that + Contributor.

    3. Distribution Obligations. +

      3.1. Application of License.
      The Modifications which You create + or to which You contribute are governed by the terms of this License, + including without limitation Section 2.2. The Source Code version of + Covered Code may be distributed only under the terms of this License or a + future version of this License released under Section 6.1, and You must + include a copy of this License with every copy of the Source Code You + distribute. You may not offer or impose any terms on any Source Code version + that alters or restricts the applicable version of this License or the + recipients' rights hereunder. However, You may include an additional document + offering the additional rights described in Section 3.5. +

      3.2. Availability of Source Code.
      Any Modification which You + create or to which You contribute must be made available in Source Code form + under the terms of this License either on the same media as an Executable + version or via an accepted Electronic Distribution Mechanism to anyone to whom + you made an Executable version available; and if made available via Electronic + Distribution Mechanism, must remain available for at least twelve (12) months + after the date it initially became available, or at least six (6) months after + a subsequent version of that particular Modification has been made available + to such recipients. You are responsible for ensuring that the Source Code + version remains available even if the Electronic Distribution Mechanism is + maintained by a third party. +

      3.3. Description of Modifications.
      You must cause all Covered + Code to which You contribute to contain a file documenting the changes You + made to create that Covered Code and the date of any change. You must include + a prominent statement that the Modification is derived, directly or + indirectly, from Original Code provided by the Initial Developer and including + the name of the Initial Developer in (a) the Source Code, and (b) in any + notice in an Executable version or related documentation in which You describe + the origin or ownership of the Covered Code. +

      3.4. Intellectual Property Matters +

        (a) Third Party Claims.
        If Contributor has knowledge that a + license under a third party's intellectual property rights is required to + exercise the rights granted by such Contributor under Sections 2.1 or 2.2, + Contributor must include a text file with the Source Code distribution + titled "LEGAL'' which describes the claim and the party making the claim in + sufficient detail that a recipient will know whom to contact. If Contributor + obtains such knowledge after the Modification is made available as described + in Section 3.2, Contributor shall promptly modify the LEGAL file in all + copies Contributor makes available thereafter and shall take other steps + (such as notifying appropriate mailing lists or newsgroups) reasonably + calculated to inform those who received the Covered Code that new knowledge + has been obtained. +

        (b) Contributor APIs.
        If Contributor's Modifications include + an application programming interface and Contributor has knowledge of patent + licenses which are reasonably necessary to implement that API, Contributor + must also include this information in the LEGAL file. +
         

                +(c)    Representations. +
        Contributor represents that, except as disclosed pursuant to Section + 3.4(a) above, Contributor believes that Contributor's Modifications are + Contributor's original creation(s) and/or Contributor has sufficient rights + to grant the rights conveyed by this License.
      +


      3.5. Required Notices.
      You must duplicate the notice in + Exhibit A in each file of the Source Code.  If it is not possible + to put such notice in a particular Source Code file due to its structure, then + You must include such notice in a location (such as a relevant directory) + where a user would be likely to look for such a notice.  If You created + one or more Modification(s) You may add your name as a Contributor to the + notice described in Exhibit A.  You must also duplicate this + License in any documentation for the Source Code where You describe + recipients' rights or ownership rights relating to Covered Code.  You may + choose to offer, and to charge a fee for, warranty, support, indemnity or + liability obligations to one or more recipients of Covered Code. However, You + may do so only on Your own behalf, and not on behalf of the Initial Developer + or any Contributor. You must make it absolutely clear than any such warranty, + support, indemnity or liability obligation is offered by You alone, and You + hereby agree to indemnify the Initial Developer and every Contributor for any + liability incurred by the Initial Developer or such Contributor as a result of + warranty, support, indemnity or liability terms You offer. +

      3.6. Distribution of Executable Versions.
      You may distribute + Covered Code in Executable form only if the requirements of Section + 3.1-3.5 have been met for that Covered Code, and if You include a + notice stating that the Source Code version of the Covered Code is available + under the terms of this License, including a description of how and where You + have fulfilled the obligations of Section 3.2. The notice must be + conspicuously included in any notice in an Executable version, related + documentation or collateral in which You describe recipients' rights relating + to the Covered Code. You may distribute the Executable version of Covered Code + or ownership rights under a license of Your choice, which may contain terms + different from this License, provided that You are in compliance with the + terms of this License and that the license for the Executable version does not + attempt to limit or alter the recipient's rights in the Source Code version + from the rights set forth in this License. If You distribute the Executable + version under a different license You must make it absolutely clear that any + terms which differ from this License are offered by You alone, not by the + Initial Developer or any Contributor. You hereby agree to indemnify the + Initial Developer and every Contributor for any liability incurred by the + Initial Developer or such Contributor as a result of any such terms You offer. + +

      3.7. Larger Works.
      You may create a Larger Work by combining + Covered Code with other code not governed by the terms of this License and + distribute the Larger Work as a single product. In such a case, You must make + sure the requirements of this License are fulfilled for the Covered + Code.

    4. Inability to Comply Due to Statute or Regulation. +
      If it is impossible for You to comply with any of the terms of this + License with respect to some or all of the Covered Code due to statute, + judicial order, or regulation then You must: (a) comply with the terms of this + License to the maximum extent possible; and (b) describe the limitations and + the code they affect. Such description must be included in the LEGAL file + described in Section 3.4 and must be included with all distributions of + the Source Code. Except to the extent prohibited by statute or regulation, + such description must be sufficiently detailed for a recipient of ordinary + skill to be able to understand it.
    5. Application of this License. +
      This License applies to code to which the Initial Developer has attached + the notice in Exhibit A and to related Covered Code.
    6. Versions + of the License. +
      6.1. New Versions.
      Netscape Communications Corporation + (''Netscape'') may publish revised and/or new versions of the License from + time to time. Each version will be given a distinguishing version number. +

      6.2. Effect of New Versions.
      Once Covered Code has been + published under a particular version of the License, You may always continue + to use it under the terms of that version. You may also choose to use such + Covered Code under the terms of any subsequent version of the License + published by Netscape. No one other than Netscape has the right to modify the + terms applicable to Covered Code created under this License. +

      6.3. Derivative Works.
      If You create or use a modified version + of this License (which you may only do in order to apply it to code which is + not already Covered Code governed by this License), You must (a) rename Your + license so that the phrases ''Mozilla'', ''MOZILLAPL'', ''MOZPL'', + ''Netscape'', "MPL", ''NPL'' or any confusingly similar phrase do not appear + in your license (except to note that your license differs from this License) + and (b) otherwise make it clear that Your version of the license contains + terms which differ from the Mozilla Public License and Netscape Public + License. (Filling in the name of the Initial Developer, Original Code or + Contributor in the notice described in Exhibit A shall not of + themselves be deemed to be modifications of this License.)

    7. + DISCLAIMER OF WARRANTY. +
      COVERED CODE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS'' BASIS, WITHOUT + WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, WITHOUT + LIMITATION, WARRANTIES THAT THE COVERED CODE IS FREE OF DEFECTS, MERCHANTABLE, + FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING. THE ENTIRE RISK AS TO THE + QUALITY AND PERFORMANCE OF THE COVERED CODE IS WITH YOU. SHOULD ANY COVERED + CODE PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE INITIAL DEVELOPER OR ANY + OTHER CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY SERVICING, REPAIR OR + CORRECTION. THIS DISCLAIMER OF WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS + LICENSE. NO USE OF ANY COVERED CODE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS + DISCLAIMER.
    8. TERMINATION. +
      8.1.  This License and the rights granted hereunder will + terminate automatically if You fail to comply with terms herein and fail to + cure such breach within 30 days of becoming aware of the breach. All + sublicenses to the Covered Code which are properly granted shall survive any + termination of this License. Provisions which, by their nature, must remain in + effect beyond the termination of this License shall survive. +

      8.2.  If You initiate litigation by asserting a patent + infringement claim (excluding declatory judgment actions) against Initial + Developer or a Contributor (the Initial Developer or Contributor against whom + You file such action is referred to as "Participant")  alleging that: +

      (a)  such Participant's Contributor Version directly or + indirectly infringes any patent, then any and all rights granted by such + Participant to You under Sections 2.1 and/or 2.2 of this License shall, upon + 60 days notice from Participant terminate prospectively, unless if within 60 + days after receipt of notice You either: (i)  agree in writing to pay + Participant a mutually agreeable reasonable royalty for Your past and future + use of Modifications made by such Participant, or (ii) withdraw Your + litigation claim with respect to the Contributor Version against such + Participant.  If within 60 days of notice, a reasonable royalty and + payment arrangement are not mutually agreed upon in writing by the parties or + the litigation claim is not withdrawn, the rights granted by Participant to + You under Sections 2.1 and/or 2.2 automatically terminate at the expiration of + the 60 day notice period specified above. +

      (b)  any software, hardware, or device, other than such + Participant's Contributor Version, directly or indirectly infringes any + patent, then any rights granted to You by such Participant under Sections + 2.1(b) and 2.2(b) are revoked effective as of the date You first made, used, + sold, distributed, or had made, Modifications made by that Participant. +

      8.3.  If You assert a patent infringement claim against + Participant alleging that such Participant's Contributor Version directly or + indirectly infringes any patent where such claim is resolved (such as by + license or settlement) prior to the initiation of patent infringement + litigation, then the reasonable value of the licenses granted by such + Participant under Sections 2.1 or 2.2 shall be taken into account in + determining the amount or value of any payment or license. +

      8.4.  In the event of termination under Sections 8.1 or 8.2 + above,  all end user license agreements (excluding distributors and + resellers) which have been validly granted by You or any distributor hereunder + prior to termination shall survive termination.

    9. LIMITATION OF + LIABILITY. +
      UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT (INCLUDING + NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE INITIAL DEVELOPER, ANY + OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED CODE, OR ANY SUPPLIER OF ANY + OF SUCH PARTIES, BE LIABLE TO ANY PERSON FOR ANY INDIRECT, SPECIAL, + INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT + LIMITATION, DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR + MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH + PARTY SHALL HAVE BEEN INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS + LIMITATION OF LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL + INJURY RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW + PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OR + LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THIS EXCLUSION AND + LIMITATION MAY NOT APPLY TO YOU.
    10. U.S. GOVERNMENT END USERS. +
      The Covered Code is a ''commercial item,'' as that term is defined in 48 + C.F.R. 2.101 (Oct. 1995), consisting of ''commercial computer software'' and + ''commercial computer software documentation,'' as such terms are used in 48 + C.F.R. 12.212 (Sept. 1995). Consistent with 48 C.F.R. 12.212 and 48 C.F.R. + 227.7202-1 through 227.7202-4 (June 1995), all U.S. Government End Users + acquire Covered Code with only those rights set forth herein.
    11. + MISCELLANEOUS. +
      This License represents the complete agreement concerning subject matter + hereof. If any provision of this License is held to be unenforceable, such + provision shall be reformed only to the extent necessary to make it + enforceable. This License shall be governed by California law provisions + (except to the extent applicable law, if any, provides otherwise), excluding + its conflict-of-law provisions. With respect to disputes in which at least one + party is a citizen of, or an entity chartered or registered to do business in + the United States of America, any litigation relating to this License shall be + subject to the jurisdiction of the Federal Courts of the Northern District of + California, with venue lying in Santa Clara County, California, with the + losing party responsible for costs, including without limitation, court costs + and reasonable attorneys' fees and expenses. The application of the United + Nations Convention on Contracts for the International Sale of Goods is + expressly excluded. Any law or regulation which provides that the language of + a contract shall be construed against the drafter shall not apply to this + License.
    12. RESPONSIBILITY FOR CLAIMS. +
      As between Initial Developer and the Contributors, each party is + responsible for claims and damages arising, directly or indirectly, out of its + utilization of rights under this License and You agree to work with Initial + Developer and Contributors to distribute such responsibility on an equitable + basis. Nothing herein is intended or shall be deemed to constitute any + admission of liability.
    13. MULTIPLE-LICENSED CODE. +
      Initial Developer may designate portions of the Covered Code as + "Multiple-Licensed".  "Multiple-Licensed" means that the Initial + Developer permits you to utilize portions of the Covered Code under Your + choice of the MPL or the alternative licenses, if any, specified by the + Initial Developer in the file described in Exhibit A.
    +


    EXHIBIT A -Mozilla Public License. +

      The contents of this file are subject to the Mozilla Public License + Version 1.1 (the "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at +
      http://www.mozilla.org/MPL/ +

      Software distributed under the License is distributed on an "AS IS" basis, + WITHOUT WARRANTY OF
      ANY KIND, either express or implied. See the License + for the specific language governing rights and
      limitations under the + License. +

      The Original Code is Javassist. +

      The Initial Developer of the Original Code is Shigeru Chiba. + Portions created by the Initial Developer are
        + Copyright (C) 1999- Shigeru Chiba. All Rights Reserved. +

      Contributor(s): __Bill Burke, Jason T. Greene______________. + +

      Alternatively, the contents of this software may be used under the + terms of the GNU Lesser General Public License Version 2.1 or later + (the "LGPL"), or the Apache License Version 2.0 (the "AL"), + in which case the provisions of the LGPL or the AL are applicable + instead of those above. If you wish to allow use of your version of + this software only under the terms of either the LGPL or the AL, and not to allow others to + use your version of this software under the terms of the MPL, indicate + your decision by deleting the provisions above and replace them with + the notice and other provisions required by the LGPL or the AL. If you do not + delete the provisions above, a recipient may use your version of this + software under the terms of any one of the MPL, the LGPL or the AL. + +

    + + \ No newline at end of file diff --git a/licenses/LICENSE-javolution.txt b/licenses-binary/LICENSE-javolution.txt similarity index 100% rename from licenses/LICENSE-javolution.txt rename to licenses-binary/LICENSE-javolution.txt diff --git a/licenses/LICENSE-jline.txt b/licenses-binary/LICENSE-jline.txt similarity index 100% rename from licenses/LICENSE-jline.txt rename to licenses-binary/LICENSE-jline.txt diff --git a/licenses/LICENSE-junit-interface.txt b/licenses-binary/LICENSE-jodd.txt similarity index 67% rename from licenses/LICENSE-junit-interface.txt rename to licenses-binary/LICENSE-jodd.txt index e835350c4e2a4..cc6b458adb386 100644 --- a/licenses/LICENSE-junit-interface.txt +++ b/licenses-binary/LICENSE-jodd.txt @@ -1,15 +1,15 @@ -Copyright (c) 2009-2012, Stefan Zeiger +Copyright (c) 2003-present, Jodd Team (https://jodd.org) All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/licenses/LICENSE-DPark.txt b/licenses-binary/LICENSE-join.txt similarity index 100% rename from licenses/LICENSE-DPark.txt rename to licenses-binary/LICENSE-join.txt diff --git a/licenses-binary/LICENSE-jquery.txt b/licenses-binary/LICENSE-jquery.txt new file mode 100644 index 0000000000000..45930542204fb --- /dev/null +++ b/licenses-binary/LICENSE-jquery.txt @@ -0,0 +1,20 @@ +Copyright JS Foundation and other contributors, https://js.foundation/ + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
\ No newline at end of file
diff --git a/licenses-binary/LICENSE-json-formatter.txt b/licenses-binary/LICENSE-json-formatter.txt
new file mode 100644
index 0000000000000..5193348fce126
--- /dev/null
+++ b/licenses-binary/LICENSE-json-formatter.txt
@@ -0,0 +1,6 @@
+Copyright 2014 Mohsen Azimi
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
\ No newline at end of file
diff --git a/licenses-binary/LICENSE-jtransforms.html b/licenses-binary/LICENSE-jtransforms.html
new file mode 100644
index 0000000000000..351c17412357b
--- /dev/null
+++ b/licenses-binary/LICENSE-jtransforms.html
@@ -0,0 +1,388 @@
+
+
+Mozilla Public License version 1.1
+
+
+
+
+

    Mozilla Public License Version 1.1

    +

    1. Definitions.

    +
    +
    1.0.1. "Commercial Use" +
    means distribution or otherwise making the Covered Code available to a third party. +
    1.1. "Contributor" +
    means each entity that creates or contributes to the creation of Modifications. +
    1.2. "Contributor Version" +
    means the combination of the Original Code, prior Modifications used by a Contributor, + and the Modifications made by that particular Contributor. +
    1.3. "Covered Code" +
    means the Original Code or Modifications or the combination of the Original Code and + Modifications, in each case including portions thereof. +
    1.4. "Electronic Distribution Mechanism" +
    means a mechanism generally accepted in the software development community for the + electronic transfer of data. +
    1.5. "Executable" +
    means Covered Code in any form other than Source Code. +
    1.6. "Initial Developer" +
    means the individual or entity identified as the Initial Developer in the Source Code + notice required by Exhibit A. +
    1.7. "Larger Work" +
    means a work which combines Covered Code or portions thereof with code not governed + by the terms of this License. +
    1.8. "License" +
    means this document. +
    1.8.1. "Licensable" +
    means having the right to grant, to the maximum extent possible, whether at the + time of the initial grant or subsequently acquired, any and all of the rights + conveyed herein. +
    1.9. "Modifications" +
    +

    means any addition to or deletion from the substance or structure of either the + Original Code or any previous Modifications. When Covered Code is released as a + series of files, a Modification is: +

      +
    1. Any addition to or deletion from the contents of a file + containing Original Code or previous Modifications. +
    2. Any new file that contains any part of the Original Code or + previous Modifications. +
    +
    1.10. "Original Code" +
    means Source Code of computer software code which is described in the Source Code + notice required by Exhibit A as Original Code, and which, + at the time of its release under this License is not already Covered Code governed + by this License. +
    1.10.1. "Patent Claims" +
    means any patent claim(s), now owned or hereafter acquired, including without + limitation, method, process, and apparatus claims, in any patent Licensable by + grantor. +
    1.11. "Source Code" +
    means the preferred form of the Covered Code for making modifications to it, + including all modules it contains, plus any associated interface definition files, + scripts used to control compilation and installation of an Executable, or source + code differential comparisons against either the Original Code or another well known, + available Covered Code of the Contributor's choice. The Source Code can be in a + compressed or archival form, provided the appropriate decompression or de-archiving + software is widely available for no charge. +
    1.12. "You" (or "Your") +
    means an individual or a legal entity exercising rights under, and complying with + all of the terms of, this License or a future version of this License issued under + Section 6.1. For legal entities, "You" includes any entity + which controls, is controlled by, or is under common control with You. For purposes of + this definition, "control" means (a) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or otherwise, or (b) + ownership of more than fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. +
    +

    2. Source Code License.

    +

    2.1. The Initial Developer Grant.

    +

    The Initial Developer hereby grants You a world-wide, royalty-free, non-exclusive + license, subject to third party intellectual property claims: +

      +
    1. under intellectual property rights (other than patent or + trademark) Licensable by Initial Developer to use, reproduce, modify, display, perform, + sublicense and distribute the Original Code (or portions thereof) with or without + Modifications, and/or as part of a Larger Work; and +
    2. under Patents Claims infringed by the making, using or selling + of Original Code, to make, have made, use, practice, sell, and offer for sale, and/or + otherwise dispose of the Original Code (or portions thereof). +
    3. the licenses granted in this Section 2.1 + (a) and (b) are effective on + the date Initial Developer first distributes Original Code under the terms of this + License. +
    4. Notwithstanding Section 2.1 (b) + above, no patent license is granted: 1) for code that You delete from the Original Code; + 2) separate from the Original Code; or 3) for infringements caused by: i) the + modification of the Original Code or ii) the combination of the Original Code with other + software or devices. +
    +

    2.2. Contributor Grant.

    +

    Subject to third party intellectual property claims, each Contributor hereby grants You + a world-wide, royalty-free, non-exclusive license +

      +
    1. under intellectual property rights (other than patent or trademark) + Licensable by Contributor, to use, reproduce, modify, display, perform, sublicense and + distribute the Modifications created by such Contributor (or portions thereof) either on + an unmodified basis, with other Modifications, as Covered Code and/or as part of a Larger + Work; and +
    2. under Patent Claims infringed by the making, using, or selling of + Modifications made by that Contributor either alone and/or in combination with its + Contributor Version (or portions of such combination), to make, use, sell, offer for + sale, have made, and/or otherwise dispose of: 1) Modifications made by that Contributor + (or portions thereof); and 2) the combination of Modifications made by that Contributor + with its Contributor Version (or portions of such combination). +
    3. the licenses granted in Sections 2.2 + (a) and 2.2 (b) are effective + on the date Contributor first makes Commercial Use of the Covered Code. +
    4. Notwithstanding Section 2.2 (b) + above, no patent license is granted: 1) for any code that Contributor has deleted from + the Contributor Version; 2) separate from the Contributor Version; 3) for infringements + caused by: i) third party modifications of Contributor Version or ii) the combination of + Modifications made by that Contributor with other software (except as part of the + Contributor Version) or other devices; or 4) under Patent Claims infringed by Covered Code + in the absence of Modifications made by that Contributor. +
    +

    3. Distribution Obligations.

    +

    3.1. Application of License.

    +

    The Modifications which You create or to which You contribute are governed by the terms + of this License, including without limitation Section 2.2. The + Source Code version of Covered Code may be distributed only under the terms of this License + or a future version of this License released under Section 6.1, + and You must include a copy of this License with every copy of the Source Code You + distribute. You may not offer or impose any terms on any Source Code version that alters or + restricts the applicable version of this License or the recipients' rights hereunder. + However, You may include an additional document offering the additional rights described in + Section 3.5. +

    3.2. Availability of Source Code.

    +

    Any Modification which You create or to which You contribute must be made available in + Source Code form under the terms of this License either on the same media as an Executable + version or via an accepted Electronic Distribution Mechanism to anyone to whom you made an + Executable version available; and if made available via Electronic Distribution Mechanism, + must remain available for at least twelve (12) months after the date it initially became + available, or at least six (6) months after a subsequent version of that particular + Modification has been made available to such recipients. You are responsible for ensuring + that the Source Code version remains available even if the Electronic Distribution + Mechanism is maintained by a third party. +

    3.3. Description of Modifications.

    +

    You must cause all Covered Code to which You contribute to contain a file documenting the + changes You made to create that Covered Code and the date of any change. You must include a + prominent statement that the Modification is derived, directly or indirectly, from Original + Code provided by the Initial Developer and including the name of the Initial Developer in + (a) the Source Code, and (b) in any notice in an Executable version or related documentation + in which You describe the origin or ownership of the Covered Code. +

    3.4. Intellectual Property Matters

    +

    (a) Third Party Claims

    +

    If Contributor has knowledge that a license under a third party's intellectual property + rights is required to exercise the rights granted by such Contributor under Sections + 2.1 or 2.2, Contributor must include a + text file with the Source Code distribution titled "LEGAL" which describes the claim and the + party making the claim in sufficient detail that a recipient will know whom to contact. If + Contributor obtains such knowledge after the Modification is made available as described in + Section 3.2, Contributor shall promptly modify the LEGAL file in + all copies Contributor makes available thereafter and shall take other steps (such as + notifying appropriate mailing lists or newsgroups) reasonably calculated to inform those who + received the Covered Code that new knowledge has been obtained. +

    (b) Contributor APIs

    +

    If Contributor's Modifications include an application programming interface and Contributor + has knowledge of patent licenses which are reasonably necessary to implement that + API, Contributor must also include this information in the + legal file. +

    (c) Representations.

    +

    Contributor represents that, except as disclosed pursuant to Section 3.4 + (a) above, Contributor believes that Contributor's Modifications + are Contributor's original creation(s) and/or Contributor has sufficient rights to grant the + rights conveyed by this License. +

    3.5. Required Notices.

    +

    You must duplicate the notice in Exhibit A in each file of the + Source Code. If it is not possible to put such notice in a particular Source Code file due to + its structure, then You must include such notice in a location (such as a relevant directory) + where a user would be likely to look for such a notice. If You created one or more + Modification(s) You may add your name as a Contributor to the notice described in + Exhibit A. You must also duplicate this License in any documentation + for the Source Code where You describe recipients' rights or ownership rights relating to + Covered Code. You may choose to offer, and to charge a fee for, warranty, support, indemnity + or liability obligations to one or more recipients of Covered Code. However, You may do so + only on Your own behalf, and not on behalf of the Initial Developer or any Contributor. You + must make it absolutely clear than any such warranty, support, indemnity or liability + obligation is offered by You alone, and You hereby agree to indemnify the Initial Developer + and every Contributor for any liability incurred by the Initial Developer or such Contributor + as a result of warranty, support, indemnity or liability terms You offer. +

    3.6. Distribution of Executable Versions.

    +

    You may distribute Covered Code in Executable form only if the requirements of Sections + 3.1, 3.2, + 3.3, 3.4 and + 3.5 have been met for that Covered Code, and if You include a + notice stating that the Source Code version of the Covered Code is available under the terms + of this License, including a description of how and where You have fulfilled the obligations + of Section 3.2. The notice must be conspicuously included in any + notice in an Executable version, related documentation or collateral in which You describe + recipients' rights relating to the Covered Code. You may distribute the Executable version of + Covered Code or ownership rights under a license of Your choice, which may contain terms + different from this License, provided that You are in compliance with the terms of this + License and that the license for the Executable version does not attempt to limit or alter the + recipient's rights in the Source Code version from the rights set forth in this License. If + You distribute the Executable version under a different license You must make it absolutely + clear that any terms which differ from this License are offered by You alone, not by the + Initial Developer or any Contributor. You hereby agree to indemnify the Initial Developer and + every Contributor for any liability incurred by the Initial Developer or such Contributor as + a result of any such terms You offer. +

    3.7. Larger Works.

    +

    You may create a Larger Work by combining Covered Code with other code not governed by the + terms of this License and distribute the Larger Work as a single product. In such a case, + You must make sure the requirements of this License are fulfilled for the Covered Code. +

    4. Inability to Comply Due to Statute or Regulation.

    +

    If it is impossible for You to comply with any of the terms of this License with respect to + some or all of the Covered Code due to statute, judicial order, or regulation then You must: + (a) comply with the terms of this License to the maximum extent possible; and (b) describe + the limitations and the code they affect. Such description must be included in the + legal file described in Section + 3.4 and must be included with all distributions of the Source Code. + Except to the extent prohibited by statute or regulation, such description must be + sufficiently detailed for a recipient of ordinary skill to be able to understand it. +

    5. Application of this License.

    +

    This License applies to code to which the Initial Developer has attached the notice in + Exhibit A and to related Covered Code. +

    6. Versions of the License.

    +

    6.1. New Versions

    +

    Netscape Communications Corporation ("Netscape") may publish revised and/or new versions + of the License from time to time. Each version will be given a distinguishing version number. +

    6.2. Effect of New Versions

    +

    Once Covered Code has been published under a particular version of the License, You may + always continue to use it under the terms of that version. You may also choose to use such + Covered Code under the terms of any subsequent version of the License published by Netscape. + No one other than Netscape has the right to modify the terms applicable to Covered Code + created under this License. +

    6.3. Derivative Works

    +

    If You create or use a modified version of this License (which you may only do in order to + apply it to code which is not already Covered Code governed by this License), You must (a) + rename Your license so that the phrases "Mozilla", "MOZILLAPL", "MOZPL", "Netscape", "MPL", + "NPL" or any confusingly similar phrase do not appear in your license (except to note that + your license differs from this License) and (b) otherwise make it clear that Your version of + the license contains terms which differ from the Mozilla Public License and Netscape Public + License. (Filling in the name of the Initial Developer, Original Code or Contributor in the + notice described in Exhibit A shall not of themselves be deemed to + be modifications of this License.) +

    7. Disclaimer of warranty

    +

    Covered code is provided under this license on an "as is" + basis, without warranty of any kind, either expressed or implied, including, without + limitation, warranties that the covered code is free of defects, merchantable, fit for a + particular purpose or non-infringing. The entire risk as to the quality and performance of + the covered code is with you. Should any covered code prove defective in any respect, you + (not the initial developer or any other contributor) assume the cost of any necessary + servicing, repair or correction. This disclaimer of warranty constitutes an essential part + of this license. No use of any covered code is authorized hereunder except under this + disclaimer. +

    8. Termination

    +

    8.1. This License and the rights granted hereunder will terminate + automatically if You fail to comply with terms herein and fail to cure such breach + within 30 days of becoming aware of the breach. All sublicenses to the Covered Code which + are properly granted shall survive any termination of this License. Provisions which, by + their nature, must remain in effect beyond the termination of this License shall survive. +

    8.2. If You initiate litigation by asserting a patent infringement + claim (excluding declatory judgment actions) against Initial Developer or a Contributor + (the Initial Developer or Contributor against whom You file such action is referred to + as "Participant") alleging that: +

      +
    1. such Participant's Contributor Version directly or indirectly + infringes any patent, then any and all rights granted by such Participant to You under + Sections 2.1 and/or 2.2 of this + License shall, upon 60 days notice from Participant terminate prospectively, unless if + within 60 days after receipt of notice You either: (i) agree in writing to pay + Participant a mutually agreeable reasonable royalty for Your past and future use of + Modifications made by such Participant, or (ii) withdraw Your litigation claim with + respect to the Contributor Version against such Participant. If within 60 days of + notice, a reasonable royalty and payment arrangement are not mutually agreed upon in + writing by the parties or the litigation claim is not withdrawn, the rights granted by + Participant to You under Sections 2.1 and/or + 2.2 automatically terminate at the expiration of the 60 day + notice period specified above. +
    2. any software, hardware, or device, other than such Participant's + Contributor Version, directly or indirectly infringes any patent, then any rights + granted to You by such Participant under Sections 2.1(b) + and 2.2(b) are revoked effective as of the date You first + made, used, sold, distributed, or had made, Modifications made by that Participant. +
    +

    8.3. If You assert a patent infringement claim against Participant + alleging that such Participant's Contributor Version directly or indirectly infringes + any patent where such claim is resolved (such as by license or settlement) prior to the + initiation of patent infringement litigation, then the reasonable value of the licenses + granted by such Participant under Sections 2.1 or + 2.2 shall be taken into account in determining the amount or + value of any payment or license. +

    8.4. In the event of termination under Sections + 8.1 or 8.2 above, all end user + license agreements (excluding distributors and resellers) which have been validly + granted by You or any distributor hereunder prior to termination shall survive + termination. +

    9. Limitation of liability

    +

    Under no circumstances and under no legal theory, whether + tort (including negligence), contract, or otherwise, shall you, the initial developer, + any other contributor, or any distributor of covered code, or any supplier of any of + such parties, be liable to any person for any indirect, special, incidental, or + consequential damages of any character including, without limitation, damages for loss + of goodwill, work stoppage, computer failure or malfunction, or any and all other + commercial damages or losses, even if such party shall have been informed of the + possibility of such damages. This limitation of liability shall not apply to liability + for death or personal injury resulting from such party's negligence to the extent + applicable law prohibits such limitation. Some jurisdictions do not allow the exclusion + or limitation of incidental or consequential damages, so this exclusion and limitation + may not apply to you. +

    10. U.S. government end users

    +

    The Covered Code is a "commercial item," as that term is defined in 48 + C.F.R. 2.101 (Oct. 1995), consisting of + "commercial computer software" and "commercial computer software documentation," as such + terms are used in 48 C.F.R. 12.212 (Sept. + 1995). Consistent with 48 C.F.R. 12.212 and 48 C.F.R. + 227.7202-1 through 227.7202-4 (June 1995), all U.S. Government End Users + acquire Covered Code with only those rights set forth herein. +

    11. Miscellaneous

    +

    This License represents the complete agreement concerning subject matter hereof. If + any provision of this License is held to be unenforceable, such provision shall be + reformed only to the extent necessary to make it enforceable. This License shall be + governed by California law provisions (except to the extent applicable law, if any, + provides otherwise), excluding its conflict-of-law provisions. With respect to + disputes in which at least one party is a citizen of, or an entity chartered or + registered to do business in the United States of America, any litigation relating to + this License shall be subject to the jurisdiction of the Federal Courts of the + Northern District of California, with venue lying in Santa Clara County, California, + with the losing party responsible for costs, including without limitation, court + costs and reasonable attorneys' fees and expenses. The application of the United + Nations Convention on Contracts for the International Sale of Goods is expressly + excluded. Any law or regulation which provides that the language of a contract + shall be construed against the drafter shall not apply to this License. +

    12. Responsibility for claims

    +

    As between Initial Developer and the Contributors, each party is responsible for + claims and damages arising, directly or indirectly, out of its utilization of rights + under this License and You agree to work with Initial Developer and Contributors to + distribute such responsibility on an equitable basis. Nothing herein is intended or + shall be deemed to constitute any admission of liability. +

    13. Multiple-licensed code

    +

    Initial Developer may designate portions of the Covered Code as + "Multiple-Licensed". "Multiple-Licensed" means that the Initial Developer permits + you to utilize portions of the Covered Code under Your choice of the MPL + or the alternative licenses, if any, specified by the Initial Developer in the file + described in Exhibit A. +

    Exhibit A - Mozilla Public License.

    +
    "The contents of this file are subject to the Mozilla Public License
    +Version 1.1 (the "License"); you may not use this file except in
    +compliance with the License. You may obtain a copy of the License at
    +http://www.mozilla.org/MPL/
    +
    +Software distributed under the License is distributed on an "AS IS"
    +basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
    +License for the specific language governing rights and limitations
    +under the License.
    +
    +The Original Code is JTransforms.
    +
    +The Initial Developer of the Original Code is
    +Piotr Wendykier, Emory University.
    +Portions created by the Initial Developer are Copyright (C) 2007-2009
    +the Initial Developer. All Rights Reserved.
    +
    +Alternatively, the contents of this file may be used under the terms of
    +either the GNU General Public License Version 2 or later (the "GPL"), or
    +the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
    +in which case the provisions of the GPL or the LGPL are applicable instead
    +of those above. If you wish to allow use of your version of this file only
    +under the terms of either the GPL or the LGPL, and not to allow others to
    +use your version of this file under the terms of the MPL, indicate your
    +decision by deleting the provisions above and replace them with the notice
    +and other provisions required by the GPL or the LGPL. If you do not delete
    +the provisions above, a recipient may use your version of this file under
    +the terms of any one of the MPL, the GPL or the LGPL.
    +

    NOTE: The text of this Exhibit A may differ slightly from the text of + the notices in the Source Code files of the Original Code. You should + use the text of this Exhibit A rather than the text found in the + Original Code Source Code for Your Modifications. + +

\ No newline at end of file
diff --git a/licenses/LICENSE-kryo.txt b/licenses-binary/LICENSE-kryo.txt
similarity index 100%
rename from licenses/LICENSE-kryo.txt
rename to licenses-binary/LICENSE-kryo.txt
diff --git a/licenses-binary/LICENSE-leveldbjni.txt b/licenses-binary/LICENSE-leveldbjni.txt
new file mode 100644
index 0000000000000..b4dabb9174c6d
--- /dev/null
+++ b/licenses-binary/LICENSE-leveldbjni.txt
@@ -0,0 +1,27 @@
+Copyright (c) 2011 FuseSource Corp. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of FuseSource Corp. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/licenses-binary/LICENSE-machinist.txt b/licenses-binary/LICENSE-machinist.txt
new file mode 100644
index 0000000000000..68cc3a3e3a9c4
--- /dev/null
+++ b/licenses-binary/LICENSE-machinist.txt
@@ -0,0 +1,19 @@
+Copyright (c) 2011-2014 Erik Osheim, Tom Switzer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/licenses-binary/LICENSE-matchMedia-polyfill.txt b/licenses-binary/LICENSE-matchMedia-polyfill.txt
new file mode 100644
index 0000000000000..2fd0bc2b37448
--- /dev/null
+++ b/licenses-binary/LICENSE-matchMedia-polyfill.txt
@@ -0,0 +1 @@
+matchMedia() polyfill - Test a CSS media type/query in JS. Authors & copyright (c) 2012: Scott Jehl, Paul Irish, Nicholas Zakas. Dual MIT/BSD license
\ No newline at end of file
diff --git a/licenses/LICENSE-minlog.txt b/licenses-binary/LICENSE-minlog.txt
similarity index 100%
rename from licenses/LICENSE-minlog.txt
rename to licenses-binary/LICENSE-minlog.txt
diff --git a/licenses-binary/LICENSE-modernizr.txt b/licenses-binary/LICENSE-modernizr.txt
new file mode 100644
index 0000000000000..2bf24b9b9f848
--- /dev/null
+++ b/licenses-binary/LICENSE-modernizr.txt
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
diff --git a/licenses/LICENSE-netlib.txt b/licenses-binary/LICENSE-netlib.txt
similarity index 100%
rename from licenses/LICENSE-netlib.txt
rename to licenses-binary/LICENSE-netlib.txt
diff --git a/licenses/LICENSE-paranamer.txt b/licenses-binary/LICENSE-paranamer.txt
similarity index 100%
rename from licenses/LICENSE-paranamer.txt
rename to licenses-binary/LICENSE-paranamer.txt
diff --git a/licenses/LICENSE-jpmml-model.txt b/licenses-binary/LICENSE-pmml-model.txt
similarity index 100%
rename from licenses/LICENSE-jpmml-model.txt
rename to licenses-binary/LICENSE-pmml-model.txt
diff --git a/licenses/LICENSE-protobuf.txt b/licenses-binary/LICENSE-protobuf.txt
similarity index 100%
rename from licenses/LICENSE-protobuf.txt
rename to licenses-binary/LICENSE-protobuf.txt
diff --git a/licenses-binary/LICENSE-py4j.txt b/licenses-binary/LICENSE-py4j.txt
new file mode 100644
index 0000000000000..70af3e69ed67a
--- /dev/null
+++ b/licenses-binary/LICENSE-py4j.txt
@@ -0,0 +1,27 @@
+Copyright (c) 2009-2011, Barthelemy Dagenais All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+ +- The name of the author may not be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + diff --git a/licenses/LICENSE-pyrolite.txt b/licenses-binary/LICENSE-pyrolite.txt similarity index 100% rename from licenses/LICENSE-pyrolite.txt rename to licenses-binary/LICENSE-pyrolite.txt diff --git a/licenses/LICENSE-reflectasm.txt b/licenses-binary/LICENSE-reflectasm.txt similarity index 100% rename from licenses/LICENSE-reflectasm.txt rename to licenses-binary/LICENSE-reflectasm.txt diff --git a/licenses-binary/LICENSE-respond.txt b/licenses-binary/LICENSE-respond.txt new file mode 100644 index 0000000000000..dea4ff9e5b2ea --- /dev/null +++ b/licenses-binary/LICENSE-respond.txt @@ -0,0 +1,22 @@ +Copyright (c) 2012 Scott Jehl + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/licenses-binary/LICENSE-sbt-launch-lib.txt b/licenses-binary/LICENSE-sbt-launch-lib.txt new file mode 100644 index 0000000000000..3b9156baaab78 --- /dev/null +++ b/licenses-binary/LICENSE-sbt-launch-lib.txt @@ -0,0 +1,26 @@ +// Generated from http://www.opensource.org/licenses/bsd-license.php +Copyright (c) 2011, Paul Phillips. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+ * Neither the name of the author nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-scala.txt b/licenses-binary/LICENSE-scala.txt similarity index 100% rename from licenses/LICENSE-scala.txt rename to licenses-binary/LICENSE-scala.txt diff --git a/licenses-binary/LICENSE-scopt.txt b/licenses-binary/LICENSE-scopt.txt new file mode 100644 index 0000000000000..e92e9b592fba0 --- /dev/null +++ b/licenses-binary/LICENSE-scopt.txt @@ -0,0 +1,9 @@ +This project is licensed under the MIT license. + +Copyright (c) scopt contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
\ No newline at end of file diff --git a/licenses/LICENSE-slf4j.txt b/licenses-binary/LICENSE-slf4j.txt similarity index 100% rename from licenses/LICENSE-slf4j.txt rename to licenses-binary/LICENSE-slf4j.txt diff --git a/licenses-binary/LICENSE-sorttable.js.txt b/licenses-binary/LICENSE-sorttable.js.txt new file mode 100644 index 0000000000000..b31a5b206bf40 --- /dev/null +++ b/licenses-binary/LICENSE-sorttable.js.txt @@ -0,0 +1,16 @@ +Copyright (c) 1997-2007 Stuart Langridge + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/licenses/LICENSE-spire.txt b/licenses-binary/LICENSE-spire.txt similarity index 100% rename from licenses/LICENSE-spire.txt rename to licenses-binary/LICENSE-spire.txt diff --git a/licenses-binary/LICENSE-vis.txt b/licenses-binary/LICENSE-vis.txt new file mode 100644 index 0000000000000..18b7323059a41 --- /dev/null +++ b/licenses-binary/LICENSE-vis.txt @@ -0,0 +1,22 @@ +vis.js +https://github.com/almende/vis + +A dynamic, browser-based visualization library. + +@version 4.16.1 +@date 2016-04-18 + +@license +Copyright (C) 2011-2016 Almende B.V, http://almende.com + +Vis.js is dual licensed under both + +* The Apache 2.0 License + http://www.apache.org/licenses/LICENSE-2.0 + +and + +* The MIT License + http://opensource.org/licenses/MIT + +Vis.js may be distributed under either license. \ No newline at end of file diff --git a/licenses/LICENSE-xmlenc.txt b/licenses-binary/LICENSE-xmlenc.txt similarity index 100% rename from licenses/LICENSE-xmlenc.txt rename to licenses-binary/LICENSE-xmlenc.txt diff --git a/licenses/LICENSE-zstd-jni.txt b/licenses-binary/LICENSE-zstd-jni.txt similarity index 100% rename from licenses/LICENSE-zstd-jni.txt rename to licenses-binary/LICENSE-zstd-jni.txt diff --git a/licenses/LICENSE-zstd.txt b/licenses-binary/LICENSE-zstd.txt similarity index 100% rename from licenses/LICENSE-zstd.txt rename to licenses-binary/LICENSE-zstd.txt diff --git a/licenses/LICENSE-CC0.txt b/licenses/LICENSE-CC0.txt new file mode 100644 index 0000000000000..1625c17936079 --- /dev/null +++ b/licenses/LICENSE-CC0.txt @@ -0,0 +1,121 @@ +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. 
+ +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). 
Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. \ No newline at end of file diff --git a/licenses/LICENSE-SnapTree.txt b/licenses/LICENSE-SnapTree.txt deleted file mode 100644 index a538825d89ec5..0000000000000 --- a/licenses/LICENSE-SnapTree.txt +++ /dev/null @@ -1,35 +0,0 @@ -SNAPTREE LICENSE - -Copyright (c) 2009-2012 Stanford University, unless otherwise specified. -All rights reserved. - -This software was developed by the Pervasive Parallelism Laboratory of -Stanford University, California, USA. 
- -Permission to use, copy, modify, and distribute this software in source -or binary form for any purpose with or without fee is hereby granted, -provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - 3. Neither the name of Stanford University nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - -THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -SUCH DAMAGE. diff --git a/licenses/LICENSE-bootstrap.txt b/licenses/LICENSE-bootstrap.txt new file mode 100644 index 0000000000000..6c711832fbc85 --- /dev/null +++ b/licenses/LICENSE-bootstrap.txt @@ -0,0 +1,13 @@ +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/licenses/LICENSE-boto.txt b/licenses/LICENSE-boto.txt deleted file mode 100644 index 7bba0cd9e10a4..0000000000000 --- a/licenses/LICENSE-boto.txt +++ /dev/null @@ -1,20 +0,0 @@ -Copyright (c) 2006-2008 Mitch Garnaat http://garnaat.org/ - -Permission is hereby granted, free of charge, to any person obtaining a -copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, dis- -tribute, sublicense, and/or sell copies of the Software, and to permit -persons to whom the Software is furnished to do so, subject to the fol- -lowing conditions: - -The above copyright notice and this permission notice shall be included -in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL- -ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT -SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -IN THE SOFTWARE. 
\ No newline at end of file diff --git a/licenses/LICENSE-datatables.txt b/licenses/LICENSE-datatables.txt new file mode 100644 index 0000000000000..bb7708b5b5a49 --- /dev/null +++ b/licenses/LICENSE-datatables.txt @@ -0,0 +1,7 @@ +Copyright (C) 2008-2018, SpryMedia Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/licenses/LICENSE-graphlib-dot.txt b/licenses/LICENSE-graphlib-dot.txt index c9e18cd562423..4864fe05e9803 100644 --- a/licenses/LICENSE-graphlib-dot.txt +++ b/licenses/LICENSE-graphlib-dot.txt @@ -1,4 +1,4 @@ -Copyright (c) 2012-2013 Chris Pettitt +Copyright (c) 2013 Chris Pettitt Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/licenses/LICENSE-jbcrypt.txt b/licenses/LICENSE-jbcrypt.txt deleted file mode 100644 index d332534c06356..0000000000000 --- a/licenses/LICENSE-jbcrypt.txt +++ /dev/null @@ -1,17 +0,0 @@ -jBCrypt is subject to the following license: - -/* - * Copyright (c) 2006 Damien Miller - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ diff --git a/licenses/LICENSE-jmock.txt b/licenses/LICENSE-join.txt similarity index 60% rename from licenses/LICENSE-jmock.txt rename to licenses/LICENSE-join.txt index ed7964fe3d9ef..1d916090e4ea0 100644 --- a/licenses/LICENSE-jmock.txt +++ b/licenses/LICENSE-join.txt @@ -1,19 +1,21 @@ -Copyright (c) 2000-2017, jMock.org +Copyright (c) 2011, Douban Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. 
Redistributions -in binary form must reproduce the above copyright notice, this list of -conditions and the following disclaimer in the documentation and/or -other materials provided with the distribution. + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. -Neither the name of jMock nor the names of its contributors may be -used to endorse or promote products derived from this software without -specific prior written permission. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + + * Neither the name of the Douban Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT @@ -25,4 +27,4 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-jquery.txt b/licenses/LICENSE-jquery.txt index e1dd696d3b6cc..45930542204fb 100644 --- a/licenses/LICENSE-jquery.txt +++ b/licenses/LICENSE-jquery.txt @@ -1,9 +1,20 @@ -The MIT License (MIT) +Copyright JS Foundation and other contributors, https://js.foundation/ -Copyright (c) +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
\ No newline at end of file +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/licenses/LICENSE-json-formatter.txt b/licenses/LICENSE-json-formatter.txt new file mode 100644 index 0000000000000..5193348fce126 --- /dev/null +++ b/licenses/LICENSE-json-formatter.txt @@ -0,0 +1,6 @@ +Copyright 2014 Mohsen Azimi + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. \ No newline at end of file diff --git a/licenses/LICENSE-matchMedia-polyfill.txt b/licenses/LICENSE-matchMedia-polyfill.txt new file mode 100644 index 0000000000000..2fd0bc2b37448 --- /dev/null +++ b/licenses/LICENSE-matchMedia-polyfill.txt @@ -0,0 +1 @@ +matchMedia() polyfill - Test a CSS media type/query in JS. Authors & copyright (c) 2012: Scott Jehl, Paul Irish, Nicholas Zakas. Dual MIT/BSD license \ No newline at end of file diff --git a/licenses/LICENSE-postgresql.txt b/licenses/LICENSE-postgresql.txt deleted file mode 100644 index 515bf9af4d432..0000000000000 --- a/licenses/LICENSE-postgresql.txt +++ /dev/null @@ -1,24 +0,0 @@ -PostgreSQL Database Management System -(formerly known as Postgres, then as Postgres95) - -Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group - -Portions Copyright (c) 1994, The Regents of the University of California - -Permission to use, copy, modify, and distribute this software and its -documentation for any purpose, without fee, and without a written agreement -is hereby granted, provided that the above copyright notice and this -paragraph and the following two paragraphs appear in all copies. - -IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR -DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING -LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS -DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, -INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY -AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS -ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO -PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
- diff --git a/licenses/LICENSE-respond.txt b/licenses/LICENSE-respond.txt new file mode 100644 index 0000000000000..dea4ff9e5b2ea --- /dev/null +++ b/licenses/LICENSE-respond.txt @@ -0,0 +1,22 @@ +Copyright (c) 2012 Scott Jehl + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/licenses/LICENSE-scalacheck.txt b/licenses/LICENSE-scalacheck.txt deleted file mode 100644 index cb8f97842f4c4..0000000000000 --- a/licenses/LICENSE-scalacheck.txt +++ /dev/null @@ -1,32 +0,0 @@ -ScalaCheck LICENSE - -Copyright (c) 2007-2015, Rickard Nilsson -All rights reserved. - -Permission to use, copy, modify, and distribute this software in source -or binary form for any purpose with or without fee is hereby granted, -provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - 3. Neither the name of the author nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - -THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-vis.txt b/licenses/LICENSE-vis.txt new file mode 100644 index 0000000000000..18b7323059a41 --- /dev/null +++ b/licenses/LICENSE-vis.txt @@ -0,0 +1,22 @@ +vis.js +https://github.com/almende/vis + +A dynamic, browser-based visualization library. 
+ +@version 4.16.1 +@date 2016-04-18 + +@license +Copyright (C) 2011-2016 Almende B.V, http://almende.com + +Vis.js is dual licensed under both + +* The Apache 2.0 License + http://www.apache.org/licenses/LICENSE-2.0 + +and + +* The MIT License + http://opensource.org/licenses/MIT + +Vis.js may be distributed under either license. \ No newline at end of file From 8f91c697e251423b826cd6ac4ddd9e2dac15b96e Mon Sep 17 00:00:00 2001 From: Yuanjian Li Date: Mon, 2 Jul 2018 14:35:37 +0800 Subject: [PATCH 38/79] [SPARK-24665][PYSPARK] Use SQLConf in PySpark to manage all sql configs ## What changes were proposed in this pull request? Use SQLConf for PySpark to manage all sql configs, drop all the hard code in config usage. ## How was this patch tested? Existing UT. Author: Yuanjian Li Closes #21648 from xuanyuanking/SPARK-24665. --- python/pyspark/sql/context.py | 5 +++ python/pyspark/sql/dataframe.py | 42 +++++-------------- .../apache/spark/sql/internal/SQLConf.scala | 6 +++ 3 files changed, 21 insertions(+), 32 deletions(-) diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index e9ec7ba866761..9c094dd9a9033 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -93,6 +93,11 @@ def _ssql_ctx(self): """ return self._jsqlContext + @property + def _conf(self): + """Accessor for the JVM SQL-specific configurations""" + return self.sparkSession._jsparkSession.sessionState().conf() + @classmethod @since(1.6) def getOrCreate(cls, sc): diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index cb3fe448b6fc7..c40aea9bcef0a 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -354,32 +354,12 @@ def show(self, n=20, truncate=True, vertical=False): else: print(self._jdf.showString(n, int(truncate), vertical)) - @property - def _eager_eval(self): - """Returns true if the eager evaluation enabled. - """ - return self.sql_ctx.getConf( - "spark.sql.repl.eagerEval.enabled", "false").lower() == "true" - - @property - def _max_num_rows(self): - """Returns the max row number for eager evaluation. - """ - return int(self.sql_ctx.getConf( - "spark.sql.repl.eagerEval.maxNumRows", "20")) - - @property - def _truncate(self): - """Returns the truncate length for eager evaluation. 
- """ - return int(self.sql_ctx.getConf( - "spark.sql.repl.eagerEval.truncate", "20")) - def __repr__(self): - if not self._support_repr_html and self._eager_eval: + if not self._support_repr_html and self.sql_ctx._conf.isReplEagerEvalEnabled(): vertical = False return self._jdf.showString( - self._max_num_rows, self._truncate, vertical) + self.sql_ctx._conf.replEagerEvalMaxNumRows(), + self.sql_ctx._conf.replEagerEvalTruncate(), vertical) else: return "DataFrame[%s]" % (", ".join("%s: %s" % c for c in self.dtypes)) @@ -391,10 +371,10 @@ def _repr_html_(self): import cgi if not self._support_repr_html: self._support_repr_html = True - if self._eager_eval: - max_num_rows = max(self._max_num_rows, 0) + if self.sql_ctx._conf.isReplEagerEvalEnabled(): + max_num_rows = max(self.sql_ctx._conf.replEagerEvalMaxNumRows(), 0) sock_info = self._jdf.getRowsToPython( - max_num_rows, self._truncate) + max_num_rows, self.sql_ctx._conf.replEagerEvalTruncate()) rows = list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer()))) head = rows[0] row_data = rows[1:] @@ -2049,13 +2029,12 @@ def toPandas(self): import pandas as pd - if self.sql_ctx.getConf("spark.sql.execution.pandas.respectSessionTimeZone").lower() \ - == "true": - timezone = self.sql_ctx.getConf("spark.sql.session.timeZone") + if self.sql_ctx._conf.pandasRespectSessionTimeZone(): + timezone = self.sql_ctx._conf.sessionLocalTimeZone() else: timezone = None - if self.sql_ctx.getConf("spark.sql.execution.arrow.enabled", "false").lower() == "true": + if self.sql_ctx._conf.arrowEnabled(): use_arrow = True try: from pyspark.sql.types import to_arrow_schema @@ -2065,8 +2044,7 @@ def toPandas(self): to_arrow_schema(self.schema) except Exception as e: - if self.sql_ctx.getConf("spark.sql.execution.arrow.fallback.enabled", "true") \ - .lower() == "true": + if self.sql_ctx._conf.arrowFallbackEnabled(): msg = ( "toPandas attempted Arrow optimization because " "'spark.sql.execution.arrow.enabled' is set to true; however, " diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index da1c34cdc78f2..e2c48e2d8a14c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1728,6 +1728,12 @@ class SQLConf extends Serializable with Logging { def legacySizeOfNull: Boolean = getConf(SQLConf.LEGACY_SIZE_OF_NULL) + def isReplEagerEvalEnabled: Boolean = getConf(SQLConf.REPL_EAGER_EVAL_ENABLED) + + def replEagerEvalMaxNumRows: Int = getConf(SQLConf.REPL_EAGER_EVAL_MAX_NUM_ROWS) + + def replEagerEvalTruncate: Int = getConf(SQLConf.REPL_EAGER_EVAL_TRUNCATE) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ From 8008f9cb82e7c228b94eade2e7cb484d6d17e6a4 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 2 Jul 2018 22:09:47 +0800 Subject: [PATCH 39/79] [SPARK-24715][BUILD] Override jline version as 2.14.3 in SBT ## What changes were proposed in this pull request? During SPARK-24418 (Upgrade Scala to 2.11.12 and 2.12.6), we upgrade `jline` version together. So, `mvn` works correctly. However, `sbt` brings old jline library and is hitting `NoSuchMethodError` in `master` branch, see https://github.com/apache/spark/pull/21495#issuecomment-401560826. This overrides jline version in SBT to make sbt build work. ## How was this patch tested? Manually test. 
Author: Liang-Chi Hsieh Closes #21692 from viirya/SPARK-24715. --- project/SparkBuild.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index b606f9355e03b..f887e4570c85d 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -464,7 +464,8 @@ object DockerIntegrationTests { */ object DependencyOverrides { lazy val settings = Seq( - dependencyOverrides += "com.google.guava" % "guava" % "14.0.1") + dependencyOverrides += "com.google.guava" % "guava" % "14.0.1", + dependencyOverrides += "jline" % "jline" % "2.14.3") } /** From f599cde69506a5aedeeec449cba9a8b5ab128282 Mon Sep 17 00:00:00 2001 From: Rekha Joshi Date: Mon, 2 Jul 2018 22:39:00 +0800 Subject: [PATCH 40/79] [SPARK-24507][DOCUMENTATION] Update streaming guide ## What changes were proposed in this pull request? Updated streaming guide for direct stream and link to integration guide. ## How was this patch tested? jekyll build Author: Rekha Joshi Closes #21683 from rekhajoshm/SPARK-24507. --- docs/streaming-programming-guide.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index c30959263cdfa..118b05355c74d 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -2176,6 +2176,8 @@ the input data stream (using `inputStream.repartition()`). This distributes the received batches of data across the specified number of machines in the cluster before further processing. +For direct stream, please refer to [Spark Streaming + Kafka Integration Guide](streaming-kafka-integration.html) + ### Level of Parallelism in Data Processing {:.no_toc} Cluster resources can be under-utilized if the number of parallel tasks used in any stage of the From 42815548c7ef498439b9ba47134a6f3e1b519c83 Mon Sep 17 00:00:00 2001 From: mcheah Date: Mon, 2 Jul 2018 10:24:04 -0700 Subject: [PATCH 41/79] [SPARK-24683][K8S] Fix k8s no resource ## What changes were proposed in this pull request? Make SparkSubmit pass in the main class even if `SparkLauncher.NO_RESOURCE` is the primary resource. ## How was this patch tested? New integration test written to capture this case. Author: mcheah Closes #21660 from mccheah/fix-k8s-no-resource. 
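For context, the sketch below is a hypothetical illustration (not part of this patch) of a submission that exercises the fixed path: the application is launched with `SparkLauncher.NO_RESOURCE` as its primary resource, so SparkSubmit must still forward the main class to the Kubernetes driver. The master URL, container image, config key and main class here are placeholders, not values taken from this PR.

```scala
// Minimal sketch, assuming a reachable Kubernetes cluster and a Spark image
// that already contains the application class on its classpath.
import org.apache.spark.launcher.SparkLauncher

object NoResourceSubmitSketch {
  def main(args: Array[String]): Unit = {
    val handle = new SparkLauncher()
      .setMaster("k8s://https://example-apiserver:6443")          // placeholder API server
      .setDeployMode("cluster")
      .setAppResource(SparkLauncher.NO_RESOURCE)                   // no primary jar or .py file
      .setMainClass("org.apache.spark.examples.SparkPi")           // class shipped in the image
      .setConf("spark.kubernetes.container.image", "spark:latest") // placeholder image
      .startApplication()
    // With this fix, SparkSubmit still passes "--main-class" to the K8s child args
    // even though the primary resource is NO_RESOURCE, so the driver knows what to run.
    println(handle.getState)
  }
}
```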
--- .../scala/org/apache/spark/deploy/SparkSubmit.scala | 2 ++ .../deploy/k8s/submit/KubernetesDriverBuilder.scala | 3 ++- .../deploy/k8s/integrationtest/KubernetesSuite.scala | 10 ++++++++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index e83d82f847c61..2da778a29779d 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -702,6 +702,8 @@ private[spark] class SparkSubmit extends Logging { childArgs ++= Array("--primary-java-resource", args.primaryResource) childArgs ++= Array("--main-class", args.mainClass) } + } else { + childArgs ++= Array("--main-class", args.mainClass) } if (args.childArgs != null) { args.childArgs.foreach { arg => diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesDriverBuilder.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesDriverBuilder.scala index 5762d8245f778..0dd1c37661707 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesDriverBuilder.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesDriverBuilder.scala @@ -64,7 +64,8 @@ private[spark] class KubernetesDriverBuilder( case JavaMainAppResource(_) => provideJavaStep(kubernetesConf) case PythonMainAppResource(_) => - providePythonStep(kubernetesConf)}.getOrElse(provideJavaStep(kubernetesConf)) + providePythonStep(kubernetesConf)} + .getOrElse(provideJavaStep(kubernetesConf)) val allFeatures: Seq[KubernetesFeatureConfigStep] = (baseFeatures :+ bindingsStep) ++ diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala index 65c513cf241a4..6e334c83fbde8 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala @@ -21,17 +21,17 @@ import java.nio.file.{Path, Paths} import java.util.UUID import java.util.regex.Pattern -import scala.collection.JavaConverters._ - import com.google.common.io.PatternFilenameFilter import io.fabric8.kubernetes.api.model.{Container, Pod} import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll} import org.scalatest.concurrent.{Eventually, PatienceConfiguration} import org.scalatest.time.{Minutes, Seconds, Span} +import scala.collection.JavaConverters._ import org.apache.spark.SparkFunSuite import org.apache.spark.deploy.k8s.integrationtest.backend.{IntegrationTestBackend, IntegrationTestBackendFactory} import org.apache.spark.deploy.k8s.integrationtest.config._ +import org.apache.spark.launcher.SparkLauncher private[spark] class KubernetesSuite extends SparkFunSuite with BeforeAndAfterAll with BeforeAndAfter { @@ -109,6 +109,12 @@ private[spark] class KubernetesSuite extends SparkFunSuite runSparkPiAndVerifyCompletion() } + test("Use SparkLauncher.NO_RESOURCE") { + sparkAppConf.setJars(Seq(containerLocalSparkDistroExamplesJar)) + runSparkPiAndVerifyCompletion( + appResource = SparkLauncher.NO_RESOURCE) + } + test("Run 
SparkPi with a master URL without a scheme.") { val url = kubernetesTestComponents.kubernetesClient.getMasterUrl val k8sMasterUrl = if (url.getPort < 0) { From 85fe1297e35bcff9cf86bd53fee615e140ee5bfb Mon Sep 17 00:00:00 2001 From: Stavros Kontopoulos Date: Mon, 2 Jul 2018 13:08:16 -0700 Subject: [PATCH 42/79] [SPARK-24428][K8S] Fix unused code ## What changes were proposed in this pull request? Remove code that is misleading and is a leftover from a previous implementation. ## How was this patch tested? Manually. Author: Stavros Kontopoulos Closes #21462 from skonto/fix-k8s-docs. --- .../org/apache/spark/deploy/k8s/Constants.scala | 6 ------ .../cluster/k8s/KubernetesClusterManager.scala | 2 -- .../docker/src/main/dockerfiles/spark/entrypoint.sh | 12 +++++------- 3 files changed, 5 insertions(+), 15 deletions(-) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala index 69bd03d1eda6f..5ecdd3a04d77b 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala @@ -25,9 +25,6 @@ private[spark] object Constants { val SPARK_POD_DRIVER_ROLE = "driver" val SPARK_POD_EXECUTOR_ROLE = "executor" - // Annotations - val SPARK_APP_NAME_ANNOTATION = "spark-app-name" - // Credentials secrets val DRIVER_CREDENTIALS_SECRETS_BASE_DIR = "/mnt/secrets/spark-kubernetes-credentials" @@ -50,17 +47,14 @@ private[spark] object Constants { val DEFAULT_BLOCKMANAGER_PORT = 7079 val DRIVER_PORT_NAME = "driver-rpc-port" val BLOCK_MANAGER_PORT_NAME = "blockmanager" - val EXECUTOR_PORT_NAME = "executor" // Environment Variables - val ENV_EXECUTOR_PORT = "SPARK_EXECUTOR_PORT" val ENV_DRIVER_URL = "SPARK_DRIVER_URL" val ENV_EXECUTOR_CORES = "SPARK_EXECUTOR_CORES" val ENV_EXECUTOR_MEMORY = "SPARK_EXECUTOR_MEMORY" val ENV_APPLICATION_ID = "SPARK_APPLICATION_ID" val ENV_EXECUTOR_ID = "SPARK_EXECUTOR_ID" val ENV_EXECUTOR_POD_IP = "SPARK_EXECUTOR_POD_IP" - val ENV_MOUNTED_CLASSPATH = "SPARK_MOUNTED_CLASSPATH" val ENV_JAVA_OPT_PREFIX = "SPARK_JAVA_OPT_" val ENV_CLASSPATH = "SPARK_CLASSPATH" val ENV_DRIVER_BIND_ADDRESS = "SPARK_DRIVER_BIND_ADDRESS" diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala index c6e931a38405f..de2a52bc7a0b8 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala @@ -48,8 +48,6 @@ private[spark] class KubernetesClusterManager extends ExternalClusterManager wit sc: SparkContext, masterURL: String, scheduler: TaskScheduler): SchedulerBackend = { - val executorSecretNamesToMountPaths = KubernetesUtils.parsePrefixedKeyValuePairs( - sc.conf, KUBERNETES_EXECUTOR_SECRETS_PREFIX) val kubernetesClient = SparkKubernetesClientFactory.createKubernetesClient( KUBERNETES_MASTER_INTERNAL_URL, Some(sc.conf.get(KUBERNETES_NAMESPACE)), diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh index 2f4e115e84ecd..8bdb0f7a10795 
100755 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh @@ -51,12 +51,10 @@ esac SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt -readarray -t SPARK_JAVA_OPTS < /tmp/java_opts.txt -if [ -n "$SPARK_MOUNTED_CLASSPATH" ]; then - SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_MOUNTED_CLASSPATH" -fi -if [ -n "$SPARK_MOUNTED_FILES_DIR" ]; then - cp -R "$SPARK_MOUNTED_FILES_DIR/." . +readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt + +if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then + SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" fi if [ -n "$PYSPARK_FILES" ]; then @@ -101,7 +99,7 @@ case "$SPARK_K8S_CMD" in executor) CMD=( ${JAVA_HOME}/bin/java - "${SPARK_JAVA_OPTS[@]}" + "${SPARK_EXECUTOR_JAVA_OPTS[@]}" -Xms$SPARK_EXECUTOR_MEMORY -Xmx$SPARK_EXECUTOR_MEMORY -cp "$SPARK_CLASSPATH" From a7c8f0c8cb144a026ea21e8780107e363ceacb8d Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 3 Jul 2018 12:20:03 +0800 Subject: [PATCH 43/79] [SPARK-24385][SQL] Resolve self-join condition ambiguity for EqualNullSafe ## What changes were proposed in this pull request? In Dataset.join we have a small hack for resolving ambiguity in the column name for self-joins. The current code supports only `EqualTo`. The PR extends the fix to `EqualNullSafe`. Credit for this PR should be given to daniel-shields. ## How was this patch tested? added UT Author: Marco Gaido Closes #21605 from mgaido91/SPARK-24385_2. --- .../src/main/scala/org/apache/spark/sql/Dataset.scala | 5 +++++ .../scala/org/apache/spark/sql/DataFrameJoinSuite.scala | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 2ec236fc75efc..c97246f30220d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1016,6 +1016,11 @@ class Dataset[T] private[sql]( catalyst.expressions.EqualTo( withPlan(plan.left).resolve(a.name), withPlan(plan.right).resolve(b.name)) + case catalyst.expressions.EqualNullSafe(a: AttributeReference, b: AttributeReference) + if a.sameRef(b) => + catalyst.expressions.EqualNullSafe( + withPlan(plan.left).resolve(a.name), + withPlan(plan.right).resolve(b.name)) }} withPlan { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala index 0d9eeabb397a1..10d9a11d2ee79 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala @@ -287,4 +287,12 @@ class DataFrameJoinSuite extends QueryTest with SharedSQLContext { dfOne.join(dfTwo, $"a" === $"b", "left").queryExecution.optimizedPlan } } + + test("SPARK-24385: Resolve ambiguity in self-joins with EqualNullSafe") { + withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "false") { + val df = spark.range(2) + // this throws an exception before the fix + df.join(df, df("id") <=> df("id")).queryExecution.optimizedPlan + } + } } From 5585c5765f13519a447587ca778d52ce6a36a484 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Tue, 3 Jul 2018 10:13:48 -0700 Subject: [PATCH 44/79] [SPARK-24420][BUILD] Upgrade ASM to 6.1 to support JDK9+ ## What changes were proposed in this pull request? 
Upgrade ASM to 6.1 to support JDK9+ ## How was this patch tested? Existing tests. Author: DB Tsai Closes #21459 from dbtsai/asm. --- core/pom.xml | 2 +- .../main/scala/org/apache/spark/util/ClosureCleaner.scala | 4 ++-- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- dev/deps/spark-deps-hadoop-3.1 | 2 +- graphx/pom.xml | 2 +- .../org/apache/spark/graphx/util/BytecodeUtils.scala | 4 ++-- pom.xml | 8 ++++---- repl/pom.xml | 4 ++-- .../scala/org/apache/spark/repl/ExecutorClassLoader.scala | 4 ++-- sql/core/pom.xml | 2 +- 11 files changed, 18 insertions(+), 18 deletions(-) diff --git a/core/pom.xml b/core/pom.xml index 220522d3a8296..d0b869e6ef92c 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -56,7 +56,7 @@ org.apache.xbean - xbean-asm5-shaded + xbean-asm6-shaded org.apache.hadoop diff --git a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala index ad0c0639521f6..073d71c63b0c7 100644 --- a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala +++ b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala @@ -22,8 +22,8 @@ import java.io.{ByteArrayInputStream, ByteArrayOutputStream} import scala.collection.mutable.{Map, Set, Stack} import scala.language.existentials -import org.apache.xbean.asm5.{ClassReader, ClassVisitor, MethodVisitor, Type} -import org.apache.xbean.asm5.Opcodes._ +import org.apache.xbean.asm6.{ClassReader, ClassVisitor, MethodVisitor, Type} +import org.apache.xbean.asm6.Opcodes._ import org.apache.spark.{SparkEnv, SparkException} import org.apache.spark.internal.Logging diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 96e9c27210d05..f50a0aac0aefc 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -192,7 +192,7 @@ stringtemplate-3.2.1.jar super-csv-2.2.0.jar univocity-parsers-2.6.3.jar validation-api-1.1.0.Final.jar -xbean-asm5-shaded-4.4.jar +xbean-asm6-shaded-4.8.jar xercesImpl-2.9.1.jar xmlenc-0.52.jar xz-1.0.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 4a6ee027ec355..774f9dc39ce4d 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -193,7 +193,7 @@ stringtemplate-3.2.1.jar super-csv-2.2.0.jar univocity-parsers-2.6.3.jar validation-api-1.1.0.Final.jar -xbean-asm5-shaded-4.4.jar +xbean-asm6-shaded-4.8.jar xercesImpl-2.9.1.jar xmlenc-0.52.jar xz-1.0.jar diff --git a/dev/deps/spark-deps-hadoop-3.1 b/dev/deps/spark-deps-hadoop-3.1 index e0b560c8ec71f..19c05ad1e991f 100644 --- a/dev/deps/spark-deps-hadoop-3.1 +++ b/dev/deps/spark-deps-hadoop-3.1 @@ -214,7 +214,7 @@ token-provider-1.0.1.jar univocity-parsers-2.6.3.jar validation-api-1.1.0.Final.jar woodstox-core-5.0.3.jar -xbean-asm5-shaded-4.4.jar +xbean-asm6-shaded-4.8.jar xz-1.0.jar zjsonpatch-0.3.0.jar zookeeper-3.4.9.jar diff --git a/graphx/pom.xml b/graphx/pom.xml index fbe77fcb958d5..0f5dc548600b2 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -53,7 +53,7 @@ org.apache.xbean - xbean-asm5-shaded + xbean-asm6-shaded com.google.guava diff --git a/graphx/src/main/scala/org/apache/spark/graphx/util/BytecodeUtils.scala b/graphx/src/main/scala/org/apache/spark/graphx/util/BytecodeUtils.scala index d76e84ed8c9ed..a559685b1633c 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/util/BytecodeUtils.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/util/BytecodeUtils.scala @@ -22,8 +22,8 @@ import java.io.{ByteArrayInputStream, ByteArrayOutputStream} 
import scala.collection.mutable.HashSet import scala.language.existentials -import org.apache.xbean.asm5.{ClassReader, ClassVisitor, MethodVisitor} -import org.apache.xbean.asm5.Opcodes._ +import org.apache.xbean.asm6.{ClassReader, ClassVisitor, MethodVisitor} +import org.apache.xbean.asm6.Opcodes._ import org.apache.spark.util.Utils diff --git a/pom.xml b/pom.xml index 90e64ff71d229..ca30f9f12b098 100644 --- a/pom.xml +++ b/pom.xml @@ -313,13 +313,13 @@ chill-java ${chill.version} - org.apache.xbean - xbean-asm5-shaded - 4.4 + xbean-asm6-shaded + 4.8 @@ -166,7 +166,7 @@ - + scala-2.12 diff --git a/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala b/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala index 4dc399827ffed..42298b06a2c86 100644 --- a/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala +++ b/repl/src/main/scala/org/apache/spark/repl/ExecutorClassLoader.scala @@ -22,8 +22,8 @@ import java.net.{URI, URL, URLEncoder} import java.nio.channels.Channels import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.xbean.asm5._ -import org.apache.xbean.asm5.Opcodes._ +import org.apache.xbean.asm6._ +import org.apache.xbean.asm6.Opcodes._ import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil diff --git a/sql/core/pom.xml b/sql/core/pom.xml index f270c70fbfcf0..18ae314309d7b 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -118,7 +118,7 @@ org.apache.xbean - xbean-asm5-shaded + xbean-asm6-shaded org.scalacheck From 776f299fc8146b400e97185b1577b0fc8f06e14b Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Wed, 4 Jul 2018 09:38:18 +0800 Subject: [PATCH 45/79] [SPARK-24709][SQL] schema_of_json() - schema inference from an example ## What changes were proposed in this pull request? In the PR, I propose to add new function - *schema_of_json()* which infers schema of JSON string literal. The result of the function is a string containing a schema in DDL format. One of the use cases is using of *schema_of_json()* in the combination with *from_json()*. Currently, _from_json()_ requires a schema as a mandatory argument. The *schema_of_json()* function will allow to point out an JSON string as an example which has the same schema as the first argument of _from_json()_. For instance: ```sql select from_json(json_column, schema_of_json('{"c1": [0], "c2": [{"c3":0}]}')) from json_table; ``` ## How was this patch tested? Added new test to `JsonFunctionsSuite`, `JsonExpressionsSuite` and SQL tests to `json-functions.sql` Author: Maxim Gekk Closes #21686 from MaxGekk/infer_schema_json. 
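To complement the SQL example above, here is a minimal Scala sketch (not taken from the patch itself) of the two new `functions` overloads working together; the sample JSON documents and column name are invented for illustration and assume Spark 2.4+.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{from_json, lit, schema_of_json}

object SchemaOfJsonSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("schema_of_json demo").getOrCreate()
    import spark.implicits._

    // A column of JSON documents; the literal passed to schema_of_json acts as a shape example.
    val df = Seq("""{"c1": [1, 2, 3], "c2": [{"c3": 7}]}""").toDF("json_column")
    val inferredSchema = schema_of_json(lit("""{"c1": [0], "c2": [{"c3": 0}]}"""))

    // from_json now also accepts the schema as a Column, i.e. the output of schema_of_json.
    df.select(from_json($"json_column", inferredSchema).alias("parsed")).show(truncate = false)
    spark.stop()
  }
}
```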
--- python/pyspark/sql/functions.py | 27 ++++++++++ .../catalyst/analysis/FunctionRegistry.scala | 1 + .../expressions/jsonExpressions.scala | 52 ++++++++++++++++--- .../sql/catalyst}/json/JsonInferSchema.scala | 5 +- .../expressions/JsonExpressionsSuite.scala | 7 +++ .../datasources/json/JsonDataSource.scala | 2 +- .../org/apache/spark/sql/functions.scala | 42 +++++++++++++++ .../sql-tests/inputs/json-functions.sql | 4 ++ .../sql-tests/results/json-functions.sql.out | 20 ++++++- .../apache/spark/sql/JsonFunctionsSuite.scala | 17 +++++- .../datasources/json/JsonSuite.scala | 4 +- 11 files changed, 163 insertions(+), 18 deletions(-) rename sql/{core/src/main/scala/org/apache/spark/sql/execution/datasources => catalyst/src/main/scala/org/apache/spark/sql/catalyst}/json/JsonInferSchema.scala (98%) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 9652d3e79b875..4d371976364d3 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2189,11 +2189,16 @@ def from_json(col, schema, options={}): >>> df = spark.createDataFrame(data, ("key", "value")) >>> df.select(from_json(df.value, schema).alias("json")).collect() [Row(json=[Row(a=1)])] + >>> schema = schema_of_json(lit('''{"a": 0}''')) + >>> df.select(from_json(df.value, schema).alias("json")).collect() + [Row(json=Row(a=1))] """ sc = SparkContext._active_spark_context if isinstance(schema, DataType): schema = schema.json() + elif isinstance(schema, Column): + schema = _to_java_column(schema) jc = sc._jvm.functions.from_json(_to_java_column(col), schema, options) return Column(jc) @@ -2235,6 +2240,28 @@ def to_json(col, options={}): return Column(jc) +@ignore_unicode_prefix +@since(2.4) +def schema_of_json(col): + """ + Parses a column containing a JSON string and infers its schema in DDL format. 
+ + :param col: string column in json format + + >>> from pyspark.sql.types import * + >>> data = [(1, '{"a": 1}')] + >>> df = spark.createDataFrame(data, ("key", "value")) + >>> df.select(schema_of_json(df.value).alias("json")).collect() + [Row(json=u'struct')] + >>> df.select(schema_of_json(lit('{"a": 0}')).alias("json")).collect() + [Row(json=u'struct')] + """ + + sc = SparkContext._active_spark_context + jc = sc._jvm.functions.schema_of_json(_to_java_column(col)) + return Column(jc) + + @since(1.5) def size(col): """ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index a574d8a84d4fb..80a0af672bf74 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -505,6 +505,7 @@ object FunctionRegistry { // json expression[StructsToJson]("to_json"), expression[JsonToStructs]("from_json"), + expression[SchemaOfJson]("schema_of_json"), // cast expression[Cast]("cast"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index f6d74f5b74c8e..8cd86053a01c7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.expressions -import java.io.{ByteArrayInputStream, ByteArrayOutputStream, CharArrayWriter, InputStreamReader, StringWriter} +import java.io._ import scala.util.parsing.combinator.RegexParsers @@ -28,7 +28,8 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.json._ -import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, BadRecordException, FailFastMode, GenericArrayData, MapData} +import org.apache.spark.sql.catalyst.json.JsonInferSchema.inferField +import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -525,17 +526,19 @@ case class JsonToStructs( override def nullable: Boolean = true // Used in `FunctionRegistry` - def this(child: Expression, schema: Expression) = + def this(child: Expression, schema: Expression, options: Map[String, String]) = this( - schema = JsonExprUtils.validateSchemaLiteral(schema), - options = Map.empty[String, String], + schema = JsonExprUtils.evalSchemaExpr(schema), + options = options, child = child, timeZoneId = None, forceNullableSchema = SQLConf.get.getConf(SQLConf.FROM_JSON_FORCE_NULLABLE_SCHEMA)) + def this(child: Expression, schema: Expression) = this(child, schema, Map.empty[String, String]) + def this(child: Expression, schema: Expression, options: Expression) = this( - schema = JsonExprUtils.validateSchemaLiteral(schema), + schema = JsonExprUtils.evalSchemaExpr(schema), options = JsonExprUtils.convertToMapData(options), child = child, timeZoneId = None, @@ -744,11 +747,44 @@ case class StructsToJson( override def inputTypes: Seq[AbstractDataType] = TypeCollection(ArrayType, StructType) :: Nil } +/** + * A function 
infers schema of JSON string. + */ +@ExpressionDescription( + usage = "_FUNC_(json[, options]) - Returns schema in the DDL format of JSON string.", + examples = """ + Examples: + > SELECT _FUNC_('[{"col":0}]'); + array> + """, + since = "2.4.0") +case class SchemaOfJson(child: Expression) + extends UnaryExpression with String2StringExpression with CodegenFallback { + + private val jsonOptions = new JSONOptions(Map.empty, "UTC") + private val jsonFactory = new JsonFactory() + jsonOptions.setJacksonOptions(jsonFactory) + + override def convert(v: UTF8String): UTF8String = { + val dt = Utils.tryWithResource(CreateJacksonParser.utf8String(jsonFactory, v)) { parser => + parser.nextToken() + inferField(parser, jsonOptions) + } + + UTF8String.fromString(dt.catalogString) + } +} + object JsonExprUtils { - def validateSchemaLiteral(exp: Expression): DataType = exp match { + def evalSchemaExpr(exp: Expression): DataType = exp match { case Literal(s, StringType) => DataType.fromDDL(s.toString) - case e => throw new AnalysisException(s"Expected a string literal instead of $e") + case e @ SchemaOfJson(_: Literal) => + val ddlSchema = e.eval().asInstanceOf[UTF8String] + DataType.fromDDL(ddlSchema.toString) + case e => throw new AnalysisException( + "Schema should be specified in DDL format as a string literal" + + s" or output of the schema_of_json function instead of ${e.sql}") } def convertToMapData(exp: Expression): Map[String, String] = exp match { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala similarity index 98% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonInferSchema.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala index 8e1b430f4eb33..491ca005877f8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.execution.datasources.json +package org.apache.spark.sql.catalyst.json import java.util.Comparator @@ -25,7 +25,6 @@ import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.analysis.TypeCoercion import org.apache.spark.sql.catalyst.json.JacksonUtils.nextUntil -import org.apache.spark.sql.catalyst.json.JSONOptions import org.apache.spark.sql.catalyst.util.{DropMalformedMode, FailFastMode, ParseMode, PermissiveMode} import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -103,7 +102,7 @@ private[sql] object JsonInferSchema { /** * Infer the type of a json document from the parser's token stream */ - private def inferField(parser: JsonParser, configOptions: JSONOptions): DataType = { + def inferField(parser: JsonParser, configOptions: JSONOptions): DataType = { import com.fasterxml.jackson.core.JsonToken._ parser.getCurrentToken match { case null | VALUE_NULL => NullType diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala index 00e97637eee7e..52203b9e337ba 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala @@ -706,4 +706,11 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with assert(schemaToCompare == schema) } } + + test("SPARK-24709: infer schema of json strings") { + checkEvaluation(SchemaOfJson(Literal.create("""{"col":0}""")), "struct") + checkEvaluation( + SchemaOfJson(Literal.create("""{"col0":["a"], "col1": {"col2": "b"}}""")), + "struct,col1:struct>") + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala index 3b6df45e949e8..2fee2128ba1f9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala @@ -33,7 +33,7 @@ import org.apache.spark.input.{PortableDataStream, StreamInputFormat} import org.apache.spark.rdd.{BinaryFileRDD, RDD} import org.apache.spark.sql.{Dataset, Encoders, SparkSession} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.json.{CreateJacksonParser, JacksonParser, JSONOptions} +import org.apache.spark.sql.catalyst.json.{CreateJacksonParser, JacksonParser, JsonInferSchema, JSONOptions} import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.text.TextFileFormat diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index acca9572cb14c..614f65f0faaba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -3381,6 +3381,48 @@ object functions { from_json(e, dataType, options) } + /** + * (Scala-specific) Parses a column containing a JSON string into a `MapType` with `StringType` + * as keys type, `StructType` or `ArrayType` of `StructType`s with the specified schema. + * Returns `null`, in the case of an unparseable string. 
+ * + * @param e a string column containing JSON data. + * @param schema the schema to use when parsing the json string + * + * @group collection_funcs + * @since 2.4.0 + */ + def from_json(e: Column, schema: Column): Column = { + from_json(e, schema, Map.empty[String, String].asJava) + } + + /** + * (Java-specific) Parses a column containing a JSON string into a `MapType` with `StringType` + * as keys type, `StructType` or `ArrayType` of `StructType`s with the specified schema. + * Returns `null`, in the case of an unparseable string. + * + * @param e a string column containing JSON data. + * @param schema the schema to use when parsing the json string + * @param options options to control how the json is parsed. accepts the same options and the + * json data source. + * + * @group collection_funcs + * @since 2.4.0 + */ + def from_json(e: Column, schema: Column, options: java.util.Map[String, String]): Column = { + withExpr(new JsonToStructs(e.expr, schema.expr, options.asScala.toMap)) + } + + /** + * Parses a column containing a JSON string and infers its schema. + * + * @param e a string column containing JSON data. + * + * @group collection_funcs + * @since 2.4.0 + */ + def schema_of_json(e: Column): Column = withExpr(new SchemaOfJson(e.expr)) + /** * (Scala-specific) Converts a column containing a `StructType`, `ArrayType` of `StructType`s, * a `MapType` or `ArrayType` of `MapType`s into a JSON string with the specified schema. diff --git a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql index dc15d13cd1dd3..79fdd5895e691 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql @@ -35,3 +35,7 @@ DROP VIEW IF EXISTS jsonTable; -- from_json - complex types select from_json('{"a":1, "b":2}', 'map'); select from_json('{"a":1, "b":"2"}', 'struct'); + +-- infer schema of json literal +select schema_of_json('{"c1":0, "c2":[1]}'); +select from_json('{"c1":[1, 2, 3]}', schema_of_json('{"c1":[0]}')); diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out index 2b3288dc5a137..3d49323751a10 100644 --- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 28 +-- Number of queries: 30 -- !query 0 @@ -183,7 +183,7 @@ select from_json('{"a":1}', 1) struct<> -- !query 17 output org.apache.spark.sql.AnalysisException -Expected a string literal instead of 1;; line 1 pos 7 +Schema should be specified in DDL format as a string literal or output of the schema_of_json function instead of 1;; line 1 pos 7 -- !query 18 @@ -274,3 +274,19 @@ select from_json('{"a":1, "b":"2"}', 'struct') struct> -- !query 27 output {"a":1,"b":"2"} + + +-- !query 28 +select schema_of_json('{"c1":0, "c2":[1]}') +-- !query 28 schema +struct +-- !query 28 output +struct> + + +-- !query 29 +select from_json('{"c1":[1, 2, 3]}', schema_of_json('{"c1":[0]}')) +-- !query 29 schema +struct>> +-- !query 29 output +{"c1":[1,2,3]} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala index 7bf17cbcd9c97..d3b2701f2558e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala 
+++ b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import org.apache.spark.sql.functions.{from_json, lit, map, struct, to_json} +import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ @@ -311,7 +311,7 @@ class JsonFunctionsSuite extends QueryTest with SharedSQLContext { val errMsg1 = intercept[AnalysisException] { df3.selectExpr("from_json(value, 1)") } - assert(errMsg1.getMessage.startsWith("Expected a string literal instead of")) + assert(errMsg1.getMessage.startsWith("Schema should be specified in DDL format as a string")) val errMsg2 = intercept[AnalysisException] { df3.selectExpr("""from_json(value, 'time InvalidType')""") } @@ -392,4 +392,17 @@ class JsonFunctionsSuite extends QueryTest with SharedSQLContext { checkAnswer(Seq("""{"{"f": 1}": "a"}""").toDS().select(from_json($"value", schema)), Row(null)) } + + test("SPARK-24709: infers schemas of json strings and pass them to from_json") { + val in = Seq("""{"a": [1, 2, 3]}""").toDS() + val out = in.select(from_json('value, schema_of_json(lit("""{"a": [1]}"""))) as "parsed") + val expected = StructType(StructField( + "parsed", + StructType(StructField( + "a", + ArrayType(LongType, true), true) :: Nil), + true) :: Nil) + + assert(out.schema == expected) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index 897424daca0cb..eab15b35c97d3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -31,11 +31,11 @@ import org.apache.hadoop.io.compress.GzipCodec import org.apache.spark.{SparkException, TestUtils} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{functions => F, _} -import org.apache.spark.sql.catalyst.json.{CreateJacksonParser, JacksonParser, JSONOptions} +import org.apache.spark.sql.catalyst.json.{CreateJacksonParser, JacksonParser, JsonInferSchema, JSONOptions} +import org.apache.spark.sql.catalyst.json.JsonInferSchema.compatibleType import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.ExternalRDD import org.apache.spark.sql.execution.datasources.DataSource -import org.apache.spark.sql.execution.datasources.json.JsonInferSchema.compatibleType import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ From b42fda8ab3b5f82b33b96fce3f584c50f2ed5a3a Mon Sep 17 00:00:00 2001 From: cclauss Date: Wed, 4 Jul 2018 09:40:58 +0800 Subject: [PATCH 46/79] [SPARK-23698] Remove raw_input() from Python 2 Signed-off-by: cclauss ## What changes were proposed in this pull request? Humans will be able to enter text at Python 3 prompts, which they cannot do today. The Python builtin __raw_input()__ was removed in Python 3 in favor of __input()__. This PR aliases __input()__ to __raw_input()__ on Python 2 so the same call works under both versions. ## How was this patch tested? flake8 testing Author: cclauss Closes #21702 from cclauss/python-fix-raw_input.
--- dev/create-release/releaseutils.py | 5 ++++- dev/merge_spark_pr.py | 21 ++++++++++++--------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index 32f6cbb29f0be..ab812e1bb7c04 100755 --- a/dev/create-release/releaseutils.py +++ b/dev/create-release/releaseutils.py @@ -49,13 +49,16 @@ print("Install using 'sudo pip install unidecode'") sys.exit(-1) +if sys.version < '3': + input = raw_input + # Contributors list file name contributors_file_name = "contributors.txt" # Prompt the user to answer yes or no until they do so def yesOrNoPrompt(msg): - response = raw_input("%s [y/n]: " % msg) + response = input("%s [y/n]: " % msg) while response != "y" and response != "n": return yesOrNoPrompt(msg) return response == "y" diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py index 7f46a1c8f6a7c..79c7c021fe74a 100755 --- a/dev/merge_spark_pr.py +++ b/dev/merge_spark_pr.py @@ -39,6 +39,9 @@ except ImportError: JIRA_IMPORTED = False +if sys.version < '3': + input = raw_input + # Location of your Spark git development area SPARK_HOME = os.environ.get("SPARK_HOME", os.getcwd()) # Remote name which points to the Gihub site @@ -95,7 +98,7 @@ def run_cmd(cmd): def continue_maybe(prompt): - result = raw_input("\n%s (y/n): " % prompt) + result = input("\n%s (y/n): " % prompt) if result.lower() != "y": fail("Okay, exiting") @@ -134,7 +137,7 @@ def merge_pr(pr_num, target_ref, title, body, pr_repo_desc): '--pretty=format:%an <%ae>']).split("\n") distinct_authors = sorted(set(commit_authors), key=lambda x: commit_authors.count(x), reverse=True) - primary_author = raw_input( + primary_author = input( "Enter primary author in the format of \"name \" [%s]: " % distinct_authors[0]) if primary_author == "": @@ -184,7 +187,7 @@ def merge_pr(pr_num, target_ref, title, body, pr_repo_desc): def cherry_pick(pr_num, merge_hash, default_branch): - pick_ref = raw_input("Enter a branch name [%s]: " % default_branch) + pick_ref = input("Enter a branch name [%s]: " % default_branch) if pick_ref == "": pick_ref = default_branch @@ -231,7 +234,7 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""): asf_jira = jira.client.JIRA({'server': JIRA_API_BASE}, basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) - jira_id = raw_input("Enter a JIRA id [%s]: " % default_jira_id) + jira_id = input("Enter a JIRA id [%s]: " % default_jira_id) if jira_id == "": jira_id = default_jira_id @@ -276,7 +279,7 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""): default_fix_versions = filter(lambda x: x != v, default_fix_versions) default_fix_versions = ",".join(default_fix_versions) - fix_versions = raw_input("Enter comma-separated fix version(s) [%s]: " % default_fix_versions) + fix_versions = input("Enter comma-separated fix version(s) [%s]: " % default_fix_versions) if fix_versions == "": fix_versions = default_fix_versions fix_versions = fix_versions.replace(" ", "").split(",") @@ -315,7 +318,7 @@ def choose_jira_assignee(issue, asf_jira): if author in commentors: annotations.append("Commentor") print("[%d] %s (%s)" % (idx, author.displayName, ",".join(annotations))) - raw_assignee = raw_input( + raw_assignee = input( "Enter number of user, or userid, to assign to (blank to leave unassigned):") if raw_assignee == "": return None @@ -428,7 +431,7 @@ def main(): # Assumes branch names can be sorted lexicographically latest_branch = sorted(branch_names, reverse=True)[0] - pr_num = raw_input("Which pull request would you like 
to merge? (e.g. 34): ") + pr_num = input("Which pull request would you like to merge? (e.g. 34): ") pr = get_json("%s/pulls/%s" % (GITHUB_API_BASE, pr_num)) pr_events = get_json("%s/issues/%s/events" % (GITHUB_API_BASE, pr_num)) @@ -440,7 +443,7 @@ def main(): print("I've re-written the title as follows to match the standard format:") print("Original: %s" % pr["title"]) print("Modified: %s" % modified_title) - result = raw_input("Would you like to use the modified title? (y/n): ") + result = input("Would you like to use the modified title? (y/n): ") if result.lower() == "y": title = modified_title print("Using modified title:") @@ -491,7 +494,7 @@ def main(): merge_hash = merge_pr(pr_num, target_ref, title, body, pr_repo_desc) pick_prompt = "Would you like to pick %s into another branch?" % merge_hash - while raw_input("\n%s (y/n): " % pick_prompt).lower() == "y": + while input("\n%s (y/n): " % pick_prompt).lower() == "y": merged_refs = merged_refs + [cherry_pick(pr_num, merge_hash, latest_branch)] if JIRA_IMPORTED: From 5bf95f2a37e624eb6fb0ef6fbd2a40a129d5a470 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 4 Jul 2018 09:53:04 +0800 Subject: [PATCH 47/79] [BUILD] Close stale PRs Closes #20932 Closes #17843 Closes #13477 Closes #14291 Closes #20919 Closes #17907 Closes #18766 Closes #20809 Closes #8849 Closes #21076 Closes #21507 Closes #21336 Closes #21681 Closes #21691 Author: Sean Owen Closes #21708 from srowen/CloseStalePRs. From 7c08eb6d61d55ce45229f3302e6d463e7669183d Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Wed, 4 Jul 2018 12:21:26 +0800 Subject: [PATCH 48/79] [SPARK-24732][SQL] Type coercion between MapTypes. ## What changes were proposed in this pull request? Currently we don't allow type coercion between maps. We can support type coercion between MapTypes where both the key types and the value types are compatible. ## How was this patch tested? Added tests. Author: Takuya UESHIN Closes #21703 from ueshin/issues/SPARK-24732/maptypecoercion. 
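As a rough illustration of what the widened rule allows (my example, not one from the patch), the query below combines two map literals whose key and value types differ but are mutually compatible; with this change the common type should widen to `map<bigint,double>` rather than failing to resolve.

```scala
import org.apache.spark.sql.SparkSession

object MapTypeCoercionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("map coercion demo").getOrCreate()

    // Keys widen int -> bigint and values widen int -> double, so coalesce can
    // now find a single common MapType for its two branches.
    spark.sql("SELECT coalesce(map(1, 2), map(1L, 2.0D)) AS m").printSchema()
    spark.stop()
  }
}
```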
--- .../sql/catalyst/analysis/TypeCoercion.scala | 12 +++++ .../catalyst/analysis/TypeCoercionSuite.scala | 45 ++++++++++++++++++- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 3ebab430ffbcd..cf90e6e555fc8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -179,6 +179,12 @@ object TypeCoercion { .orElse((t1, t2) match { case (ArrayType(et1, containsNull1), ArrayType(et2, containsNull2)) => findWiderTypeForTwo(et1, et2).map(ArrayType(_, containsNull1 || containsNull2)) + case (MapType(kt1, vt1, valueContainsNull1), MapType(kt2, vt2, valueContainsNull2)) => + findWiderTypeForTwo(kt1, kt2).flatMap { kt => + findWiderTypeForTwo(vt1, vt2).map { vt => + MapType(kt, vt, valueContainsNull1 || valueContainsNull2) + } + } case _ => None }) } @@ -220,6 +226,12 @@ object TypeCoercion { case (ArrayType(et1, containsNull1), ArrayType(et2, containsNull2)) => findWiderTypeWithoutStringPromotionForTwo(et1, et2) .map(ArrayType(_, containsNull1 || containsNull2)) + case (MapType(kt1, vt1, valueContainsNull1), MapType(kt2, vt2, valueContainsNull2)) => + findWiderTypeWithoutStringPromotionForTwo(kt1, kt2).flatMap { kt => + findWiderTypeWithoutStringPromotionForTwo(vt1, vt2).map { vt => + MapType(kt, vt, valueContainsNull1 || valueContainsNull2) + } + } case _ => None }) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala index 0acd3b490447d..4e5ca1b8cdd36 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala @@ -54,8 +54,9 @@ class TypeCoercionSuite extends AnalysisTest { // | NullType | ByteType | ShortType | IntegerType | LongType | DoubleType | FloatType | Dec(10, 2) | BinaryType | BooleanType | StringType | DateType | TimestampType | ArrayType | MapType | StructType | NullType | CalendarIntervalType | DecimalType(38, 18) | DoubleType | IntegerType | // | CalendarIntervalType | X | X | X | X | X | X | X | X | X | X | X | X | X | X | X | X | CalendarIntervalType | X | X | X | // +----------------------+----------+-----------+-------------+----------+------------+-----------+------------+------------+-------------+------------+----------+---------------+------------+----------+-------------+----------+----------------------+---------------------+-------------+--------------+ - // Note: MapType*, StructType* are castable only when the internal child types also match; otherwise, not castable. + // Note: StructType* is castable only when the internal child types also match; otherwise, not castable. // Note: ArrayType* is castable when the element type is castable according to the table. + // Note: MapType* is castable when both the key type and the value type are castable according to the table. 
// scalastyle:on line.size.limit private def shouldCast(from: DataType, to: AbstractDataType, expected: DataType): Unit = { @@ -487,12 +488,38 @@ class TypeCoercionSuite extends AnalysisTest { ArrayType(ArrayType(IntegerType), containsNull = false), ArrayType(ArrayType(LongType), containsNull = false), Some(ArrayType(ArrayType(LongType), containsNull = false))) + widenTestWithStringPromotion( + ArrayType(MapType(IntegerType, FloatType), containsNull = false), + ArrayType(MapType(LongType, DoubleType), containsNull = false), + Some(ArrayType(MapType(LongType, DoubleType), containsNull = false))) + + // MapType + widenTestWithStringPromotion( + MapType(ShortType, TimestampType, valueContainsNull = true), + MapType(DoubleType, StringType, valueContainsNull = false), + Some(MapType(DoubleType, StringType, valueContainsNull = true))) + widenTestWithStringPromotion( + MapType(IntegerType, ArrayType(TimestampType), valueContainsNull = false), + MapType(LongType, ArrayType(StringType), valueContainsNull = true), + Some(MapType(LongType, ArrayType(StringType), valueContainsNull = true))) + widenTestWithStringPromotion( + MapType(IntegerType, MapType(ShortType, TimestampType), valueContainsNull = false), + MapType(LongType, MapType(DoubleType, StringType), valueContainsNull = false), + Some(MapType(LongType, MapType(DoubleType, StringType), valueContainsNull = false))) // Without string promotion widenTestWithoutStringPromotion(IntegerType, StringType, None) widenTestWithoutStringPromotion(StringType, TimestampType, None) widenTestWithoutStringPromotion(ArrayType(LongType), ArrayType(StringType), None) widenTestWithoutStringPromotion(ArrayType(StringType), ArrayType(TimestampType), None) + widenTestWithoutStringPromotion( + MapType(LongType, IntegerType), MapType(StringType, IntegerType), None) + widenTestWithoutStringPromotion( + MapType(IntegerType, LongType), MapType(IntegerType, StringType), None) + widenTestWithoutStringPromotion( + MapType(StringType, IntegerType), MapType(TimestampType, IntegerType), None) + widenTestWithoutStringPromotion( + MapType(IntegerType, StringType), MapType(IntegerType, TimestampType), None) // String promotion widenTestWithStringPromotion(IntegerType, StringType, Some(StringType)) @@ -501,6 +528,22 @@ class TypeCoercionSuite extends AnalysisTest { ArrayType(LongType), ArrayType(StringType), Some(ArrayType(StringType))) widenTestWithStringPromotion( ArrayType(StringType), ArrayType(TimestampType), Some(ArrayType(StringType))) + widenTestWithStringPromotion( + MapType(LongType, IntegerType), + MapType(StringType, IntegerType), + Some(MapType(StringType, IntegerType))) + widenTestWithStringPromotion( + MapType(IntegerType, LongType), + MapType(IntegerType, StringType), + Some(MapType(IntegerType, StringType))) + widenTestWithStringPromotion( + MapType(StringType, IntegerType), + MapType(TimestampType, IntegerType), + Some(MapType(StringType, IntegerType))) + widenTestWithStringPromotion( + MapType(IntegerType, StringType), + MapType(IntegerType, TimestampType), + Some(MapType(IntegerType, StringType))) } private def ruleTest(rule: Rule[LogicalPlan], initial: Expression, transformed: Expression) { From 772060d0940a97d89807befd682a70ae82e83ef4 Mon Sep 17 00:00:00 2001 From: Stan Zhai Date: Wed, 4 Jul 2018 10:12:36 +0200 Subject: [PATCH 49/79] [SPARK-24704][WEBUI] Fix the order of stages in the DAG graph ## What changes were proposed in this pull request? 
Before: ![wx20180630-155537](https://user-images.githubusercontent.com/1438757/42123357-2c2e2d84-7c83-11e8-8abd-1c2860f38783.png) After: ![wx20180630-155604](https://user-images.githubusercontent.com/1438757/42123359-32fae990-7c83-11e8-8a7b-cdcee94f9123.png) ## How was this patch tested? Manual tests. Author: Stan Zhai Closes #21680 from stanzhai/fix-dag-graph. --- .../src/main/scala/org/apache/spark/status/AppStatusStore.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala index 688f25a9fdea1..e237281c552b1 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala @@ -471,7 +471,7 @@ private[spark] class AppStatusStore( def operationGraphForJob(jobId: Int): Seq[RDDOperationGraph] = { val job = store.read(classOf[JobDataWrapper], jobId) - val stages = job.info.stageIds + val stages = job.info.stageIds.sorted stages.map { id => val g = store.read(classOf[RDDOperationGraphWrapper], id).toRDDOperationGraph() From b2deef64f604ddd9502a31105ed47cb63470ec85 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Wed, 4 Jul 2018 20:04:18 +0800 Subject: [PATCH 50/79] [SPARK-24727][SQL] Add a static config to control cache size for generated classes ## What changes were proposed in this pull request? Since SPARK-24250 has been resolved, executors correctly references user-defined configurations. So, this pr added a static config to control cache size for generated classes in `CodeGenerator`. ## How was this patch tested? Added tests in `ExecutorSideSQLConfSuite`. Author: Takeshi Yamamuro Closes #21705 from maropu/SPARK-24727. --- .../expressions/codegen/CodeGenerator.scala | 2 +- .../apache/spark/sql/internal/SQLConf.scala | 2 ++ .../spark/sql/internal/StaticSQLConf.scala | 8 +++++ .../internal/ExecutorSideSQLConfSuite.scala | 31 +++++++++++++++---- 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 4cc0968911cb5..838c045d5bcce 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -1415,7 +1415,7 @@ object CodeGenerator extends Logging { * weak keys/values and thus does not respond to memory pressure. 
*/ private val cache = CacheBuilder.newBuilder() - .maximumSize(100) + .maximumSize(SQLConf.get.codegenCacheMaxEntries) .build( new CacheLoader[CodeAndComment, (GeneratedClass, Int)]() { override def load(code: CodeAndComment): (GeneratedClass, Int) = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index e2c48e2d8a14c..50965c1abc68c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1508,6 +1508,8 @@ class SQLConf extends Serializable with Logging { def tableRelationCacheSize: Int = getConf(StaticSQLConf.FILESOURCE_TABLE_RELATION_CACHE_SIZE) + def codegenCacheMaxEntries: Int = getConf(StaticSQLConf.CODEGEN_CACHE_MAX_ENTRIES) + def exchangeReuseEnabled: Boolean = getConf(EXCHANGE_REUSE_ENABLED) def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala index 382ef28f49a7a..384b1917a1f79 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala @@ -66,6 +66,14 @@ object StaticSQLConf { .checkValue(cacheSize => cacheSize >= 0, "The maximum size of the cache must not be negative") .createWithDefault(1000) + val CODEGEN_CACHE_MAX_ENTRIES = buildStaticConf("spark.sql.codegen.cache.maxEntries") + .internal() + .doc("When nonzero, enable caching of generated classes for operators and expressions. " + + "All jobs share the cache that can use up to the specified number for generated classes.") + .intConf + .checkValue(maxEntries => maxEntries >= 0, "The maximum must not be negative") + .createWithDefault(100) + // When enabling the debug, Spark SQL internal table properties are not filtered out; however, // some related DDL commands (e.g., ANALYZE TABLE and CREATE TABLE LIKE) might not work properly. 
val DEBUG_MODE = buildStaticConf("spark.sql.debug") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala index 3dd0712e02448..855fe4f4523f2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.internal import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.test.SQLTestUtils class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils { @@ -40,16 +40,24 @@ class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils { spark = null } + override def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { + pairs.foreach { case (k, v) => + SQLConf.get.setConfString(k, v) + } + try f finally { + pairs.foreach { case (k, _) => + SQLConf.get.unsetConf(k) + } + } + } + test("ReadOnlySQLConf is correctly created at the executor side") { - SQLConf.get.setConfString("spark.sql.x", "a") - try { - val checks = spark.range(10).mapPartitions { it => + withSQLConf("spark.sql.x" -> "a") { + val checks = spark.range(10).mapPartitions { _ => val conf = SQLConf.get Iterator(conf.isInstanceOf[ReadOnlySQLConf] && conf.getConfString("spark.sql.x") == "a") }.collect() assert(checks.forall(_ == true)) - } finally { - SQLConf.get.unsetConf("spark.sql.x") } } @@ -63,4 +71,15 @@ class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils { } } } + + test("SPARK-24727 CODEGEN_CACHE_MAX_ENTRIES is correctly referenced at the executor side") { + withSQLConf(StaticSQLConf.CODEGEN_CACHE_MAX_ENTRIES.key -> "300") { + val checks = spark.range(10).mapPartitions { _ => + val conf = SQLConf.get + Iterator(conf.isInstanceOf[ReadOnlySQLConf] && + conf.getConfString(StaticSQLConf.CODEGEN_CACHE_MAX_ENTRIES.key) == "300") + }.collect() + assert(checks.forall(_ == true)) + } + } } From 021145f36432b386cce30450c888a85393d5169f Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Wed, 4 Jul 2018 20:15:40 +0800 Subject: [PATCH 51/79] [SPARK-24716][SQL] Refactor ParquetFilters ## What changes were proposed in this pull request? Replace DataFrame schema to Parquet file schema when create `ParquetFilters`. Thus we can easily implement `Decimal` and `Timestamp` push down. 
some thing like this: ```scala // DecimalType: 32BitDecimalType case ParquetSchemaType(DECIMAL, INT32, decimal) if pushDownDecimal => (n: String, v: Any) => FilterApi.eq( intColumn(n), Option(v).map(_.asInstanceOf[JBigDecimal].unscaledValue().intValue() .asInstanceOf[Integer]).orNull) // DecimalType: 64BitDecimalType case ParquetSchemaType(DECIMAL, INT64, decimal) if pushDownDecimal => (n: String, v: Any) => FilterApi.eq( longColumn(n), Option(v).map(_.asInstanceOf[JBigDecimal].unscaledValue().longValue() .asInstanceOf[java.lang.Long]).orNull) // DecimalType: LegacyParquetFormat 32BitDecimalType & 64BitDecimalType case ParquetSchemaType(DECIMAL, FIXED_LEN_BYTE_ARRAY, decimal) if pushDownDecimal && decimal.getPrecision <= Decimal.MAX_LONG_DIGITS => (n: String, v: Any) => FilterApi.eq( binaryColumn(n), Option(v).map(d => decimalToBinaryUsingUnscaledLong(decimal.getPrecision, d.asInstanceOf[JBigDecimal])).orNull) // DecimalType: ByteArrayDecimalType case ParquetSchemaType(DECIMAL, FIXED_LEN_BYTE_ARRAY, decimal) if pushDownDecimal && decimal.getPrecision > Decimal.MAX_LONG_DIGITS => (n: String, v: Any) => FilterApi.eq( binaryColumn(n), Option(v).map(d => decimalToBinaryUsingUnscaledBytes(decimal.getPrecision, d.asInstanceOf[JBigDecimal])).orNull) ``` ```scala // INT96 doesn't support pushdown case ParquetSchemaType(TIMESTAMP_MICROS, INT64, null) => (n: String, v: Any) => FilterApi.eq( longColumn(n), Option(v).map(t => DateTimeUtils.fromJavaTimestamp(t.asInstanceOf[Timestamp]) .asInstanceOf[java.lang.Long]).orNull) case ParquetSchemaType(TIMESTAMP_MILLIS, INT64, null) => (n: String, v: Any) => FilterApi.eq( longColumn(n), Option(v).map(_.asInstanceOf[Timestamp].getTime.asInstanceOf[java.lang.Long]).orNull) ``` ## How was this patch tested? unit tests Author: Yuming Wang Closes #21696 from wangyum/SPARK-24716. --- .../parquet/ParquetFileFormat.scala | 34 ++-- .../datasources/parquet/ParquetFilters.scala | 173 ++++++++++-------- .../apache/spark/sql/sources/filters.scala | 2 +- .../parquet/ParquetFilterSuite.scala | 13 +- 4 files changed, 121 insertions(+), 101 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 93de1faef527a..52a18abb55241 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -353,25 +353,13 @@ class ParquetFileFormat (file: PartitionedFile) => { assert(file.partitionValues.numFields == partitionSchema.size) - // Try to push down filters when filter push-down is enabled. - val pushed = if (enableParquetFilterPushDown) { - filters - // Collects all converted Parquet filter predicates. Notice that not all predicates can be - // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap` - // is used here. 
- .flatMap(new ParquetFilters(pushDownDate, pushDownStringStartWith) - .createFilter(requiredSchema, _)) - .reduceOption(FilterApi.and) - } else { - None - } - val fileSplit = new FileSplit(new Path(new URI(file.filePath)), file.start, file.length, Array.empty) + val filePath = fileSplit.getPath val split = new org.apache.parquet.hadoop.ParquetInputSplit( - fileSplit.getPath, + filePath, fileSplit.getStart, fileSplit.getStart + fileSplit.getLength, fileSplit.getLength, @@ -379,12 +367,28 @@ class ParquetFileFormat null) val sharedConf = broadcastedHadoopConf.value.value + + // Try to push down filters when filter push-down is enabled. + val pushed = if (enableParquetFilterPushDown) { + val parquetSchema = ParquetFileReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS) + .getFileMetaData.getSchema + filters + // Collects all converted Parquet filter predicates. Notice that not all predicates can be + // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap` + // is used here. + .flatMap(new ParquetFilters(pushDownDate, pushDownStringStartWith) + .createFilter(parquetSchema, _)) + .reduceOption(FilterApi.and) + } else { + None + } + // PARQUET_INT96_TIMESTAMP_CONVERSION says to apply timezone conversions to int96 timestamps' // *only* if the file was created by something other than "parquet-mr", so check the actual // writer here for this file. We have to do this per-file, as each file in the table may // have different writers. def isCreatedByParquetMr(): Boolean = { - val footer = ParquetFileReader.readFooter(sharedConf, fileSplit.getPath, SKIP_ROW_GROUPS) + val footer = ParquetFileReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS) footer.getFileMetaData().getCreatedBy().startsWith("parquet-mr") } val convertTz = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index 21c9e2e4f82b4..4827f706e6016 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -19,15 +19,19 @@ package org.apache.spark.sql.execution.datasources.parquet import java.sql.Date +import scala.collection.JavaConverters.asScalaBufferConverter + import org.apache.parquet.filter2.predicate._ import org.apache.parquet.filter2.predicate.FilterApi._ import org.apache.parquet.io.api.Binary -import org.apache.parquet.schema.PrimitiveComparator +import org.apache.parquet.schema.{DecimalMetadata, MessageType, OriginalType, PrimitiveComparator, PrimitiveType} +import org.apache.parquet.schema.OriginalType._ +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.catalyst.util.DateTimeUtils.SQLDate import org.apache.spark.sql.sources -import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String /** @@ -35,171 +39,180 @@ import org.apache.spark.unsafe.types.UTF8String */ private[parquet] class ParquetFilters(pushDownDate: Boolean, pushDownStartWith: Boolean) { + private case class ParquetSchemaType( + originalType: OriginalType, + primitiveTypeName: PrimitiveTypeName, + decimalMetadata: DecimalMetadata) + + private val ParquetBooleanType = ParquetSchemaType(null, BOOLEAN, null) + private val 
ParquetIntegerType = ParquetSchemaType(null, INT32, null) + private val ParquetLongType = ParquetSchemaType(null, INT64, null) + private val ParquetFloatType = ParquetSchemaType(null, FLOAT, null) + private val ParquetDoubleType = ParquetSchemaType(null, DOUBLE, null) + private val ParquetStringType = ParquetSchemaType(UTF8, BINARY, null) + private val ParquetBinaryType = ParquetSchemaType(null, BINARY, null) + private val ParquetDateType = ParquetSchemaType(DATE, INT32, null) + private def dateToDays(date: Date): SQLDate = { DateTimeUtils.fromJavaDate(date) } - private val makeEq: PartialFunction[DataType, (String, Any) => FilterPredicate] = { - case BooleanType => + private val makeEq: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { + case ParquetBooleanType => (n: String, v: Any) => FilterApi.eq(booleanColumn(n), v.asInstanceOf[java.lang.Boolean]) - case IntegerType => + case ParquetIntegerType => (n: String, v: Any) => FilterApi.eq(intColumn(n), v.asInstanceOf[Integer]) - case LongType => + case ParquetLongType => (n: String, v: Any) => FilterApi.eq(longColumn(n), v.asInstanceOf[java.lang.Long]) - case FloatType => + case ParquetFloatType => (n: String, v: Any) => FilterApi.eq(floatColumn(n), v.asInstanceOf[java.lang.Float]) - case DoubleType => + case ParquetDoubleType => (n: String, v: Any) => FilterApi.eq(doubleColumn(n), v.asInstanceOf[java.lang.Double]) // Binary.fromString and Binary.fromByteArray don't accept null values - case StringType => + case ParquetStringType => (n: String, v: Any) => FilterApi.eq( binaryColumn(n), Option(v).map(s => Binary.fromString(s.asInstanceOf[String])).orNull) - case BinaryType => + case ParquetBinaryType => (n: String, v: Any) => FilterApi.eq( binaryColumn(n), Option(v).map(b => Binary.fromReusedByteArray(v.asInstanceOf[Array[Byte]])).orNull) - case DateType if pushDownDate => + case ParquetDateType if pushDownDate => (n: String, v: Any) => FilterApi.eq( intColumn(n), Option(v).map(date => dateToDays(date.asInstanceOf[Date]).asInstanceOf[Integer]).orNull) } - private val makeNotEq: PartialFunction[DataType, (String, Any) => FilterPredicate] = { - case BooleanType => + private val makeNotEq: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { + case ParquetBooleanType => (n: String, v: Any) => FilterApi.notEq(booleanColumn(n), v.asInstanceOf[java.lang.Boolean]) - case IntegerType => + case ParquetIntegerType => (n: String, v: Any) => FilterApi.notEq(intColumn(n), v.asInstanceOf[Integer]) - case LongType => + case ParquetLongType => (n: String, v: Any) => FilterApi.notEq(longColumn(n), v.asInstanceOf[java.lang.Long]) - case FloatType => + case ParquetFloatType => (n: String, v: Any) => FilterApi.notEq(floatColumn(n), v.asInstanceOf[java.lang.Float]) - case DoubleType => + case ParquetDoubleType => (n: String, v: Any) => FilterApi.notEq(doubleColumn(n), v.asInstanceOf[java.lang.Double]) - case StringType => + case ParquetStringType => (n: String, v: Any) => FilterApi.notEq( binaryColumn(n), Option(v).map(s => Binary.fromString(s.asInstanceOf[String])).orNull) - case BinaryType => + case ParquetBinaryType => (n: String, v: Any) => FilterApi.notEq( binaryColumn(n), Option(v).map(b => Binary.fromReusedByteArray(v.asInstanceOf[Array[Byte]])).orNull) - case DateType if pushDownDate => + case ParquetDateType if pushDownDate => (n: String, v: Any) => FilterApi.notEq( intColumn(n), Option(v).map(date => dateToDays(date.asInstanceOf[Date]).asInstanceOf[Integer]).orNull) } - private val makeLt: 
PartialFunction[DataType, (String, Any) => FilterPredicate] = { - case IntegerType => + private val makeLt: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { + case ParquetIntegerType => (n: String, v: Any) => FilterApi.lt(intColumn(n), v.asInstanceOf[Integer]) - case LongType => + case ParquetLongType => (n: String, v: Any) => FilterApi.lt(longColumn(n), v.asInstanceOf[java.lang.Long]) - case FloatType => + case ParquetFloatType => (n: String, v: Any) => FilterApi.lt(floatColumn(n), v.asInstanceOf[java.lang.Float]) - case DoubleType => + case ParquetDoubleType => (n: String, v: Any) => FilterApi.lt(doubleColumn(n), v.asInstanceOf[java.lang.Double]) - case StringType => + case ParquetStringType => (n: String, v: Any) => - FilterApi.lt(binaryColumn(n), - Binary.fromString(v.asInstanceOf[String])) - case BinaryType => + FilterApi.lt(binaryColumn(n), Binary.fromString(v.asInstanceOf[String])) + case ParquetBinaryType => (n: String, v: Any) => FilterApi.lt(binaryColumn(n), Binary.fromReusedByteArray(v.asInstanceOf[Array[Byte]])) - case DateType if pushDownDate => - (n: String, v: Any) => FilterApi.lt( - intColumn(n), - Option(v).map(date => dateToDays(date.asInstanceOf[Date]).asInstanceOf[Integer]).orNull) + case ParquetDateType if pushDownDate => + (n: String, v: Any) => + FilterApi.lt(intColumn(n), dateToDays(v.asInstanceOf[Date]).asInstanceOf[Integer]) } - private val makeLtEq: PartialFunction[DataType, (String, Any) => FilterPredicate] = { - case IntegerType => - (n: String, v: Any) => FilterApi.ltEq(intColumn(n), v.asInstanceOf[java.lang.Integer]) - case LongType => + private val makeLtEq: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { + case ParquetIntegerType => + (n: String, v: Any) => FilterApi.ltEq(intColumn(n), v.asInstanceOf[Integer]) + case ParquetLongType => (n: String, v: Any) => FilterApi.ltEq(longColumn(n), v.asInstanceOf[java.lang.Long]) - case FloatType => + case ParquetFloatType => (n: String, v: Any) => FilterApi.ltEq(floatColumn(n), v.asInstanceOf[java.lang.Float]) - case DoubleType => + case ParquetDoubleType => (n: String, v: Any) => FilterApi.ltEq(doubleColumn(n), v.asInstanceOf[java.lang.Double]) - case StringType => + case ParquetStringType => (n: String, v: Any) => - FilterApi.ltEq(binaryColumn(n), - Binary.fromString(v.asInstanceOf[String])) - case BinaryType => + FilterApi.ltEq(binaryColumn(n), Binary.fromString(v.asInstanceOf[String])) + case ParquetBinaryType => (n: String, v: Any) => FilterApi.ltEq(binaryColumn(n), Binary.fromReusedByteArray(v.asInstanceOf[Array[Byte]])) - case DateType if pushDownDate => - (n: String, v: Any) => FilterApi.ltEq( - intColumn(n), - Option(v).map(date => dateToDays(date.asInstanceOf[Date]).asInstanceOf[Integer]).orNull) + case ParquetDateType if pushDownDate => + (n: String, v: Any) => + FilterApi.ltEq(intColumn(n), dateToDays(v.asInstanceOf[Date]).asInstanceOf[Integer]) } - private val makeGt: PartialFunction[DataType, (String, Any) => FilterPredicate] = { - case IntegerType => - (n: String, v: Any) => FilterApi.gt(intColumn(n), v.asInstanceOf[java.lang.Integer]) - case LongType => + private val makeGt: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { + case ParquetIntegerType => + (n: String, v: Any) => FilterApi.gt(intColumn(n), v.asInstanceOf[Integer]) + case ParquetLongType => (n: String, v: Any) => FilterApi.gt(longColumn(n), v.asInstanceOf[java.lang.Long]) - case FloatType => + case ParquetFloatType => (n: String, v: Any) => 
FilterApi.gt(floatColumn(n), v.asInstanceOf[java.lang.Float]) - case DoubleType => + case ParquetDoubleType => (n: String, v: Any) => FilterApi.gt(doubleColumn(n), v.asInstanceOf[java.lang.Double]) - case StringType => + case ParquetStringType => (n: String, v: Any) => - FilterApi.gt(binaryColumn(n), - Binary.fromString(v.asInstanceOf[String])) - case BinaryType => + FilterApi.gt(binaryColumn(n), Binary.fromString(v.asInstanceOf[String])) + case ParquetBinaryType => (n: String, v: Any) => FilterApi.gt(binaryColumn(n), Binary.fromReusedByteArray(v.asInstanceOf[Array[Byte]])) - case DateType if pushDownDate => - (n: String, v: Any) => FilterApi.gt( - intColumn(n), - Option(v).map(date => dateToDays(date.asInstanceOf[Date]).asInstanceOf[Integer]).orNull) + case ParquetDateType if pushDownDate => + (n: String, v: Any) => + FilterApi.gt(intColumn(n), dateToDays(v.asInstanceOf[Date]).asInstanceOf[Integer]) } - private val makeGtEq: PartialFunction[DataType, (String, Any) => FilterPredicate] = { - case IntegerType => - (n: String, v: Any) => FilterApi.gtEq(intColumn(n), v.asInstanceOf[java.lang.Integer]) - case LongType => + private val makeGtEq: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { + case ParquetIntegerType => + (n: String, v: Any) => FilterApi.gtEq(intColumn(n), v.asInstanceOf[Integer]) + case ParquetLongType => (n: String, v: Any) => FilterApi.gtEq(longColumn(n), v.asInstanceOf[java.lang.Long]) - case FloatType => + case ParquetFloatType => (n: String, v: Any) => FilterApi.gtEq(floatColumn(n), v.asInstanceOf[java.lang.Float]) - case DoubleType => + case ParquetDoubleType => (n: String, v: Any) => FilterApi.gtEq(doubleColumn(n), v.asInstanceOf[java.lang.Double]) - case StringType => + case ParquetStringType => (n: String, v: Any) => - FilterApi.gtEq(binaryColumn(n), - Binary.fromString(v.asInstanceOf[String])) - case BinaryType => + FilterApi.gtEq(binaryColumn(n), Binary.fromString(v.asInstanceOf[String])) + case ParquetBinaryType => (n: String, v: Any) => FilterApi.gtEq(binaryColumn(n), Binary.fromReusedByteArray(v.asInstanceOf[Array[Byte]])) - case DateType if pushDownDate => - (n: String, v: Any) => FilterApi.gtEq( - intColumn(n), - Option(v).map(date => dateToDays(date.asInstanceOf[Date]).asInstanceOf[Integer]).orNull) + case ParquetDateType if pushDownDate => + (n: String, v: Any) => + FilterApi.gtEq(intColumn(n), dateToDays(v.asInstanceOf[Date]).asInstanceOf[Integer]) } /** * Returns a map from name of the column to the data type, if predicate push down applies. */ - private def getFieldMap(dataType: DataType): Map[String, DataType] = dataType match { - case StructType(fields) => + private def getFieldMap(dataType: MessageType): Map[String, ParquetSchemaType] = dataType match { + case m: MessageType => // Here we don't flatten the fields in the nested schema but just look up through // root fields. Currently, accessing to nested fields does not push down filters // and it does not support to create filters for them. - fields.map(f => f.name -> f.dataType).toMap - case _ => Map.empty[String, DataType] + m.getFields.asScala.filter(_.isPrimitive).map(_.asPrimitiveType()).map { f => + f.getName -> ParquetSchemaType( + f.getOriginalType, f.getPrimitiveTypeName, f.getDecimalMetadata) + }.toMap + case _ => Map.empty[String, ParquetSchemaType] } /** * Converts data sources filters to Parquet filter predicates. 
*/ - def createFilter(schema: StructType, predicate: sources.Filter): Option[FilterPredicate] = { + def createFilter(schema: MessageType, predicate: sources.Filter): Option[FilterPredicate] = { val nameToType = getFieldMap(schema) // Parquet does not allow dots in the column name because dots are used as a column path diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala index 2499e9b604f3e..bdd8c4da6bd30 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala @@ -199,7 +199,7 @@ case class StringStartsWith(attribute: String, value: String) extends Filter { /** * A filter that evaluates to `true` iff the attribute evaluates to - * a string that starts with `value`. + * a string that ends with `value`. * * @since 1.3.1 */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index d9ae5858e5ed0..8b96c841c8c6e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -103,7 +103,8 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex assert(selectedFilters.nonEmpty, "No filter is pushed down") selectedFilters.foreach { pred => - val maybeFilter = parquetFilters.createFilter(df.schema, pred) + val maybeFilter = parquetFilters.createFilter( + new SparkToParquetSchemaConverter(conf).convert(df.schema), pred) assert(maybeFilter.isDefined, s"Couldn't generate filter predicate for $pred") // Doesn't bother checking type parameters here (e.g. `Eq[Integer]`) maybeFilter.exists(_.getClass === filterClass) @@ -542,12 +543,14 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex StructField("c", DoubleType, nullable = true) )) + val parquetSchema = new SparkToParquetSchemaConverter(conf).convert(schema) + assertResult(Some(and( lt(intColumn("a"), 10: Integer), gt(doubleColumn("c"), 1.5: java.lang.Double))) ) { parquetFilters.createFilter( - schema, + parquetSchema, sources.And( sources.LessThan("a", 10), sources.GreaterThan("c", 1.5D))) @@ -555,7 +558,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex assertResult(None) { parquetFilters.createFilter( - schema, + parquetSchema, sources.And( sources.LessThan("a", 10), sources.StringContains("b", "prefix"))) @@ -563,7 +566,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex assertResult(None) { parquetFilters.createFilter( - schema, + parquetSchema, sources.Not( sources.And( sources.GreaterThan("a", 1), @@ -729,7 +732,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex assertResult(None) { parquetFilters.createFilter( - df.schema, + new SparkToParquetSchemaConverter(conf).convert(df.schema), sources.StringStartsWith("_1", null)) } } From 1a2655a9e75627b584787f9e4c6cdaa92e61fa3f Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 4 Jul 2018 20:42:08 +0800 Subject: [PATCH 52/79] [SPARK-24635][SQL] Remove Blocks class from JavaCode class hierarchy ## What changes were proposed in this pull request? The `Blocks` class in `JavaCode` class hierarchy is not necessary. 
Its function can be taken by `CodeBlock`. We should remove it to make simpler class hierarchy. ## How was this patch tested? Existing tests. Author: Liang-Chi Hsieh Closes #21619 from viirya/SPARK-24635. --- .../expressions/codegen/javaCode.scala | 40 +++++++------------ 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala index 250ce48d059e0..44f63e21e93bb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala @@ -119,6 +119,7 @@ object JavaCode { * A trait representing a block of java code. */ trait Block extends JavaCode { + import Block._ // The expressions to be evaluated inside this block. def exprValues: Set[ExprValue] @@ -148,14 +149,17 @@ trait Block extends JavaCode { } // Concatenates this block with other block. - def + (other: Block): Block + def + (other: Block): Block = other match { + case EmptyBlock => this + case _ => code"$this\n$other" + } } object Block { val CODE_BLOCK_BUFFER_LENGTH: Int = 512 - implicit def blocksToBlock(blocks: Seq[Block]): Block = Blocks(blocks) + implicit def blocksToBlock(blocks: Seq[Block]): Block = blocks.reduceLeft(_ + _) implicit class BlockHelper(val sc: StringContext) extends AnyVal { def code(args: Any*): Block = { @@ -190,18 +194,17 @@ object Block { while (strings.hasNext) { val input = inputs.next input match { - case _: ExprValue | _: Block => + case _: ExprValue | _: CodeBlock => codeParts += buf.toString buf.clear blockInputs += input.asInstanceOf[JavaCode] + case EmptyBlock => case _ => buf.append(input) } buf.append(strings.next) } - if (buf.nonEmpty) { - codeParts += buf.toString - } + codeParts += buf.toString (codeParts.toSeq, blockInputs.toSeq) } @@ -209,7 +212,11 @@ object Block { /** * A block of java code. Including a sequence of code parts and some inputs to this block. - * The actual java code is generated by embedding the inputs into the code parts. + * The actual java code is generated by embedding the inputs into the code parts. Here we keep + * inputs of `JavaCode` instead of simply folding them as a string of code, because we need to + * track expressions (`ExprValue`) in this code block. We need to be able to manipulate the + * expressions later without changing the behavior of this code block in some applications, e.g., + * method splitting. 
*/ case class CodeBlock(codeParts: Seq[String], blockInputs: Seq[JavaCode]) extends Block { override lazy val exprValues: Set[ExprValue] = { @@ -230,30 +237,11 @@ case class CodeBlock(codeParts: Seq[String], blockInputs: Seq[JavaCode]) extends } buf.toString } - - override def + (other: Block): Block = other match { - case c: CodeBlock => Blocks(Seq(this, c)) - case b: Blocks => Blocks(Seq(this) ++ b.blocks) - case EmptyBlock => this - } -} - -case class Blocks(blocks: Seq[Block]) extends Block { - override lazy val exprValues: Set[ExprValue] = blocks.flatMap(_.exprValues).toSet - override lazy val code: String = blocks.map(_.toString).mkString("\n") - - override def + (other: Block): Block = other match { - case c: CodeBlock => Blocks(blocks :+ c) - case b: Blocks => Blocks(blocks ++ b.blocks) - case EmptyBlock => this - } } object EmptyBlock extends Block with Serializable { override val code: String = "" override val exprValues: Set[ExprValue] = Set.empty - - override def + (other: Block): Block = other } /** From ca8243f30fc6939ee099a9534e3b811d5c64d2cf Mon Sep 17 00:00:00 2001 From: Shahid Date: Wed, 4 Jul 2018 09:56:24 -0500 Subject: [PATCH 53/79] [MINOR][ML] Minor correction in the powerIterationSuite ## What changes were proposed in this pull request? Currently the power iteration clustering test in spark ml, maps the results to the labels 0 and 1 for assertion. Since the clustering outputs need not be the same as the mapped labels, it may cause failure in the test case. Even if it correctly maps, theoretically we cannot guarantee which set belongs to which cluster label. KMeans can assign label 0 to either of the set. PowerIterationClusteringSuite in the MLLib checks the clustering results without mapping to the particular cluster label, as shown below. `` val predictions = Array.fill(2)(mutable.Set.empty[Long]) model.assignments.collect().foreach { a => predictions(a.cluster) += a.id } assert(predictions.toSet == Set((0 until n1).toSet, (n1 until n).toSet)) `` ## How was this patch tested? Existing tests Author: Shahid Closes #21689 from shahidki31/picTestSuiteMinorCorrection. 
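As a small illustration of why the set-based comparison is robust (the ids and labels below are purely hypothetical), the assertion holds no matter which group the algorithm happens to label 0:

```scala
import scala.collection.mutable

// (id, cluster) pairs; swapping the 0/1 labels would produce the same set of sets.
val assignments = Seq((0L, 1), (1L, 1), (2L, 0))
val predictions = Array.fill(2)(mutable.Set.empty[Long])
assignments.foreach { case (id, cluster) => predictions(cluster) += id }
assert(predictions.toSet == Set(Set(0L, 1L), Set(2L)))
```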
--- .../PowerIterationClusteringSuite.scala | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/PowerIterationClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/PowerIterationClusteringSuite.scala index b7072728d48f0..55b460f1a4524 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/PowerIterationClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/PowerIterationClusteringSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.ml.clustering +import scala.collection.mutable + import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext @@ -76,12 +78,15 @@ class PowerIterationClusteringSuite extends SparkFunSuite .setMaxIter(40) .setWeightCol("weight") .assignClusters(data) - val localAssignments = assignments - .select('id, 'cluster) - .as[(Long, Int)].collect().toSet - val expectedResult = (0 until n1).map(x => (x, 1)).toSet ++ - (n1 until n).map(x => (x, 0)).toSet - assert(localAssignments === expectedResult) + .select("id", "cluster") + .as[(Long, Int)] + .collect() + + val predictions = Array.fill(2)(mutable.Set.empty[Long]) + assignments.foreach { + case (id, cluster) => predictions(cluster) += id + } + assert(predictions.toSet === Set((0 until n1).toSet, (n1 until n).toSet)) val assignments2 = new PowerIterationClustering() .setK(2) @@ -89,10 +94,15 @@ class PowerIterationClusteringSuite extends SparkFunSuite .setInitMode("degree") .setWeightCol("weight") .assignClusters(data) - val localAssignments2 = assignments2 - .select('id, 'cluster) - .as[(Long, Int)].collect().toSet - assert(localAssignments2 === expectedResult) + .select("id", "cluster") + .as[(Long, Int)] + .collect() + + val predictions2 = Array.fill(2)(mutable.Set.empty[Long]) + assignments2.foreach { + case (id, cluster) => predictions2(cluster) += id + } + assert(predictions2.toSet === Set((0 until n1).toSet, (n1 until n).toSet)) } test("supported input types") { From bf764a33bef617aa9bae535a5ea73d6a3e278d42 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 4 Jul 2018 18:36:09 -0700 Subject: [PATCH 54/79] [SPARK-22384][SQL][FOLLOWUP] Refine partition pruning when attribute is wrapped in Cast ## What changes were proposed in this pull request? As mentioned in https://github.com/apache/spark/pull/21586 , `Cast.mayTruncate` is not 100% safe, string to boolean is allowed. Since changing `Cast.mayTruncate` also changes the behavior of Dataset, here I propose to add a new `Cast.canSafeCast` for partition pruning. ## How was this patch tested? new test cases Author: Wenchen Fan Closes #21712 from cloud-fan/safeCast. 
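To make the intended behaviour concrete, a few illustrative expectations for the new predicate are listed below (these are not taken from the patch's test suite); only widening casts are treated as safe:

```scala
import org.apache.spark.sql.catalyst.expressions.Cast
import org.apache.spark.sql.types._

assert(Cast.canSafeCast(IntegerType, LongType))     // widening, no truncation or precision loss
assert(Cast.canSafeCast(DateType, TimestampType))
assert(Cast.canSafeCast(IntegerType, StringType))   // anything can be rendered as a string
assert(!Cast.canSafeCast(LongType, IntegerType))    // may truncate
assert(!Cast.canSafeCast(StringType, BooleanType))  // the case Cast.mayTruncate let through
```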
--- .../spark/sql/catalyst/expressions/Cast.scala | 20 +++++++++++++++++++ .../spark/sql/hive/client/HiveShim.scala | 5 +++-- .../sql/hive/client/HiveClientSuite.scala | 20 +++++++++++++++++-- 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 699ea53b5df0f..7971ae602bd37 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -134,6 +134,26 @@ object Cast { toPrecedence > 0 && fromPrecedence > toPrecedence } + /** + * Returns true iff we can safely cast the `from` type to `to` type without any truncating or + * precision lose, e.g. int -> long, date -> timestamp. + */ + def canSafeCast(from: AtomicType, to: AtomicType): Boolean = (from, to) match { + case _ if from == to => true + case (from: NumericType, to: DecimalType) if to.isWiderThan(from) => true + case (from: DecimalType, to: NumericType) if from.isTighterThan(to) => true + case (from, to) if legalNumericPrecedence(from, to) => true + case (DateType, TimestampType) => true + case (_, StringType) => true + case _ => false + } + + private def legalNumericPrecedence(from: DataType, to: DataType): Boolean = { + val fromPrecedence = TypeCoercion.numericPrecedence.indexOf(from) + val toPrecedence = TypeCoercion.numericPrecedence.indexOf(to) + fromPrecedence >= 0 && fromPrecedence < toPrecedence + } + def forceNullable(from: DataType, to: DataType): Boolean = (from, to) match { case (NullType, _) => true case (_, _) if from == to => false diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 8620f3f6d99fb..933384ed43e98 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -45,7 +45,7 @@ import org.apache.spark.sql.catalyst.analysis.NoSuchPermanentFunctionException import org.apache.spark.sql.catalyst.catalog.{CatalogFunction, CatalogTablePartition, CatalogUtils, FunctionResource, FunctionResourceType} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{IntegralType, StringType} +import org.apache.spark.sql.types.{AtomicType, IntegralType, StringType} import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils @@ -660,7 +660,8 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { def unapply(expr: Expression): Option[Attribute] = { expr match { case attr: Attribute => Some(attr) - case Cast(child, dt, _) if !Cast.mayTruncate(child.dataType, dt) => unapply(child) + case Cast(child @ AtomicType(), dt: AtomicType, _) + if Cast.canSafeCast(child.dataType.asInstanceOf[AtomicType], dt) => unapply(child) case _ => None } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala index 55275f6b37945..fa9f753795f65 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala @@ -24,7 +24,7 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql.catalyst.catalog._ import 
org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.types.LongType +import org.apache.spark.sql.types.{BooleanType, IntegerType, LongType} // TODO: Refactor this to `HivePartitionFilteringSuite` class HiveClientSuite(version: String) @@ -122,6 +122,22 @@ class HiveClientSuite(version: String) "aa" :: Nil) } + test("getPartitionsByFilter: cast(chunk as int)=1 (not a valid partition predicate)") { + testMetastorePartitionFiltering( + attr("chunk").cast(IntegerType) === 1, + 20170101 to 20170103, + 0 to 23, + "aa" :: "ab" :: "ba" :: "bb" :: Nil) + } + + test("getPartitionsByFilter: cast(chunk as boolean)=true (not a valid partition predicate)") { + testMetastorePartitionFiltering( + attr("chunk").cast(BooleanType) === true, + 20170101 to 20170103, + 0 to 23, + "aa" :: "ab" :: "ba" :: "bb" :: Nil) + } + test("getPartitionsByFilter: 20170101=ds") { testMetastorePartitionFiltering( Literal(20170101) === attr("ds"), @@ -138,7 +154,7 @@ class HiveClientSuite(version: String) "aa" :: "ab" :: "ba" :: "bb" :: Nil) } - test("getPartitionsByFilter: chunk in cast(ds as long)=20170101L") { + test("getPartitionsByFilter: cast(ds as long)=20170101L and h=10") { testMetastorePartitionFiltering( attr("ds").cast(LongType) === 20170101L && attr("h") === 10, 20170101 to 20170101, From 489a5294d106130beda1509e3cbbaf707a3d703d Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Thu, 5 Jul 2018 09:56:48 +0800 Subject: [PATCH 55/79] [SPARK-17213][SPARK-17213][FOLLOW-UP] Improve the test of ## What changes were proposed in this pull request? This is a minor improvement for the test of SPARK-17213 ## How was this patch tested? N/A Author: Xiao Li Closes #21716 from gatorsmile/testMaster23. --- .../parquet/ParquetFilterSuite.scala | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index 8b96c841c8c6e..f2c0bda256239 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -618,21 +618,25 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex } test("SPARK-17213: Broken Parquet filter push-down for string columns") { - withTempPath { dir => - import testImplicits._ + Seq(true, false).foreach { vectorizedEnabled => + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorizedEnabled.toString) { + withTempPath { dir => + import testImplicits._ - val path = dir.getCanonicalPath - // scalastyle:off nonascii - Seq("a", "é").toDF("name").write.parquet(path) - // scalastyle:on nonascii + val path = dir.getCanonicalPath + // scalastyle:off nonascii + Seq("a", "é").toDF("name").write.parquet(path) + // scalastyle:on nonascii - assert(spark.read.parquet(path).where("name > 'a'").count() == 1) - assert(spark.read.parquet(path).where("name >= 'a'").count() == 2) + assert(spark.read.parquet(path).where("name > 'a'").count() == 1) + assert(spark.read.parquet(path).where("name >= 'a'").count() == 2) - // scalastyle:off nonascii - assert(spark.read.parquet(path).where("name < 'é'").count() == 1) - assert(spark.read.parquet(path).where("name <= 'é'").count() == 2) - // scalastyle:on nonascii + // scalastyle:off nonascii + 
assert(spark.read.parquet(path).where("name < 'é'").count() == 1) + assert(spark.read.parquet(path).where("name <= 'é'").count() == 2) + // scalastyle:on nonascii + } + } } } From f997be0c3136f85762b841469e7dfcde7e699ced Mon Sep 17 00:00:00 2001 From: mcteo Date: Thu, 5 Jul 2018 10:05:41 +0800 Subject: [PATCH 56/79] [SPARK-24698][PYTHON] Fixed typo in pyspark.ml's Identifiable class. ## What changes were proposed in this pull request? Fixed a small typo in the code that caused 20 random characters to be added to the UID, rather than 12. Author: mcteo Closes #21675 from mcteo/SPARK-24698-fix. --- python/pyspark/ml/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py index 080cd299f4fde..e846834761e49 100644 --- a/python/pyspark/ml/util.py +++ b/python/pyspark/ml/util.py @@ -63,7 +63,7 @@ def _randomUID(cls): Generate a unique unicode id for the object. The default implementation concatenates the class name, "_", and 12 random hex chars. """ - return unicode(cls.__name__ + "_" + uuid.uuid4().hex[12:]) + return unicode(cls.__name__ + "_" + uuid.uuid4().hex[-12:]) @inherit_doc From 4be9f0c028cebb0d2975e93a6ebc56337cd2c585 Mon Sep 17 00:00:00 2001 From: Antonio Murgia Date: Thu, 5 Jul 2018 16:10:34 +0800 Subject: [PATCH 57/79] [SPARK-24673][SQL] scala sql function from_utc_timestamp second argument could be Column instead of String ## What changes were proposed in this pull request? Add an overloaded version to `from_utc_timestamp` and `to_utc_timestamp` having second argument as a `Column` instead of `String`. ## How was this patch tested? Unit testing, especially adding two tests to org.apache.spark.sql.DateFunctionsSuite.scala Author: Antonio Murgia Author: Antonio Murgia Closes #21693 from tmnd1991/feature/SPARK-24673. --- .../org/apache/spark/sql/functions.scala | 22 +++++++++++ .../apache/spark/sql/DateFunctionsSuite.scala | 38 ++++++++++++++++++- 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 614f65f0faaba..f2627e69939cd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -2934,6 +2934,17 @@ object functions { FromUTCTimestamp(ts.expr, Literal(tz)) } + /** + * Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in UTC, and renders + * that time as a timestamp in the given time zone. For example, 'GMT+1' would yield + * '2017-07-14 03:40:00.0'. + * @group datetime_funcs + * @since 2.4.0 + */ + def from_utc_timestamp(ts: Column, tz: Column): Column = withExpr { + FromUTCTimestamp(ts.expr, tz.expr) + } + /** * Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in the given time * zone, and renders that time as a timestamp in UTC. For example, 'GMT+1' would yield @@ -2945,6 +2956,17 @@ object functions { ToUTCTimestamp(ts.expr, Literal(tz)) } + /** + * Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in the given time + * zone, and renders that time as a timestamp in UTC. For example, 'GMT+1' would yield + * '2017-07-14 01:40:00.0'. + * @group datetime_funcs + * @since 2.4.0 + */ + def to_utc_timestamp(ts: Column, tz: Column): Column = withExpr { + ToUTCTimestamp(ts.expr, tz.expr) + } + /** * Bucketize rows into one or more time windows given a timestamp specifying column. 
Window * starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala index 237412aa692e5..3af80b36ec42c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala @@ -663,7 +663,7 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext { checkAnswer(df.selectExpr("datediff(a, d)"), Seq(Row(1), Row(1))) } - test("from_utc_timestamp") { + test("from_utc_timestamp with literal zone") { val df = Seq( (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00"), (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00") @@ -680,7 +680,24 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext { Row(Timestamp.valueOf("2015-07-24 17:00:00")))) } - test("to_utc_timestamp") { + test("from_utc_timestamp with column zone") { + val df = Seq( + (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", "CET"), + (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00", "PST") + ).toDF("a", "b", "c") + checkAnswer( + df.select(from_utc_timestamp(col("a"), col("c"))), + Seq( + Row(Timestamp.valueOf("2015-07-24 02:00:00")), + Row(Timestamp.valueOf("2015-07-24 17:00:00")))) + checkAnswer( + df.select(from_utc_timestamp(col("b"), col("c"))), + Seq( + Row(Timestamp.valueOf("2015-07-24 02:00:00")), + Row(Timestamp.valueOf("2015-07-24 17:00:00")))) + } + + test("to_utc_timestamp with literal zone") { val df = Seq( (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00"), (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00") @@ -697,6 +714,23 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext { Row(Timestamp.valueOf("2015-07-25 07:00:00")))) } + test("to_utc_timestamp with column zone") { + val df = Seq( + (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", "PST"), + (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00", "CET") + ).toDF("a", "b", "c") + checkAnswer( + df.select(to_utc_timestamp(col("a"), col("c"))), + Seq( + Row(Timestamp.valueOf("2015-07-24 07:00:00")), + Row(Timestamp.valueOf("2015-07-24 22:00:00")))) + checkAnswer( + df.select(to_utc_timestamp(col("b"), col("c"))), + Seq( + Row(Timestamp.valueOf("2015-07-24 07:00:00")), + Row(Timestamp.valueOf("2015-07-24 22:00:00")))) + } + test("SPARK-23715: to/from_utc_timestamp can retain the previous behavior") { withSQLConf(SQLConf.REJECT_TIMEZONE_IN_STRING.key -> "false") { checkAnswer( From 32cfd3e75a5ca65696fedfa4d49681e6fc3e698d Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 5 Jul 2018 20:48:55 +0800 Subject: [PATCH 58/79] [SPARK-24361][SQL] Polish code block manipulation API ## What changes were proposed in this pull request? Current code block manipulation API is immature and hacky. We need a formal API to manipulate code blocks. The basic idea is making `JavaCode` as `TreeNode`. So we can use familiar `transform` API to manipulate code blocks and expressions in code blocks. For example, we can replace `SimpleExprValue` in a code block like this: ```scala code.transformExprValues { case SimpleExprValue("1 + 1", _) => aliasedParam } ``` The example use case is splitting code to methods. For example, we have an `ExprCode` containing generated code. But it is too long and we need to split it as method. 
Because statement-based expressions can't be directly passed into. We need to transform them as variables first: ```scala def getExprValues(block: Block): Set[ExprValue] = block match { case c: CodeBlock => c.blockInputs.collect { case e: ExprValue => e }.toSet case _ => Set.empty } def currentCodegenInputs(ctx: CodegenContext): Set[ExprValue] = { // Collects current variables in ctx.currentVars and ctx.INPUT_ROW. // It looks roughly like... ctx.currentVars.flatMap { v => getExprValues(v.code) ++ Set(v.value, v.isNull) }.toSet + ctx.INPUT_ROW } // A code block of an expression contains too long code, making it as method if (eval.code.length > 1024) { val setIsNull = if (!eval.isNull.isInstanceOf[LiteralValue]) { ... } else { "" } // Pick up variables and statements necessary to pass in. val currentVars = currentCodegenInputs(ctx) val varsPassIn = getExprValues(eval.code).intersect(currentVars) val aliasedExprs = HashMap.empty[SimpleExprValue, VariableValue] // Replace statement-based expressions which can't be directly passed in the method. val newCode = eval.code.transform { case block => block.transformExprValues { case s: SimpleExprValue(_, javaType) if varsPassIn.contains(s) => if (aliasedExprs.contains(s)) { aliasedExprs(s) } else { val aliasedVariable = JavaCode.variable(ctx.freshName("aliasedVar"), javaType) aliasedExprs += s -> aliasedVariable varsPassIn += aliasedVariable aliasedVariable } } } val params = varsPassIn.filter(!_.isInstanceOf[SimpleExprValue])).map { variable => s"${variable.javaType.getName} ${variable.variableName}" }.mkString(", ") val funcName = ctx.freshName("nodeName") val javaType = CodeGenerator.javaType(dataType) val newValue = JavaCode.variable(ctx.freshName("value"), dataType) val funcFullName = ctx.addNewFunction(funcName, s""" |private $javaType $funcName($params) { | $newCode | $setIsNull | return ${eval.value}; |} """.stripMargin)) eval.value = newValue val args = varsPassIn.filter(!_.isInstanceOf[SimpleExprValue])).map { variable => s"${variable.variableName}" } // Create a code block to assign statements to aliased variables. val createVariables = aliasedExprs.foldLeft(EmptyBlock) { (block, (statement, variable)) => block + code"${statement.javaType.getName} $variable = $statement;" } eval.code = createVariables + code"$javaType $newValue = $funcFullName($args);" } ``` ## How was this patch tested? Added unite tests. Author: Liang-Chi Hsieh Closes #21405 from viirya/codeblock-api. --- .../expressions/codegen/javaCode.scala | 48 +++++++++--- .../expressions/codegen/CodeBlockSuite.scala | 75 +++++++++++++++++-- 2 files changed, 104 insertions(+), 19 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala index 44f63e21e93bb..2f8c853e836ba 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala @@ -22,6 +22,7 @@ import java.lang.{Boolean => JBool} import scala.collection.mutable.ArrayBuffer import scala.language.{existentials, implicitConversions} +import org.apache.spark.sql.catalyst.trees.TreeNode import org.apache.spark.sql.types.{BooleanType, DataType} /** @@ -118,12 +119,9 @@ object JavaCode { /** * A trait representing a block of java code. 
*/ -trait Block extends JavaCode { +trait Block extends TreeNode[Block] with JavaCode { import Block._ - // The expressions to be evaluated inside this block. - def exprValues: Set[ExprValue] - // Returns java code string for this code block. override def toString: String = _marginChar match { case Some(c) => code.stripMargin(c).trim @@ -148,11 +146,41 @@ trait Block extends JavaCode { this } + /** + * Apply a map function to each java expression codes present in this java code, and return a new + * java code based on the mapped java expression codes. + */ + def transformExprValues(f: PartialFunction[ExprValue, ExprValue]): this.type = { + var changed = false + + @inline def transform(e: ExprValue): ExprValue = { + val newE = f lift e + if (!newE.isDefined || newE.get.equals(e)) { + e + } else { + changed = true + newE.get + } + } + + def doTransform(arg: Any): AnyRef = arg match { + case e: ExprValue => transform(e) + case Some(value) => Some(doTransform(value)) + case seq: Traversable[_] => seq.map(doTransform) + case other: AnyRef => other + } + + val newArgs = mapProductIterator(doTransform) + if (changed) makeCopy(newArgs).asInstanceOf[this.type] else this + } + // Concatenates this block with other block. def + (other: Block): Block = other match { case EmptyBlock => this case _ => code"$this\n$other" } + + override def verboseString: String = toString } object Block { @@ -219,12 +247,8 @@ object Block { * method splitting. */ case class CodeBlock(codeParts: Seq[String], blockInputs: Seq[JavaCode]) extends Block { - override lazy val exprValues: Set[ExprValue] = { - blockInputs.flatMap { - case b: Block => b.exprValues - case e: ExprValue => Set(e) - }.toSet - } + override def children: Seq[Block] = + blockInputs.filter(_.isInstanceOf[Block]).asInstanceOf[Seq[Block]] override lazy val code: String = { val strings = codeParts.iterator @@ -239,9 +263,9 @@ case class CodeBlock(codeParts: Seq[String], blockInputs: Seq[JavaCode]) extends } } -object EmptyBlock extends Block with Serializable { +case object EmptyBlock extends Block with Serializable { override val code: String = "" - override val exprValues: Set[ExprValue] = Set.empty + override def children: Seq[Block] = Seq.empty } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeBlockSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeBlockSuite.scala index d2c6420eadb20..55569b6f2933e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeBlockSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeBlockSuite.scala @@ -65,7 +65,9 @@ class CodeBlockSuite extends SparkFunSuite { |boolean $isNull = false; |int $value = -1; """.stripMargin - val exprValues = code.exprValues + val exprValues = code.asInstanceOf[CodeBlock].blockInputs.collect { + case e: ExprValue => e + }.toSet assert(exprValues.size == 2) assert(exprValues === Set(value, isNull)) } @@ -94,7 +96,9 @@ class CodeBlockSuite extends SparkFunSuite { assert(code.toString == expected) - val exprValues = code.exprValues + val exprValues = code.children.flatMap(_.asInstanceOf[CodeBlock].blockInputs.collect { + case e: ExprValue => e + }).toSet assert(exprValues.size == 5) assert(exprValues === Set(isNull1, value1, isNull2, value2, literal)) } @@ -107,7 +111,7 @@ class CodeBlockSuite extends SparkFunSuite { assert(e.getMessage().contains(s"Can not interpolate ${obj.getClass.getName}")) } - test("replace expr 
values in code block") { + test("transform expr in code block") { val expr = JavaCode.expression("1 + 1", IntegerType) val isNull = JavaCode.isNullVariable("expr1_isNull") val exprInFunc = JavaCode.variable("expr1", IntegerType) @@ -120,11 +124,11 @@ class CodeBlockSuite extends SparkFunSuite { |}""".stripMargin val aliasedParam = JavaCode.variable("aliased", expr.javaType) - val aliasedInputs = code.asInstanceOf[CodeBlock].blockInputs.map { - case _: SimpleExprValue => aliasedParam - case other => other + + // We want to replace all occurrences of `expr` with the variable `aliasedParam`. + val aliasedCode = code.transformExprValues { + case SimpleExprValue("1 + 1", java.lang.Integer.TYPE) => aliasedParam } - val aliasedCode = CodeBlock(code.asInstanceOf[CodeBlock].codeParts, aliasedInputs).stripMargin val expected = code""" |callFunc(int $aliasedParam) { @@ -133,4 +137,61 @@ class CodeBlockSuite extends SparkFunSuite { |}""".stripMargin assert(aliasedCode.toString == expected.toString) } + + test ("transform expr in nested blocks") { + val expr = JavaCode.expression("1 + 1", IntegerType) + val isNull = JavaCode.isNullVariable("expr1_isNull") + val exprInFunc = JavaCode.variable("expr1", IntegerType) + + val funcs = Seq("callFunc1", "callFunc2", "callFunc3") + val subBlocks = funcs.map { funcName => + code""" + |$funcName(int $expr) { + | boolean $isNull = false; + | int $exprInFunc = $expr + 1; + |}""".stripMargin + } + + val aliasedParam = JavaCode.variable("aliased", expr.javaType) + + val block = code"${subBlocks(0)}\n${subBlocks(1)}\n${subBlocks(2)}" + val transformedBlock = block.transform { + case b: Block => b.transformExprValues { + case SimpleExprValue("1 + 1", java.lang.Integer.TYPE) => aliasedParam + } + }.asInstanceOf[CodeBlock] + + val expected1 = + code""" + |callFunc1(int aliased) { + | boolean expr1_isNull = false; + | int expr1 = aliased + 1; + |}""".stripMargin + + val expected2 = + code""" + |callFunc2(int aliased) { + | boolean expr1_isNull = false; + | int expr1 = aliased + 1; + |}""".stripMargin + + val expected3 = + code""" + |callFunc3(int aliased) { + | boolean expr1_isNull = false; + | int expr1 = aliased + 1; + |}""".stripMargin + + val exprValues = transformedBlock.children.flatMap { block => + block.asInstanceOf[CodeBlock].blockInputs.collect { + case e: ExprValue => e + } + }.toSet + + assert(transformedBlock.children(0).toString == expected1.toString) + assert(transformedBlock.children(1).toString == expected2.toString) + assert(transformedBlock.children(2).toString == expected3.toString) + assert(transformedBlock.toString == (expected1 + expected2 + expected3).toString) + assert(exprValues === Set(isNull, exprInFunc, aliasedParam)) + } } From e58dadb77ed6cac3e1b2a037a6449e5a6e7f2cec Mon Sep 17 00:00:00 2001 From: Michael Mior Date: Thu, 5 Jul 2018 08:32:20 -0500 Subject: [PATCH 59/79] [SPARK-23820][CORE] Enable use of long form of callsite in logs This adds an option to event logging to include the long form of the callsite instead of the short form. Author: Michael Mior Closes #21433 from michaelmior/long-callsite. 
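The diff below adds a `spark.eventLog.callsite` string config (default `"short"`), which `RDDInfo.fromRdd` consults to pick between the `shortForm` and `longForm` of the RDD creation site. A minimal usage sketch, assuming a local app with event logging enabled (the app name and log directory are placeholders):

```scala
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

// Opt in to long-form callsites in the event log; "short" keeps the old behavior.
val conf = new SparkConf()
  .setAppName("callsite-demo")                     // placeholder app name
  .set("spark.eventLog.enabled", "true")
  .set("spark.eventLog.dir", "/tmp/spark-events")  // placeholder directory
  .set("spark.eventLog.callsite", "long")          // option introduced by this patch

val spark = SparkSession.builder().master("local[2]").config(conf).getOrCreate()
spark.sparkContext.parallelize(1 to 10).count()    // RDDInfo now carries the long callsite
spark.stop()
```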
--- .../org/apache/spark/internal/config/package.scala | 3 +++ .../main/scala/org/apache/spark/storage/RDDInfo.scala | 10 +++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 38a043c85ae33..bda9795a0b925 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -72,6 +72,9 @@ package object config { private[spark] val EVENT_LOG_OVERWRITE = ConfigBuilder("spark.eventLog.overwrite").booleanConf.createWithDefault(false) + private[spark] val EVENT_LOG_CALLSITE_FORM = + ConfigBuilder("spark.eventLog.callsite").stringConf.createWithDefault("short") + private[spark] val EXECUTOR_CLASS_PATH = ConfigBuilder(SparkLauncher.EXECUTOR_EXTRA_CLASSPATH).stringConf.createOptional diff --git a/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala b/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala index e5abbf745cc41..9ccc8f9cc585b 100644 --- a/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala +++ b/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala @@ -17,7 +17,9 @@ package org.apache.spark.storage +import org.apache.spark.SparkEnv import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.internal.config._ import org.apache.spark.rdd.{RDD, RDDOperationScope} import org.apache.spark.util.Utils @@ -53,10 +55,16 @@ class RDDInfo( } private[spark] object RDDInfo { + private val callsiteForm = SparkEnv.get.conf.get(EVENT_LOG_CALLSITE_FORM) + def fromRdd(rdd: RDD[_]): RDDInfo = { val rddName = Option(rdd.name).getOrElse(Utils.getFormattedClassName(rdd)) val parentIds = rdd.dependencies.map(_.rdd.id) + val callSite = callsiteForm match { + case "short" => rdd.creationSite.shortForm + case "long" => rdd.creationSite.longForm + } new RDDInfo(rdd.id, rddName, rdd.partitions.length, - rdd.getStorageLevel, parentIds, rdd.creationSite.shortForm, rdd.scope) + rdd.getStorageLevel, parentIds, callSite, rdd.scope) } } From 7bd6d5412072643f2320fd389f323cfc51368c81 Mon Sep 17 00:00:00 2001 From: Stavros Kontopoulos Date: Thu, 5 Jul 2018 08:38:26 -0500 Subject: [PATCH 60/79] [SPARK-24711][K8S] Fix tags for integration tests ## What changes were proposed in this pull request? - disables maven surfire plugin to allow tags function properly, doc here: http://www.scalatest.org/user_guide/using_the_scalatest_maven_plugin ## How was this patch tested? Manually by adding tags. Author: Stavros Kontopoulos Closes #21697 from skonto/fix-tags. 
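The changes below route `-Dtest.include.tags` / `-Dtest.exclude.tags` (and the new `--include-tags` / `--exclude-tags` script flags) to the scalatest-maven-plugin while disabling surefire for this module. For illustration only, a hypothetical tagged suite of the kind those properties filter on (the tag name `minikube` and the suite are made up for this sketch):

```scala
import org.scalatest.{FunSuite, Tag}

// Hypothetical tag object; tests carrying it can be selected or skipped
// via -Dtest.include.tags=minikube or -Dtest.exclude.tags=minikube.
object MinikubeOnly extends Tag("minikube")

class ExampleIntegrationSuite extends FunSuite {
  test("runs only when the 'minikube' tag is included", MinikubeOnly) {
    assert(1 + 1 == 2)
  }
}
```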
--- pom.xml | 2 +- .../dev/dev-run-integration-tests.sh | 20 +++++++++++++++++++ .../kubernetes/integration-tests/pom.xml | 11 ++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index ca30f9f12b098..cd567e227f331 100644 --- a/pom.xml +++ b/pom.xml @@ -2122,7 +2122,7 @@ org.apache.maven.plugins maven-surefire-plugin - 2.20.1 + 2.22.0 diff --git a/resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh b/resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh index ea893fa39eede..3acd0f5cd3349 100755 --- a/resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh +++ b/resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh @@ -27,6 +27,8 @@ IMAGE_TAG="N/A" SPARK_MASTER= NAMESPACE= SERVICE_ACCOUNT= +INCLUDE_TAGS= +EXCLUDE_TAGS= # Parse arguments while (( "$#" )); do @@ -59,6 +61,14 @@ while (( "$#" )); do SERVICE_ACCOUNT="$2" shift ;; + --include-tags) + INCLUDE_TAGS="$2" + shift + ;; + --exclude-tags) + EXCLUDE_TAGS="$2" + shift + ;; *) break ;; @@ -90,4 +100,14 @@ then properties=( ${properties[@]} -Dspark.kubernetes.test.master=$SPARK_MASTER ) fi +if [ -n $EXCLUDE_TAGS ]; +then + properties=( ${properties[@]} -Dtest.exclude.tags=$EXCLUDE_TAGS ) +fi + +if [ -n $INCLUDE_TAGS ]; +then + properties=( ${properties[@]} -Dtest.include.tags=$INCLUDE_TAGS ) +fi + ../../../build/mvn integration-test ${properties[@]} diff --git a/resource-managers/kubernetes/integration-tests/pom.xml b/resource-managers/kubernetes/integration-tests/pom.xml index 520bda89e034d..6a2fff891098b 100644 --- a/resource-managers/kubernetes/integration-tests/pom.xml +++ b/resource-managers/kubernetes/integration-tests/pom.xml @@ -40,6 +40,7 @@ minikube docker.io/kubespark + jar Spark Project Kubernetes Integration Tests @@ -102,6 +103,15 @@ + + + org.apache.maven.plugins + maven-surefire-plugin + + true + + + @@ -126,6 +136,7 @@ ${spark.kubernetes.test.serviceAccountName} ${test.exclude.tags} + ${test.include.tags} From ac78bcce00ff8ec8e5b7335c2807aa0cd0f5406a Mon Sep 17 00:00:00 2001 From: cluo <0512lc@163.com> Date: Thu, 5 Jul 2018 09:06:25 -0500 Subject: [PATCH 61/79] [SPARK-24743][EXAMPLES] Update the JavaDirectKafkaWordCount example to support the new API of kafka ## What changes were proposed in this pull request? Add some required configs for Kafka consumer in JavaDirectKafkaWordCount class. ## How was this patch tested? Manual tests on Local mode. Author: cluo <0512lc@163.com> Closes #21717 from cluo512/SPARK-24743-update-JavaDirectKafkaWordCount. --- .../streaming/JavaDirectKafkaWordCount.java | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java b/examples/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java index b6b163fa8b2cd..748bf58f30350 100644 --- a/examples/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java +++ b/examples/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java @@ -26,7 +26,9 @@ import scala.Tuple2; +import org.apache.kafka.clients.consumer.ConsumerConfig; import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.common.serialization.StringDeserializer; import org.apache.spark.SparkConf; import org.apache.spark.streaming.api.java.*; @@ -37,30 +39,33 @@ /** * Consumes messages from one or more topics in Kafka and does wordcount. 
- * Usage: JavaDirectKafkaWordCount + * Usage: JavaDirectKafkaWordCount * is a list of one or more Kafka brokers + * is a consumer group name to consume from topics * is a list of one or more kafka topics to consume from * * Example: * $ bin/run-example streaming.JavaDirectKafkaWordCount broker1-host:port,broker2-host:port \ - * topic1,topic2 + * consumer-group topic1,topic2 */ public final class JavaDirectKafkaWordCount { private static final Pattern SPACE = Pattern.compile(" "); public static void main(String[] args) throws Exception { - if (args.length < 2) { - System.err.println("Usage: JavaDirectKafkaWordCount \n" + - " is a list of one or more Kafka brokers\n" + - " is a list of one or more kafka topics to consume from\n\n"); + if (args.length < 3) { + System.err.println("Usage: JavaDirectKafkaWordCount \n" + + " is a list of one or more Kafka brokers\n" + + " is a consumer group name to consume from topics\n" + + " is a list of one or more kafka topics to consume from\n\n"); System.exit(1); } StreamingExamples.setStreamingLogLevels(); String brokers = args[0]; - String topics = args[1]; + String groupId = args[1]; + String topics = args[2]; // Create context with a 2 seconds batch interval SparkConf sparkConf = new SparkConf().setAppName("JavaDirectKafkaWordCount"); @@ -68,7 +73,10 @@ public static void main(String[] args) throws Exception { Set topicsSet = new HashSet<>(Arrays.asList(topics.split(","))); Map kafkaParams = new HashMap<>(); - kafkaParams.put("metadata.broker.list", brokers); + kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers); + kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId); + kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); + kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); // Create direct kafka stream with brokers and topics JavaInputDStream> messages = KafkaUtils.createDirectStream( From 33952cfa8182c1e925083e18c63c6152dcc3c8b4 Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Thu, 5 Jul 2018 09:25:19 -0700 Subject: [PATCH 62/79] [SPARK-24675][SQL] Rename table: validate existence of new location MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? If table is renamed to a existing new location, data won't show up. ``` scala> Seq("hello").toDF("a").write.format("parquet").saveAsTable("t") scala> sql("select * from t").show() +-----+ | a| +-----+ |hello| +-----+ scala> sql("alter table t rename to test") res2: org.apache.spark.sql.DataFrame = [] scala> sql("select * from test").show() +---+ | a| +---+ +---+ ``` The file layout is like ``` $ tree test test ├── gabage └── t ├── _SUCCESS └── part-00000-856b0f10-08f1-42d6-9eb3-7719261f3d5e-c000.snappy.parquet ``` In Hive, if the new location exists, the renaming will fail even the location is empty. We should have the same validation in Catalog, in case of unexpected bugs. ## How was this patch tested? New unit test. Author: Gengliang Wang Closes #21655 from gengliangwang/validate_rename_table. 
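A spark-shell style sketch of the behavior once the validation below is in place, reusing the tables from the description (it assumes something has already created the `test` directory under the warehouse, as in the `tree` output above):

```scala
import org.apache.spark.sql.AnalysisException

Seq("hello").toDF("a").write.format("parquet").saveAsTable("t")

// The default location for `test` already exists, so the rename is now rejected
// by SessionCatalog.validateNewLocationOfRename instead of silently losing data.
try {
  spark.sql("ALTER TABLE t RENAME TO test")
} catch {
  case e: AnalysisException =>
    // "Can not rename the managed table('`t`'). The associated location(...) already exists."
    println(e.getMessage)
}
```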
--- docs/sql-programming-guide.md | 1 + .../sql/catalyst/catalog/SessionCatalog.scala | 20 +++++++++++++++++++ .../sql/execution/command/DDLSuite.scala | 18 +++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index cd7329b621122..ad23dae7c6b7c 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1850,6 +1850,7 @@ working with timestamps in `pandas_udf`s to get the best performance, see - Since Spark 2.4, writing a dataframe with an empty or nested empty schema using any file formats (parquet, orc, json, text, csv etc.) is not allowed. An exception is thrown when attempting to write dataframes with empty schema. - Since Spark 2.4, Spark compares a DATE type with a TIMESTAMP type after promotes both sides to TIMESTAMP. To set `false` to `spark.sql.hive.compareDateTimestampInTimestamp` restores the previous behavior. This option will be removed in Spark 3.0. - Since Spark 2.4, creating a managed table with nonempty location is not allowed. An exception is thrown when attempting to create a managed table with nonempty location. To set `true` to `spark.sql.allowCreatingManagedTableUsingNonemptyLocation` restores the previous behavior. This option will be removed in Spark 3.0. + - Since Spark 2.4, renaming a managed table to existing location is not allowed. An exception is thrown when attempting to rename a managed table to existing location. - Since Spark 2.4, the type coercion rules can automatically promote the argument types of the variadic SQL functions (e.g., IN/COALESCE) to the widest common type, no matter how the input arguments order. In prior Spark versions, the promotion could fail in some specific orders (e.g., TimestampType, IntegerType and StringType) and throw an exception. - Since Spark 2.4, Spark has enabled non-cascading SQL cache invalidation in addition to the traditional cache invalidation mechanism. The non-cascading cache invalidation mechanism allows users to remove a cache without impacting its dependent caches. This new cache invalidation mechanism is used in scenarios where the data of the cache to be removed is still valid, e.g., calling unpersist() on a Dataset, or dropping a temporary view. This allows users to free up memory and keep the desired caches valid at the same time. - In version 2.3 and earlier, `to_utc_timestamp` and `from_utc_timestamp` respect the timezone in the input timestamp string, which breaks the assumption that the input timestamp is in a specific timezone. Therefore, these 2 functions can return unexpected results. In version 2.4 and later, this problem has been fixed. `to_utc_timestamp` and `from_utc_timestamp` will return null if the input timestamp string contains timezone. As an example, `from_utc_timestamp('2000-10-10 00:00:00', 'GMT+1')` will return `2000-10-10 01:00:00` in both Spark 2.3 and 2.4. However, `from_utc_timestamp('2000-10-10 00:00:00+00:00', 'GMT+1')`, assuming a local timezone of GMT+8, will return `2000-10-10 09:00:00` in Spark 2.3 but `null` in 2.4. For people who don't care about this problem and want to retain the previous behaivor to keep their query unchanged, you can set `spark.sql.function.rejectTimezoneInString` to false. This option will be removed in Spark 3.0 and should only be used as a temporary workaround. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index c390337c03ff5..c26a34528c162 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -619,6 +619,7 @@ class SessionCatalog( requireTableExists(TableIdentifier(oldTableName, Some(db))) requireTableNotExists(TableIdentifier(newTableName, Some(db))) validateName(newTableName) + validateNewLocationOfRename(oldName, newName) externalCatalog.renameTable(db, oldTableName, newTableName) } else { if (newName.database.isDefined) { @@ -1366,4 +1367,23 @@ class SessionCatalog( // copy over temporary views tempViews.foreach(kv => target.tempViews.put(kv._1, kv._2)) } + + /** + * Validate the new locatoin before renaming a managed table, which should be non-existent. + */ + private def validateNewLocationOfRename( + oldName: TableIdentifier, + newName: TableIdentifier): Unit = { + val oldTable = getTableMetadata(oldName) + if (oldTable.tableType == CatalogTableType.MANAGED) { + val databaseLocation = + externalCatalog.getDatabase(oldName.database.getOrElse(currentDb)).locationUri + val newTableLocation = new Path(new Path(databaseLocation), formatTableName(newName.table)) + val fs = newTableLocation.getFileSystem(hadoopConf) + if (fs.exists(newTableLocation)) { + throw new AnalysisException(s"Can not rename the managed table('$oldName')" + + s". The associated location('$newTableLocation') already exists.") + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 3998ceca38b30..270ed7f80197c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -441,6 +441,24 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } + test("rename a managed table with existing empty directory") { + val tableLoc = new File(spark.sessionState.catalog.defaultTablePath(TableIdentifier("tab2"))) + try { + withTable("tab1") { + sql(s"CREATE TABLE tab1 USING $dataSource AS SELECT 1, 'a'") + tableLoc.mkdir() + val ex = intercept[AnalysisException] { + sql("ALTER TABLE tab1 RENAME TO tab2") + }.getMessage + val expectedMsg = "Can not rename the managed table('`tab1`'). The associated location" + assert(ex.contains(expectedMsg)) + } + } finally { + waitForTasksToFinish() + Utils.deleteRecursively(tableLoc) + } + } + private def checkSchemaInCreatedDataSourceTable( path: File, userSpecifiedSchema: Option[String], From e71e93aaaa0d26301e10d3dc65f4db298424e99a Mon Sep 17 00:00:00 2001 From: Stavros Kontopoulos Date: Thu, 5 Jul 2018 16:35:16 -0500 Subject: [PATCH 63/79] [SPARK-24694][K8S] Pass all app args to integration tests ## What changes were proposed in this pull request? - Allows to pass more than one app args to tests. ## How was this patch tested? Manually tested it with a spark test that requires more than on app args. Author: Stavros Kontopoulos Closes #21672 from skonto/fix_itsets-args. 
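The one-line change below stops collapsing all application arguments into a single space-joined token before handing them to `spark-submit`. Illustrative only (the argument values are made up):

```scala
val appArgs = Array("inputPath", "42", "local[2]")

// Old behavior: args joined into one token, so the application's main() saw a single argument.
val before: Array[String] =
  if (appArgs.length > 0) Array(appArgs.mkString(" ")) else Array.empty[String]
// before == Array("inputPath 42 local[2]")

// New behavior: every element is forwarded unchanged.
val after: Array[String] = appArgs
// after == Array("inputPath", "42", "local[2]")

assert(before.length == 1 && after.length == 3)
```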
--- .../k8s/integrationtest/KubernetesTestComponents.scala | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesTestComponents.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesTestComponents.scala index 48727142dd052..b2471e51116cb 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesTestComponents.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesTestComponents.scala @@ -105,16 +105,13 @@ private[spark] object SparkAppLauncher extends Logging { sparkHomeDir: Path): Unit = { val sparkSubmitExecutable = sparkHomeDir.resolve(Paths.get("bin", "spark-submit")) logInfo(s"Launching a spark app with arguments $appArguments and conf $appConf") - val appArgsArray = - if (appArguments.appArgs.length > 0) Array(appArguments.appArgs.mkString(" ")) - else Array[String]() val commandLine = (Array(sparkSubmitExecutable.toFile.getAbsolutePath, "--deploy-mode", "cluster", "--class", appArguments.mainClass, "--master", appConf.get("spark.master") ) ++ appConf.toStringArray :+ appArguments.mainAppResource) ++ - appArgsArray + appArguments.appArgs ProcessUtils.executeProcess(commandLine, timeoutSecs) } } From 01fcba2c685be0603a404392685e9d52fb4cb82a Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Fri, 6 Jul 2018 11:10:50 +0800 Subject: [PATCH 64/79] [SPARK-24737][SQL] Type coercion between StructTypes. ## What changes were proposed in this pull request? We can support type coercion between `StructType`s where all the internal types are compatible. ## How was this patch tested? Added tests. Author: Takuya UESHIN Closes #21713 from ueshin/issues/SPARK-24737/structtypecoercion. --- .../sql/catalyst/analysis/TypeCoercion.scala | 69 ++++++-------- .../catalyst/analysis/TypeCoercionSuite.scala | 93 +++++++++++++++++-- 2 files changed, 114 insertions(+), 48 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index cf90e6e555fc8..b6ca30c7398f2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -102,25 +102,7 @@ object TypeCoercion { case (_: TimestampType, _: DateType) | (_: DateType, _: TimestampType) => Some(TimestampType) - case (t1 @ StructType(fields1), t2 @ StructType(fields2)) if t1.sameType(t2) => - Some(StructType(fields1.zip(fields2).map { case (f1, f2) => - // Since `t1.sameType(t2)` is true, two StructTypes have the same DataType - // except `name` (in case of `spark.sql.caseSensitive=false`) and `nullable`. - // - Different names: use f1.name - // - Different nullabilities: `nullable` is true iff one of them is nullable. 
- val dataType = findTightestCommonType(f1.dataType, f2.dataType).get - StructField(f1.name, dataType, nullable = f1.nullable || f2.nullable) - })) - - case (a1 @ ArrayType(et1, hasNull1), a2 @ ArrayType(et2, hasNull2)) if a1.sameType(a2) => - findTightestCommonType(et1, et2).map(ArrayType(_, hasNull1 || hasNull2)) - - case (m1 @ MapType(kt1, vt1, hasNull1), m2 @ MapType(kt2, vt2, hasNull2)) if m1.sameType(m2) => - val keyType = findTightestCommonType(kt1, kt2) - val valueType = findTightestCommonType(vt1, vt2) - Some(MapType(keyType.get, valueType.get, hasNull1 || hasNull2)) - - case _ => None + case (t1, t2) => findTypeForComplex(t1, t2, findTightestCommonType) } /** Promotes all the way to StringType. */ @@ -166,6 +148,30 @@ object TypeCoercion { case (l, r) => None } + private def findTypeForComplex( + t1: DataType, + t2: DataType, + findTypeFunc: (DataType, DataType) => Option[DataType]): Option[DataType] = (t1, t2) match { + case (ArrayType(et1, containsNull1), ArrayType(et2, containsNull2)) => + findTypeFunc(et1, et2).map(ArrayType(_, containsNull1 || containsNull2)) + case (MapType(kt1, vt1, valueContainsNull1), MapType(kt2, vt2, valueContainsNull2)) => + findTypeFunc(kt1, kt2).flatMap { kt => + findTypeFunc(vt1, vt2).map { vt => + MapType(kt, vt, valueContainsNull1 || valueContainsNull2) + } + } + case (StructType(fields1), StructType(fields2)) if fields1.length == fields2.length => + val resolver = SQLConf.get.resolver + fields1.zip(fields2).foldLeft(Option(new StructType())) { + case (Some(struct), (field1, field2)) if resolver(field1.name, field2.name) => + findTypeFunc(field1.dataType, field2.dataType).map { + dt => struct.add(field1.name, dt, field1.nullable || field2.nullable) + } + case _ => None + } + case _ => None + } + /** * Case 2 type widening (see the classdoc comment above for TypeCoercion). 
* @@ -176,17 +182,7 @@ object TypeCoercion { findTightestCommonType(t1, t2) .orElse(findWiderTypeForDecimal(t1, t2)) .orElse(stringPromotion(t1, t2)) - .orElse((t1, t2) match { - case (ArrayType(et1, containsNull1), ArrayType(et2, containsNull2)) => - findWiderTypeForTwo(et1, et2).map(ArrayType(_, containsNull1 || containsNull2)) - case (MapType(kt1, vt1, valueContainsNull1), MapType(kt2, vt2, valueContainsNull2)) => - findWiderTypeForTwo(kt1, kt2).flatMap { kt => - findWiderTypeForTwo(vt1, vt2).map { vt => - MapType(kt, vt, valueContainsNull1 || valueContainsNull2) - } - } - case _ => None - }) + .orElse(findTypeForComplex(t1, t2, findWiderTypeForTwo)) } /** @@ -222,18 +218,7 @@ object TypeCoercion { t2: DataType): Option[DataType] = { findTightestCommonType(t1, t2) .orElse(findWiderTypeForDecimal(t1, t2)) - .orElse((t1, t2) match { - case (ArrayType(et1, containsNull1), ArrayType(et2, containsNull2)) => - findWiderTypeWithoutStringPromotionForTwo(et1, et2) - .map(ArrayType(_, containsNull1 || containsNull2)) - case (MapType(kt1, vt1, valueContainsNull1), MapType(kt2, vt2, valueContainsNull2)) => - findWiderTypeWithoutStringPromotionForTwo(kt1, kt2).flatMap { kt => - findWiderTypeWithoutStringPromotionForTwo(vt1, vt2).map { vt => - MapType(kt, vt, valueContainsNull1 || valueContainsNull2) - } - } - case _ => None - }) + .orElse(findTypeForComplex(t1, t2, findWiderTypeWithoutStringPromotionForTwo)) } def findWiderTypeWithoutStringPromotion(types: Seq[DataType]): Option[DataType] = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala index 4e5ca1b8cdd36..8cc5a23779a2a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala @@ -54,7 +54,7 @@ class TypeCoercionSuite extends AnalysisTest { // | NullType | ByteType | ShortType | IntegerType | LongType | DoubleType | FloatType | Dec(10, 2) | BinaryType | BooleanType | StringType | DateType | TimestampType | ArrayType | MapType | StructType | NullType | CalendarIntervalType | DecimalType(38, 18) | DoubleType | IntegerType | // | CalendarIntervalType | X | X | X | X | X | X | X | X | X | X | X | X | X | X | X | X | CalendarIntervalType | X | X | X | // +----------------------+----------+-----------+-------------+----------+------------+-----------+------------+------------+-------------+------------+----------+---------------+------------+----------+-------------+----------+----------------------+---------------------+-------------+--------------+ - // Note: StructType* is castable only when the internal child types also match; otherwise, not castable. + // Note: StructType* is castable when all the internal child types are castable according to the table. // Note: ArrayType* is castable when the element type is castable according to the table. // Note: MapType* is castable when both the key type and the value type are castable according to the table. 
// scalastyle:on line.size.limit @@ -397,7 +397,7 @@ class TypeCoercionSuite extends AnalysisTest { widenTest( StructType(Seq(StructField("a", IntegerType, nullable = false))), StructType(Seq(StructField("a", DoubleType, nullable = false))), - None) + Some(StructType(Seq(StructField("a", DoubleType, nullable = false))))) widenTest( StructType(Seq(StructField("a", IntegerType, nullable = false))), @@ -454,15 +454,18 @@ class TypeCoercionSuite extends AnalysisTest { def widenTestWithStringPromotion( t1: DataType, t2: DataType, - expected: Option[DataType]): Unit = { - checkWidenType(TypeCoercion.findWiderTypeForTwo, t1, t2, expected) + expected: Option[DataType], + isSymmetric: Boolean = true): Unit = { + checkWidenType(TypeCoercion.findWiderTypeForTwo, t1, t2, expected, isSymmetric) } def widenTestWithoutStringPromotion( t1: DataType, t2: DataType, - expected: Option[DataType]): Unit = { - checkWidenType(TypeCoercion.findWiderTypeWithoutStringPromotionForTwo, t1, t2, expected) + expected: Option[DataType], + isSymmetric: Boolean = true): Unit = { + checkWidenType( + TypeCoercion.findWiderTypeWithoutStringPromotionForTwo, t1, t2, expected, isSymmetric) } // Decimal @@ -492,6 +495,10 @@ class TypeCoercionSuite extends AnalysisTest { ArrayType(MapType(IntegerType, FloatType), containsNull = false), ArrayType(MapType(LongType, DoubleType), containsNull = false), Some(ArrayType(MapType(LongType, DoubleType), containsNull = false))) + widenTestWithStringPromotion( + ArrayType(new StructType().add("num", ShortType), containsNull = false), + ArrayType(new StructType().add("num", LongType), containsNull = false), + Some(ArrayType(new StructType().add("num", LongType), containsNull = false))) // MapType widenTestWithStringPromotion( @@ -506,6 +513,64 @@ class TypeCoercionSuite extends AnalysisTest { MapType(IntegerType, MapType(ShortType, TimestampType), valueContainsNull = false), MapType(LongType, MapType(DoubleType, StringType), valueContainsNull = false), Some(MapType(LongType, MapType(DoubleType, StringType), valueContainsNull = false))) + widenTestWithStringPromotion( + MapType(IntegerType, new StructType().add("num", ShortType), valueContainsNull = false), + MapType(LongType, new StructType().add("num", LongType), valueContainsNull = false), + Some(MapType(LongType, new StructType().add("num", LongType), valueContainsNull = false))) + + // StructType + widenTestWithStringPromotion( + new StructType() + .add("num", ShortType, nullable = true).add("ts", StringType, nullable = false), + new StructType() + .add("num", DoubleType, nullable = false).add("ts", TimestampType, nullable = true), + Some(new StructType() + .add("num", DoubleType, nullable = true).add("ts", StringType, nullable = true))) + widenTestWithStringPromotion( + new StructType() + .add("arr", ArrayType(ShortType, containsNull = false), nullable = false), + new StructType() + .add("arr", ArrayType(DoubleType, containsNull = true), nullable = false), + Some(new StructType() + .add("arr", ArrayType(DoubleType, containsNull = true), nullable = false))) + widenTestWithStringPromotion( + new StructType() + .add("map", MapType(ShortType, TimestampType, valueContainsNull = true), nullable = false), + new StructType() + .add("map", MapType(DoubleType, StringType, valueContainsNull = false), nullable = false), + Some(new StructType() + .add("map", MapType(DoubleType, StringType, valueContainsNull = true), nullable = false))) + + widenTestWithStringPromotion( + new StructType().add("num", IntegerType), + new StructType().add("num", 
LongType).add("str", StringType), + None) + widenTestWithoutStringPromotion( + new StructType().add("num", IntegerType), + new StructType().add("num", LongType).add("str", StringType), + None) + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + widenTestWithStringPromotion( + new StructType().add("a", IntegerType), + new StructType().add("A", LongType), + None) + widenTestWithoutStringPromotion( + new StructType().add("a", IntegerType), + new StructType().add("A", LongType), + None) + } + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + widenTestWithStringPromotion( + new StructType().add("a", IntegerType), + new StructType().add("A", LongType), + Some(new StructType().add("a", LongType)), + isSymmetric = false) + widenTestWithoutStringPromotion( + new StructType().add("a", IntegerType), + new StructType().add("A", LongType), + Some(new StructType().add("a", LongType)), + isSymmetric = false) + } // Without string promotion widenTestWithoutStringPromotion(IntegerType, StringType, None) @@ -520,6 +585,14 @@ class TypeCoercionSuite extends AnalysisTest { MapType(StringType, IntegerType), MapType(TimestampType, IntegerType), None) widenTestWithoutStringPromotion( MapType(IntegerType, StringType), MapType(IntegerType, TimestampType), None) + widenTestWithoutStringPromotion( + new StructType().add("a", IntegerType), + new StructType().add("a", StringType), + None) + widenTestWithoutStringPromotion( + new StructType().add("a", StringType), + new StructType().add("a", IntegerType), + None) // String promotion widenTestWithStringPromotion(IntegerType, StringType, Some(StringType)) @@ -544,6 +617,14 @@ class TypeCoercionSuite extends AnalysisTest { MapType(IntegerType, StringType), MapType(IntegerType, TimestampType), Some(MapType(IntegerType, StringType))) + widenTestWithStringPromotion( + new StructType().add("a", IntegerType), + new StructType().add("a", StringType), + Some(new StructType().add("a", StringType))) + widenTestWithStringPromotion( + new StructType().add("a", StringType), + new StructType().add("a", IntegerType), + Some(new StructType().add("a", StringType))) } private def ruleTest(rule: Rule[LogicalPlan], initial: Expression, transformed: Expression) { From bf67f70c48881ee99751f7d51fbcbda1e593d90a Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Fri, 6 Jul 2018 11:13:57 +0800 Subject: [PATCH 65/79] [SPARK-24692][TESTS] Improvement FilterPushdownBenchmark ## What changes were proposed in this pull request? Refer to the [`WideSchemaBenchmark`](https://github.com/apache/spark/blob/v2.3.1/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/WideSchemaBenchmark.scala) update `FilterPushdownBenchmark`: 1. Write the result to `benchmarks/FilterPushdownBenchmark-results.txt` for easy maintenance. 2. Add more benchmark case: `StringStartsWith`, `Decimal`, `InSet -> InFilters` and `tinyint`. ## How was this patch tested? manual tests Author: Yuming Wang Closes #21677 from wangyum/SPARK-24692. 
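The results file added below compares Parquet and ORC scans with and without filter pushdown across many predicates. A rough sketch of the kind of comparison it automates, assuming a spark-shell session (the path, row count, and predicate are placeholders, not the benchmark's actual setup code):

```scala
val path = "/tmp/filter_pushdown_demo"                       // placeholder location
spark.range(1024L * 1024 * 15).selectExpr("id AS value")
  .write.mode("overwrite").parquet(path)

for (pushdown <- Seq("true", "false")) {
  spark.conf.set("spark.sql.parquet.filterPushdown", pushdown)
  val start = System.nanoTime()
  spark.read.parquet(path).where("value = 7864320").count()  // selects a single row
  println(s"parquet filterPushdown=$pushdown: ${(System.nanoTime() - start) / 1e6} ms")
}
```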
--- .../FilterPushdownBenchmark-results.txt | 580 ++++++++++++++++++ .../benchmark/FilterPushdownBenchmark.scala | 405 +++++------- 2 files changed, 748 insertions(+), 237 deletions(-) create mode 100644 sql/core/benchmarks/FilterPushdownBenchmark-results.txt diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt new file mode 100644 index 0000000000000..29fe4345d69da --- /dev/null +++ b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt @@ -0,0 +1,580 @@ +================================================================================================ +Pushdown for many distinct value case +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 0 string row (value IS NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 8970 / 9122 1.8 570.3 1.0X +Parquet Vectorized (Pushdown) 471 / 491 33.4 30.0 19.0X +Native ORC Vectorized 7661 / 7853 2.1 487.0 1.2X +Native ORC Vectorized (Pushdown) 1134 / 1161 13.9 72.1 7.9X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 0 string row ('7864320' < value < '7864320'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 9246 / 9297 1.7 587.8 1.0X +Parquet Vectorized (Pushdown) 480 / 488 32.8 30.5 19.3X +Native ORC Vectorized 7838 / 7850 2.0 498.3 1.2X +Native ORC Vectorized (Pushdown) 1054 / 1118 14.9 67.0 8.8X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 1 string row (value = '7864320'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 8989 / 9100 1.7 571.5 1.0X +Parquet Vectorized (Pushdown) 448 / 467 35.1 28.5 20.1X +Native ORC Vectorized 7680 / 7768 2.0 488.3 1.2X +Native ORC Vectorized (Pushdown) 1067 / 1118 14.7 67.8 8.4X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 1 string row (value <=> '7864320'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 9115 / 9266 1.7 579.5 1.0X +Parquet Vectorized (Pushdown) 466 / 492 33.7 29.7 19.5X +Native ORC Vectorized 7800 / 7914 2.0 495.9 1.2X +Native ORC Vectorized (Pushdown) 1075 / 1102 14.6 68.4 8.5X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 1 string row ('7864320' <= value <= '7864320'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 9099 / 9237 1.7 578.5 1.0X +Parquet Vectorized (Pushdown) 462 / 475 34.1 29.3 19.7X +Native ORC Vectorized 7847 / 7925 2.0 498.9 1.2X +Native ORC Vectorized (Pushdown) 1078 / 1114 14.6 68.5 8.4X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select all string rows (value IS NOT 
NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 19303 / 19547 0.8 1227.3 1.0X +Parquet Vectorized (Pushdown) 19924 / 20089 0.8 1266.7 1.0X +Native ORC Vectorized 18725 / 19079 0.8 1190.5 1.0X +Native ORC Vectorized (Pushdown) 19310 / 19492 0.8 1227.7 1.0X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 0 int row (value IS NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 8117 / 8323 1.9 516.1 1.0X +Parquet Vectorized (Pushdown) 484 / 494 32.5 30.8 16.8X +Native ORC Vectorized 6811 / 7036 2.3 433.0 1.2X +Native ORC Vectorized (Pushdown) 1061 / 1082 14.8 67.5 7.6X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 0 int row (7864320 < value < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 8105 / 8140 1.9 515.3 1.0X +Parquet Vectorized (Pushdown) 478 / 505 32.9 30.4 17.0X +Native ORC Vectorized 6914 / 7211 2.3 439.6 1.2X +Native ORC Vectorized (Pushdown) 1044 / 1064 15.1 66.4 7.8X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 1 int row (value = 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7983 / 8116 2.0 507.6 1.0X +Parquet Vectorized (Pushdown) 464 / 487 33.9 29.5 17.2X +Native ORC Vectorized 6703 / 6774 2.3 426.1 1.2X +Native ORC Vectorized (Pushdown) 1017 / 1058 15.5 64.6 7.9X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 1 int row (value <=> 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7942 / 7983 2.0 504.9 1.0X +Parquet Vectorized (Pushdown) 468 / 479 33.6 29.7 17.0X +Native ORC Vectorized 6677 / 6779 2.4 424.5 1.2X +Native ORC Vectorized (Pushdown) 1021 / 1068 15.4 64.9 7.8X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 1 int row (7864320 <= value <= 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7909 / 7958 2.0 502.8 1.0X +Parquet Vectorized (Pushdown) 485 / 494 32.4 30.8 16.3X +Native ORC Vectorized 6751 / 6846 2.3 429.2 1.2X +Native ORC Vectorized (Pushdown) 1043 / 1077 15.1 66.3 7.6X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 1 int row (7864319 < value < 7864321): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 8010 / 8033 2.0 509.2 1.0X +Parquet Vectorized (Pushdown) 472 / 489 33.3 30.0 17.0X +Native ORC Vectorized 6655 / 6808 2.4 423.1 1.2X +Native ORC Vectorized (Pushdown) 1015 / 1067 15.5 64.5 7.9X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 
on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 10% int rows (value < 1572864): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 8983 / 9035 1.8 571.1 1.0X +Parquet Vectorized (Pushdown) 2204 / 2231 7.1 140.1 4.1X +Native ORC Vectorized 7864 / 8011 2.0 500.0 1.1X +Native ORC Vectorized (Pushdown) 2674 / 2789 5.9 170.0 3.4X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 50% int rows (value < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 12723 / 12903 1.2 808.9 1.0X +Parquet Vectorized (Pushdown) 9112 / 9282 1.7 579.3 1.4X +Native ORC Vectorized 12090 / 12230 1.3 768.7 1.1X +Native ORC Vectorized (Pushdown) 9242 / 9372 1.7 587.6 1.4X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 90% int rows (value < 14155776): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 16453 / 16678 1.0 1046.1 1.0X +Parquet Vectorized (Pushdown) 15997 / 16262 1.0 1017.0 1.0X +Native ORC Vectorized 16652 / 17070 0.9 1058.7 1.0X +Native ORC Vectorized (Pushdown) 15843 / 16112 1.0 1007.2 1.0X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select all int rows (value IS NOT NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 17098 / 17254 0.9 1087.1 1.0X +Parquet Vectorized (Pushdown) 17302 / 17529 0.9 1100.1 1.0X +Native ORC Vectorized 16790 / 17098 0.9 1067.5 1.0X +Native ORC Vectorized (Pushdown) 17329 / 17914 0.9 1101.7 1.0X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select all int rows (value > -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 17088 / 17392 0.9 1086.4 1.0X +Parquet Vectorized (Pushdown) 17609 / 17863 0.9 1119.5 1.0X +Native ORC Vectorized 18334 / 69831 0.9 1165.7 0.9X +Native ORC Vectorized (Pushdown) 17465 / 17629 0.9 1110.4 1.0X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select all int rows (value != -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 16903 / 17233 0.9 1074.6 1.0X +Parquet Vectorized (Pushdown) 16945 / 17032 0.9 1077.3 1.0X +Native ORC Vectorized 16377 / 16762 1.0 1041.2 1.0X +Native ORC Vectorized (Pushdown) 16950 / 17212 0.9 1077.7 1.0X + + +================================================================================================ +Pushdown for few distinct value case (use dictionary encoding) +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 0 distinct string row (value IS NULL): Best/Avg Time(ms) 
Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7245 / 7322 2.2 460.7 1.0X +Parquet Vectorized (Pushdown) 378 / 389 41.6 24.0 19.2X +Native ORC Vectorized 6720 / 6778 2.3 427.2 1.1X +Native ORC Vectorized (Pushdown) 1009 / 1032 15.6 64.2 7.2X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 0 distinct string row ('100' < value < '100'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7627 / 7795 2.1 484.9 1.0X +Parquet Vectorized (Pushdown) 384 / 406 41.0 24.4 19.9X +Native ORC Vectorized 6724 / 7824 2.3 427.5 1.1X +Native ORC Vectorized (Pushdown) 968 / 986 16.3 61.5 7.9X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 1 distinct string row (value = '100'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7157 / 7534 2.2 455.0 1.0X +Parquet Vectorized (Pushdown) 542 / 565 29.0 34.5 13.2X +Native ORC Vectorized 6716 / 7214 2.3 427.0 1.1X +Native ORC Vectorized (Pushdown) 1212 / 1288 13.0 77.0 5.9X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 1 distinct string row (value <=> '100'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7368 / 7552 2.1 468.4 1.0X +Parquet Vectorized (Pushdown) 544 / 556 28.9 34.6 13.5X +Native ORC Vectorized 6740 / 6867 2.3 428.5 1.1X +Native ORC Vectorized (Pushdown) 1230 / 1426 12.8 78.2 6.0X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 1 distinct string row ('100' <= value <= '100'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7427 / 7734 2.1 472.2 1.0X +Parquet Vectorized (Pushdown) 556 / 568 28.3 35.4 13.3X +Native ORC Vectorized 6847 / 7059 2.3 435.3 1.1X +Native ORC Vectorized (Pushdown) 1226 / 1230 12.8 77.9 6.1X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select all distinct string rows (value IS NOT NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 16998 / 17311 0.9 1080.7 1.0X +Parquet Vectorized (Pushdown) 16977 / 17250 0.9 1079.4 1.0X +Native ORC Vectorized 18447 / 19852 0.9 1172.8 0.9X +Native ORC Vectorized (Pushdown) 16614 / 17102 0.9 1056.3 1.0X + + +================================================================================================ +Pushdown benchmark for StringStartsWith +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +StringStartsWith filter: (value like '10%'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------ +Parquet Vectorized 9705 / 10814 1.6 617.0 1.0X +Parquet Vectorized (Pushdown) 3086 / 3574 5.1 196.2 3.1X +Native ORC Vectorized 10094 / 10695 1.6 641.8 1.0X +Native ORC Vectorized (Pushdown) 9611 / 9999 1.6 611.0 1.0X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +StringStartsWith filter: (value like '1000%'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 8016 / 8183 2.0 509.7 1.0X +Parquet Vectorized (Pushdown) 444 / 457 35.4 28.2 18.0X +Native ORC Vectorized 6970 / 7169 2.3 443.2 1.2X +Native ORC Vectorized (Pushdown) 7447 / 7503 2.1 473.5 1.1X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +StringStartsWith filter: (value like '786432%'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7908 / 8046 2.0 502.8 1.0X +Parquet Vectorized (Pushdown) 408 / 429 38.6 25.9 19.4X +Native ORC Vectorized 7021 / 7100 2.2 446.4 1.1X +Native ORC Vectorized (Pushdown) 7310 / 7490 2.2 464.8 1.1X + + +================================================================================================ +Pushdown benchmark for decimal +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 1 decimal(9, 2) row (value = 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 3785 / 3867 4.2 240.6 1.0X +Parquet Vectorized (Pushdown) 3820 / 3928 4.1 242.9 1.0X +Native ORC Vectorized 3981 / 4049 4.0 253.1 1.0X +Native ORC Vectorized (Pushdown) 702 / 735 22.4 44.6 5.4X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 10% decimal(9, 2) rows (value < 1572864): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 4694 / 4813 3.4 298.4 1.0X +Parquet Vectorized (Pushdown) 4839 / 4907 3.3 307.6 1.0X +Native ORC Vectorized 4943 / 5032 3.2 314.2 0.9X +Native ORC Vectorized (Pushdown) 2043 / 2085 7.7 129.9 2.3X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 50% decimal(9, 2) rows (value < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 8321 / 8472 1.9 529.0 1.0X +Parquet Vectorized (Pushdown) 8125 / 8471 1.9 516.6 1.0X +Native ORC Vectorized 8524 / 8616 1.8 541.9 1.0X +Native ORC Vectorized (Pushdown) 7961 / 8383 2.0 506.1 1.0X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 90% decimal(9, 2) rows (value < 14155776): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 9587 / 10112 1.6 609.5 1.0X 
+Parquet Vectorized (Pushdown) 9726 / 10370 1.6 618.3 1.0X +Native ORC Vectorized 10119 / 11147 1.6 643.4 0.9X +Native ORC Vectorized (Pushdown) 9366 / 9497 1.7 595.5 1.0X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 1 decimal(18, 2) row (value = 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 4060 / 4093 3.9 258.1 1.0X +Parquet Vectorized (Pushdown) 4037 / 4125 3.9 256.6 1.0X +Native ORC Vectorized 4756 / 4811 3.3 302.4 0.9X +Native ORC Vectorized (Pushdown) 824 / 889 19.1 52.4 4.9X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 10% decimal(18, 2) rows (value < 1572864): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 5157 / 5271 3.0 327.9 1.0X +Parquet Vectorized (Pushdown) 5051 / 5141 3.1 321.1 1.0X +Native ORC Vectorized 5723 / 6146 2.7 363.9 0.9X +Native ORC Vectorized (Pushdown) 2198 / 2317 7.2 139.8 2.3X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 50% decimal(18, 2) rows (value < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 8608 / 8647 1.8 547.3 1.0X +Parquet Vectorized (Pushdown) 8471 / 8584 1.9 538.6 1.0X +Native ORC Vectorized 9249 / 10048 1.7 588.0 0.9X +Native ORC Vectorized (Pushdown) 7645 / 8091 2.1 486.1 1.1X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 90% decimal(18, 2) rows (value < 14155776): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 11658 / 11888 1.3 741.2 1.0X +Parquet Vectorized (Pushdown) 11812 / 12098 1.3 751.0 1.0X +Native ORC Vectorized 12943 / 13312 1.2 822.9 0.9X +Native ORC Vectorized (Pushdown) 13139 / 13465 1.2 835.4 0.9X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 1 decimal(38, 2) row (value = 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 5491 / 5716 2.9 349.1 1.0X +Parquet Vectorized (Pushdown) 5515 / 5615 2.9 350.6 1.0X +Native ORC Vectorized 4582 / 4654 3.4 291.3 1.2X +Native ORC Vectorized (Pushdown) 815 / 861 19.3 51.8 6.7X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 10% decimal(38, 2) rows (value < 1572864): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 6432 / 6527 2.4 409.0 1.0X +Parquet Vectorized (Pushdown) 6513 / 6607 2.4 414.1 1.0X +Native ORC Vectorized 5618 / 6085 2.8 357.2 1.1X +Native ORC Vectorized (Pushdown) 2403 / 2443 6.5 152.8 2.7X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 50% decimal(38, 2) rows (value < 7864320): Best/Avg Time(ms) Rate(M/s) Per 
Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 11041 / 11467 1.4 701.9 1.0X +Parquet Vectorized (Pushdown) 10909 / 11484 1.4 693.5 1.0X +Native ORC Vectorized 9860 / 10436 1.6 626.9 1.1X +Native ORC Vectorized (Pushdown) 7908 / 8069 2.0 502.8 1.4X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 90% decimal(38, 2) rows (value < 14155776): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 14816 / 16877 1.1 942.0 1.0X +Parquet Vectorized (Pushdown) 15383 / 15740 1.0 978.0 1.0X +Native ORC Vectorized 14408 / 14771 1.1 916.0 1.0X +Native ORC Vectorized (Pushdown) 13968 / 14805 1.1 888.1 1.1X + + +================================================================================================ +Pushdown benchmark for InSet -> InFilters +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +InSet -> InFilters (values count: 5, distribution: 10): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7477 / 7587 2.1 475.4 1.0X +Parquet Vectorized (Pushdown) 7862 / 8346 2.0 499.9 1.0X +Native ORC Vectorized 6447 / 7021 2.4 409.9 1.2X +Native ORC Vectorized (Pushdown) 983 / 1003 16.0 62.5 7.6X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +InSet -> InFilters (values count: 5, distribution: 50): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7107 / 7290 2.2 451.9 1.0X +Parquet Vectorized (Pushdown) 7196 / 7258 2.2 457.5 1.0X +Native ORC Vectorized 6102 / 6222 2.6 388.0 1.2X +Native ORC Vectorized (Pushdown) 926 / 958 17.0 58.9 7.7X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +InSet -> InFilters (values count: 5, distribution: 90): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7374 / 7692 2.1 468.8 1.0X +Parquet Vectorized (Pushdown) 7771 / 7848 2.0 494.1 0.9X +Native ORC Vectorized 6184 / 6356 2.5 393.2 1.2X +Native ORC Vectorized (Pushdown) 920 / 963 17.1 58.5 8.0X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +InSet -> InFilters (values count: 10, distribution: 10): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7073 / 7326 2.2 449.7 1.0X +Parquet Vectorized (Pushdown) 7304 / 7647 2.2 464.4 1.0X +Native ORC Vectorized 6222 / 6579 2.5 395.6 1.1X +Native ORC Vectorized (Pushdown) 958 / 994 16.4 60.9 7.4X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +InSet -> InFilters (values count: 10, distribution: 50): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------ +Parquet Vectorized 7121 / 7501 2.2 452.7 1.0X +Parquet Vectorized (Pushdown) 7751 / 8334 2.0 492.8 0.9X +Native ORC Vectorized 6225 / 6680 2.5 395.8 1.1X +Native ORC Vectorized (Pushdown) 998 / 1020 15.8 63.5 7.1X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +InSet -> InFilters (values count: 10, distribution: 90): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7157 / 7399 2.2 455.1 1.0X +Parquet Vectorized (Pushdown) 7806 / 7911 2.0 496.3 0.9X +Native ORC Vectorized 6548 / 6720 2.4 416.3 1.1X +Native ORC Vectorized (Pushdown) 1016 / 1050 15.5 64.6 7.0X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +InSet -> InFilters (values count: 50, distribution: 10): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7662 / 7805 2.1 487.1 1.0X +Parquet Vectorized (Pushdown) 7590 / 7861 2.1 482.5 1.0X +Native ORC Vectorized 6840 / 8073 2.3 434.9 1.1X +Native ORC Vectorized (Pushdown) 1041 / 1075 15.1 66.2 7.4X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +InSet -> InFilters (values count: 50, distribution: 50): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 8230 / 9266 1.9 523.2 1.0X +Parquet Vectorized (Pushdown) 7735 / 7960 2.0 491.8 1.1X +Native ORC Vectorized 6945 / 7109 2.3 441.6 1.2X +Native ORC Vectorized (Pushdown) 1123 / 1144 14.0 71.4 7.3X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +InSet -> InFilters (values count: 50, distribution: 90): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7656 / 8058 2.1 486.7 1.0X +Parquet Vectorized (Pushdown) 7860 / 8247 2.0 499.7 1.0X +Native ORC Vectorized 6684 / 7003 2.4 424.9 1.1X +Native ORC Vectorized (Pushdown) 1085 / 1172 14.5 69.0 7.1X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +InSet -> InFilters (values count: 100, distribution: 10): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7594 / 8128 2.1 482.8 1.0X +Parquet Vectorized (Pushdown) 7845 / 7923 2.0 498.8 1.0X +Native ORC Vectorized 5859 / 6421 2.7 372.5 1.3X +Native ORC Vectorized (Pushdown) 1037 / 1054 15.2 66.0 7.3X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +InSet -> InFilters (values count: 100, distribution: 50): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 6762 / 6775 2.3 429.9 1.0X +Parquet Vectorized (Pushdown) 6911 / 6970 2.3 439.4 1.0X +Native ORC Vectorized 5884 / 5960 2.7 374.1 1.1X +Native ORC Vectorized (Pushdown) 1028 / 1052 15.3 65.4 6.6X + +Java 
HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +InSet -> InFilters (values count: 100, distribution: 90): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 6718 / 6767 2.3 427.1 1.0X +Parquet Vectorized (Pushdown) 6812 / 6909 2.3 433.1 1.0X +Native ORC Vectorized 5842 / 5883 2.7 371.4 1.1X +Native ORC Vectorized (Pushdown) 1040 / 1058 15.1 66.1 6.5X + + +================================================================================================ +Pushdown benchmark for tinyint +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 1 tinyint row (value = CAST(63 AS tinyint)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 3726 / 3775 4.2 236.9 1.0X +Parquet Vectorized (Pushdown) 3741 / 3789 4.2 237.9 1.0X +Native ORC Vectorized 2793 / 2909 5.6 177.6 1.3X +Native ORC Vectorized (Pushdown) 530 / 561 29.7 33.7 7.0X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 10% tinyint rows (value < CAST(12 AS tinyint)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 4385 / 4406 3.6 278.8 1.0X +Parquet Vectorized (Pushdown) 4398 / 4454 3.6 279.6 1.0X +Native ORC Vectorized 3420 / 3501 4.6 217.4 1.3X +Native ORC Vectorized (Pushdown) 1395 / 1432 11.3 88.7 3.1X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 50% tinyint rows (value < CAST(63 AS tinyint)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 7307 / 7394 2.2 464.6 1.0X +Parquet Vectorized (Pushdown) 7411 / 7461 2.1 471.2 1.0X +Native ORC Vectorized 6501 / 7814 2.4 413.4 1.1X +Native ORC Vectorized (Pushdown) 7341 / 8637 2.1 466.7 1.0X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +Select 90% tinyint rows (value < CAST(114 AS tinyint)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Parquet Vectorized 11886 / 13122 1.3 755.7 1.0X +Parquet Vectorized (Pushdown) 12557 / 14173 1.3 798.4 0.9X +Native ORC Vectorized 10758 / 11971 1.5 684.0 1.1X +Native ORC Vectorized (Pushdown) 10564 / 10713 1.5 671.6 1.1X + + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala index 6d7c7de9a856e..fc716dec9f337 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala @@ -17,25 +17,30 @@ package org.apache.spark.sql.execution.benchmark -import java.io.File +import java.io.{File, FileOutputStream, OutputStream} import scala.util.{Random, Try} +import 
org.scalatest.{BeforeAndAfterEachTestData, Suite, TestData} + import org.apache.spark.SparkConf +import org.apache.spark.SparkFunSuite import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.functions.monotonically_increasing_id import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{ByteType, Decimal, DecimalType} import org.apache.spark.util.{Benchmark, Utils} - /** * Benchmark to measure read performance with Filter pushdown. * To run this: - * spark-submit --class + * build/sbt "sql/test-only *FilterPushdownBenchmark" + * + * Results will be written to "benchmarks/FilterPushdownBenchmark-results.txt". */ -object FilterPushdownBenchmark { - val conf = new SparkConf() - .setAppName("FilterPushdownBenchmark") +class FilterPushdownBenchmark extends SparkFunSuite with BenchmarkBeforeAndAfterEachTest { + private val conf = new SparkConf() + .setAppName(this.getClass.getSimpleName) // Since `spark.master` always exists, overrides this value .set("spark.master", "local[1]") .setIfMissing("spark.driver.memory", "3g") @@ -44,8 +49,40 @@ object FilterPushdownBenchmark { .setIfMissing("orc.compression", "snappy") .setIfMissing("spark.sql.parquet.compression.codec", "snappy") + private val numRows = 1024 * 1024 * 15 + private val width = 5 + private val mid = numRows / 2 + private val blockSize = 1048576 + private val spark = SparkSession.builder().config(conf).getOrCreate() + private var out: OutputStream = _ + + override def beforeAll() { + super.beforeAll() + out = new FileOutputStream(new File("benchmarks/FilterPushdownBenchmark-results.txt")) + } + + override def beforeEach(td: TestData) { + super.beforeEach(td) + val separator = "=" * 96 + val testHeader = (separator + '\n' + td.name + '\n' + separator + '\n' + '\n').getBytes + out.write(testHeader) + } + + override def afterEach(td: TestData) { + out.write('\n') + super.afterEach(td) + } + + override def afterAll() { + try { + out.close() + } finally { + super.afterAll() + } + } + def withTempPath(f: File => Unit): Unit = { val path = Utils.createTempDir() path.delete() @@ -81,8 +118,7 @@ object FilterPushdownBenchmark { .withColumn("value", valueCol) .sort("value") - saveAsOrcTable(df, dir.getCanonicalPath + "/orc") - saveAsParquetTable(df, dir.getCanonicalPath + "/parquet") + saveAsTable(df, dir) } private def prepareStringDictTable( @@ -93,19 +129,22 @@ object FilterPushdownBenchmark { } val df = spark.range(numRows).selectExpr(selectExpr: _*).sort("value") - saveAsOrcTable(df, dir.getCanonicalPath + "/orc") - saveAsParquetTable(df, dir.getCanonicalPath + "/parquet") + saveAsTable(df, dir) } - private def saveAsOrcTable(df: DataFrame, dir: String): Unit = { - // To always turn on dictionary encoding, we set 1.0 at the threshold (the default is 0.8) - df.write.mode("overwrite").option("orc.dictionary.key.threshold", 1.0).orc(dir) - spark.read.orc(dir).createOrReplaceTempView("orcTable") - } + private def saveAsTable(df: DataFrame, dir: File): Unit = { + val orcPath = dir.getCanonicalPath + "/orc" + val parquetPath = dir.getCanonicalPath + "/parquet" - private def saveAsParquetTable(df: DataFrame, dir: String): Unit = { - df.write.mode("overwrite").parquet(dir) - spark.read.parquet(dir).createOrReplaceTempView("parquetTable") + // To always turn on dictionary encoding, we set 1.0 at the threshold (the default is 0.8) + df.write.mode("overwrite") + .option("orc.dictionary.key.threshold", 1.0) + .option("orc.stripe.size", blockSize).orc(orcPath) + 
spark.read.orc(orcPath).createOrReplaceTempView("orcTable") + + df.write.mode("overwrite") + .option("parquet.block.size", blockSize).parquet(parquetPath) + spark.read.parquet(parquetPath).createOrReplaceTempView("parquetTable") } def filterPushDownBenchmark( @@ -113,7 +152,7 @@ object FilterPushdownBenchmark { title: String, whereExpr: String, selectExpr: String = "*"): Unit = { - val benchmark = new Benchmark(title, values, minNumIters = 5) + val benchmark = new Benchmark(title, values, minNumIters = 5, output = Some(out)) Seq(false, true).foreach { pushDownEnabled => val name = s"Parquet Vectorized ${if (pushDownEnabled) s"(Pushdown)" else ""}" @@ -133,214 +172,6 @@ object FilterPushdownBenchmark { } } - /* - OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 4.14.33-51.37.amzn1.x86_64 - Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz - Select 0 string row (value IS NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 9201 / 9300 1.7 585.0 1.0X - Parquet Vectorized (Pushdown) 89 / 105 176.3 5.7 103.1X - Native ORC Vectorized 8886 / 8898 1.8 564.9 1.0X - Native ORC Vectorized (Pushdown) 110 / 128 143.4 7.0 83.9X - - - Select 0 string row - ('7864320' < value < '7864320'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 9336 / 9357 1.7 593.6 1.0X - Parquet Vectorized (Pushdown) 927 / 937 17.0 58.9 10.1X - Native ORC Vectorized 9026 / 9041 1.7 573.9 1.0X - Native ORC Vectorized (Pushdown) 257 / 272 61.1 16.4 36.3X - - - Select 1 string row (value = '7864320'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 9209 / 9223 1.7 585.5 1.0X - Parquet Vectorized (Pushdown) 908 / 925 17.3 57.7 10.1X - Native ORC Vectorized 8878 / 8904 1.8 564.4 1.0X - Native ORC Vectorized (Pushdown) 248 / 261 63.4 15.8 37.1X - - - Select 1 string row - (value <=> '7864320'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 9194 / 9216 1.7 584.5 1.0X - Parquet Vectorized (Pushdown) 899 / 908 17.5 57.2 10.2X - Native ORC Vectorized 8934 / 8962 1.8 568.0 1.0X - Native ORC Vectorized (Pushdown) 249 / 254 63.3 15.8 37.0X - - - Select 1 string row - ('7864320' <= value <= '7864320'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 9332 / 9351 1.7 593.3 1.0X - Parquet Vectorized (Pushdown) 915 / 934 17.2 58.2 10.2X - Native ORC Vectorized 9049 / 9057 1.7 575.3 1.0X - Native ORC Vectorized (Pushdown) 248 / 258 63.5 15.8 37.7X - - - Select all string rows - (value IS NOT NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 20478 / 20497 0.8 1301.9 1.0X - Parquet Vectorized (Pushdown) 20461 / 20550 0.8 1300.9 1.0X - Native ORC Vectorized 27464 / 27482 0.6 1746.1 0.7X - Native ORC Vectorized (Pushdown) 27454 / 27488 0.6 1745.5 0.7X - - - Select 0 int row (value IS NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - 
Parquet Vectorized 8489 / 8519 1.9 539.7 1.0X - Parquet Vectorized (Pushdown) 64 / 69 246.1 4.1 132.8X - Native ORC Vectorized 8064 / 8099 2.0 512.7 1.1X - Native ORC Vectorized (Pushdown) 88 / 94 178.6 5.6 96.4X - - - Select 0 int row - (7864320 < value < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 8494 / 8514 1.9 540.0 1.0X - Parquet Vectorized (Pushdown) 835 / 840 18.8 53.1 10.2X - Native ORC Vectorized 8090 / 8106 1.9 514.4 1.0X - Native ORC Vectorized (Pushdown) 249 / 257 63.2 15.8 34.1X - - - Select 1 int row (value = 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 8552 / 8560 1.8 543.7 1.0X - Parquet Vectorized (Pushdown) 837 / 841 18.8 53.2 10.2X - Native ORC Vectorized 8178 / 8188 1.9 519.9 1.0X - Native ORC Vectorized (Pushdown) 249 / 258 63.2 15.8 34.4X - - - Select 1 int row (value <=> 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 8562 / 8580 1.8 544.3 1.0X - Parquet Vectorized (Pushdown) 833 / 836 18.9 53.0 10.3X - Native ORC Vectorized 8164 / 8185 1.9 519.0 1.0X - Native ORC Vectorized (Pushdown) 245 / 254 64.3 15.6 35.0X - - - Select 1 int row - (7864320 <= value <= 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 8540 / 8555 1.8 542.9 1.0X - Parquet Vectorized (Pushdown) 837 / 839 18.8 53.2 10.2X - Native ORC Vectorized 8182 / 8231 1.9 520.2 1.0X - Native ORC Vectorized (Pushdown) 250 / 259 62.9 15.9 34.1X - - - Select 1 int row - (7864319 < value < 7864321): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 8535 / 8555 1.8 542.6 1.0X - Parquet Vectorized (Pushdown) 835 / 841 18.8 53.1 10.2X - Native ORC Vectorized 8159 / 8179 1.9 518.8 1.0X - Native ORC Vectorized (Pushdown) 244 / 250 64.5 15.5 35.0X - - - Select 10% int rows (value < 1572864): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 9609 / 9634 1.6 610.9 1.0X - Parquet Vectorized (Pushdown) 2663 / 2672 5.9 169.3 3.6X - Native ORC Vectorized 9824 / 9850 1.6 624.6 1.0X - Native ORC Vectorized (Pushdown) 2717 / 2722 5.8 172.7 3.5X - - - Select 50% int rows (value < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 13592 / 13613 1.2 864.2 1.0X - Parquet Vectorized (Pushdown) 9720 / 9738 1.6 618.0 1.4X - Native ORC Vectorized 16366 / 16397 1.0 1040.5 0.8X - Native ORC Vectorized (Pushdown) 12437 / 12459 1.3 790.7 1.1X - - - Select 90% int rows (value < 14155776): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 17580 / 17617 0.9 1117.7 1.0X - Parquet Vectorized (Pushdown) 16803 / 16827 0.9 1068.3 1.0X - Native ORC Vectorized 24169 / 24187 0.7 1536.6 0.7X - Native ORC Vectorized (Pushdown) 22147 / 22341 0.7 1408.1 0.8X - 
- - Select all int rows (value IS NOT NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 18461 / 18491 0.9 1173.7 1.0X - Parquet Vectorized (Pushdown) 18466 / 18530 0.9 1174.1 1.0X - Native ORC Vectorized 24231 / 24270 0.6 1540.6 0.8X - Native ORC Vectorized (Pushdown) 24207 / 24304 0.6 1539.0 0.8X - - - Select all int rows (value > -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 18414 / 18453 0.9 1170.7 1.0X - Parquet Vectorized (Pushdown) 18435 / 18464 0.9 1172.1 1.0X - Native ORC Vectorized 24430 / 24454 0.6 1553.2 0.8X - Native ORC Vectorized (Pushdown) 24410 / 24465 0.6 1552.0 0.8X - - - Select all int rows (value != -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 18446 / 18457 0.9 1172.8 1.0X - Parquet Vectorized (Pushdown) 18428 / 18440 0.9 1171.6 1.0X - Native ORC Vectorized 24414 / 24450 0.6 1552.2 0.8X - Native ORC Vectorized (Pushdown) 24385 / 24472 0.6 1550.4 0.8X - - - Select 0 distinct string row - (value IS NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 8322 / 8352 1.9 529.1 1.0X - Parquet Vectorized (Pushdown) 53 / 57 296.3 3.4 156.7X - Native ORC Vectorized 7903 / 7953 2.0 502.4 1.1X - Native ORC Vectorized (Pushdown) 80 / 82 197.2 5.1 104.3X - - - Select 0 distinct string row - ('100' < value < '100'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 8712 / 8743 1.8 553.9 1.0X - Parquet Vectorized (Pushdown) 995 / 1030 15.8 63.3 8.8X - Native ORC Vectorized 8345 / 8362 1.9 530.6 1.0X - Native ORC Vectorized (Pushdown) 84 / 87 187.6 5.3 103.9X - - - Select 1 distinct string row - (value = '100'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 8574 / 8610 1.8 545.1 1.0X - Parquet Vectorized (Pushdown) 1127 / 1135 14.0 71.6 7.6X - Native ORC Vectorized 8163 / 8181 1.9 519.0 1.1X - Native ORC Vectorized (Pushdown) 426 / 433 36.9 27.1 20.1X - - - Select 1 distinct string row - (value <=> '100'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 8549 / 8568 1.8 543.5 1.0X - Parquet Vectorized (Pushdown) 1124 / 1131 14.0 71.4 7.6X - Native ORC Vectorized 8163 / 8210 1.9 519.0 1.0X - Native ORC Vectorized (Pushdown) 426 / 436 36.9 27.1 20.1X - - - Select 1 distinct string row - ('100' <= value <= '100'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Parquet Vectorized 8889 / 8896 1.8 565.2 1.0X - Parquet Vectorized (Pushdown) 1161 / 1168 13.6 73.8 7.7X - Native ORC Vectorized 8519 / 8554 1.8 541.6 1.0X - Native ORC Vectorized (Pushdown) 430 / 437 36.6 27.3 20.7X - - - Select all distinct string rows - (value IS NOT NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - 
------------------------------------------------------------------------------------------------ - Parquet Vectorized 20433 / 20533 0.8 1299.1 1.0X - Parquet Vectorized (Pushdown) 20433 / 20456 0.8 1299.1 1.0X - Native ORC Vectorized 25435 / 25513 0.6 1617.1 0.8X - Native ORC Vectorized (Pushdown) 25435 / 25507 0.6 1617.1 0.8X - */ - benchmark.run() } @@ -408,14 +239,8 @@ object FilterPushdownBenchmark { } } - def main(args: Array[String]): Unit = { - val numRows = 1024 * 1024 * 15 - val width = 5 - - // Pushdown for many distinct value case + ignore("Pushdown for many distinct value case") { withTempPath { dir => - val mid = numRows / 2 - withTempTable("orcTable", "patquetTable") { Seq(true, false).foreach { useStringForValue => prepareTable(dir, numRows, width, useStringForValue) @@ -427,16 +252,122 @@ object FilterPushdownBenchmark { } } } + } - // Pushdown for few distinct value case (use dictionary encoding) + ignore("Pushdown for few distinct value case (use dictionary encoding)") { withTempPath { dir => val numDistinctValues = 200 - val mid = numDistinctValues / 2 withTempTable("orcTable", "patquetTable") { prepareStringDictTable(dir, numRows, numDistinctValues, width) - runStringBenchmark(numRows, width, mid, "distinct string") + runStringBenchmark(numRows, width, numDistinctValues / 2, "distinct string") } } } + + ignore("Pushdown benchmark for StringStartsWith") { + withTempPath { dir => + withTempTable("orcTable", "patquetTable") { + prepareTable(dir, numRows, width, true) + Seq( + "value like '10%'", + "value like '1000%'", + s"value like '${mid.toString.substring(0, mid.toString.length - 1)}%'" + ).foreach { whereExpr => + val title = s"StringStartsWith filter: ($whereExpr)" + filterPushDownBenchmark(numRows, title, whereExpr) + } + } + } + } + + ignore(s"Pushdown benchmark for ${DecimalType.simpleString}") { + withTempPath { dir => + Seq( + s"decimal(${Decimal.MAX_INT_DIGITS}, 2)", + s"decimal(${Decimal.MAX_LONG_DIGITS}, 2)", + s"decimal(${DecimalType.MAX_PRECISION}, 2)" + ).foreach { dt => + val columns = (1 to width).map(i => s"CAST(id AS string) c$i") + val df = spark.range(numRows).selectExpr(columns: _*) + .withColumn("value", monotonically_increasing_id().cast(dt)) + withTempTable("orcTable", "patquetTable") { + saveAsTable(df, dir) + + Seq(s"value = $mid").foreach { whereExpr => + val title = s"Select 1 $dt row ($whereExpr)".replace("value AND value", "value") + filterPushDownBenchmark(numRows, title, whereExpr) + } + + val selectExpr = (1 to width).map(i => s"MAX(c$i)").mkString("", ",", ", MAX(value)") + Seq(10, 50, 90).foreach { percent => + filterPushDownBenchmark( + numRows, + s"Select $percent% $dt rows (value < ${numRows * percent / 100})", + s"value < ${numRows * percent / 100}", + selectExpr + ) + } + } + } + } + } + + ignore("Pushdown benchmark for InSet -> InFilters") { + withTempPath { dir => + withTempTable("orcTable", "patquetTable") { + prepareTable(dir, numRows, width, false) + Seq(5, 10, 50, 100).foreach { count => + Seq(10, 50, 90).foreach { distribution => + val filter = + Range(0, count).map(r => scala.util.Random.nextInt(numRows * distribution / 100)) + val whereExpr = s"value in(${filter.mkString(",")})" + val title = s"InSet -> InFilters (values count: $count, distribution: $distribution)" + filterPushDownBenchmark(numRows, title, whereExpr) + } + } + } + } + } + + ignore(s"Pushdown benchmark for ${ByteType.simpleString}") { + withTempPath { dir => + val columns = (1 to width).map(i => s"CAST(id AS string) c$i") + val df = 
spark.range(numRows).selectExpr(columns: _*) + .withColumn("value", (monotonically_increasing_id() % Byte.MaxValue).cast(ByteType)) + .orderBy("value") + withTempTable("orcTable", "patquetTable") { + saveAsTable(df, dir) + + Seq(s"value = CAST(${Byte.MaxValue / 2} AS ${ByteType.simpleString})") + .foreach { whereExpr => + val title = s"Select 1 ${ByteType.simpleString} row ($whereExpr)" + .replace("value AND value", "value") + filterPushDownBenchmark(numRows, title, whereExpr) + } + + val selectExpr = (1 to width).map(i => s"MAX(c$i)").mkString("", ",", ", MAX(value)") + Seq(10, 50, 90).foreach { percent => + filterPushDownBenchmark( + numRows, + s"Select $percent% ${ByteType.simpleString} rows " + + s"(value < CAST(${Byte.MaxValue * percent / 100} AS ${ByteType.simpleString}))", + s"value < CAST(${Byte.MaxValue * percent / 100} AS ${ByteType.simpleString})", + selectExpr + ) + } + } + } + } +} + +trait BenchmarkBeforeAndAfterEachTest extends BeforeAndAfterEachTestData { this: Suite => + + override def beforeEach(td: TestData) { + super.beforeEach(td) + } + + override def afterEach(td: TestData) { + super.afterEach(td) + } } From 141953f4c44dbad1c2a7059e92bec5fe770af932 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Fri, 6 Jul 2018 00:08:03 -0700 Subject: [PATCH 66/79] [SPARK-24535][SPARKR] fix tests on java check error ## What changes were proposed in this pull request? change to skip tests if - couldn't determine java version fix problem on windows ## How was this patch tested? unit test, manual, win-builder Author: Felix Cheung Closes #21666 from felixcheung/rjavaskip. --- R/pkg/R/client.R | 24 +++++++++++++++--------- R/pkg/R/sparkR.R | 2 +- R/pkg/inst/tests/testthat/test_basic.R | 8 ++++++++ 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R index 4c87f64e7f0e1..660f0864403e0 100644 --- a/R/pkg/R/client.R +++ b/R/pkg/R/client.R @@ -71,15 +71,20 @@ checkJavaVersion <- function() { # If java is missing from PATH, we get an error in Unix and a warning in Windows javaVersionOut <- tryCatch( - launchScript(javaBin, "-version", wait = TRUE, stdout = TRUE, stderr = TRUE), - error = function(e) { - stop("Java version check failed. Please make sure Java is installed", - " and set JAVA_HOME to point to the installation directory.", e) - }, - warning = function(w) { - stop("Java version check failed. Please make sure Java is installed", - " and set JAVA_HOME to point to the installation directory.", w) - }) + if (is_windows()) { + # See SPARK-24535 + system2(javaBin, "-version", wait = TRUE, stdout = TRUE, stderr = TRUE) + } else { + launchScript(javaBin, "-version", wait = TRUE, stdout = TRUE, stderr = TRUE) + }, + error = function(e) { + stop("Java version check failed. Please make sure Java is installed", + " and set JAVA_HOME to point to the installation directory.", e) + }, + warning = function(w) { + stop("Java version check failed. 
Please make sure Java is installed", + " and set JAVA_HOME to point to the installation directory.", w) + }) javaVersionFilter <- Filter( function(x) { grepl(" version", x) @@ -93,6 +98,7 @@ checkJavaVersion <- function() { stop(paste("Java version", sparkJavaVersion, "is required for this package; found version:", javaVersionStr)) } + return(javaVersionNum) } launchBackend <- function(args, sparkHome, jars, sparkSubmitOpts, packages) { diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index f7c1663d32c96..d3a9cbae7d808 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -167,7 +167,7 @@ sparkR.sparkContext <- function( submitOps <- getClientModeSparkSubmitOpts( Sys.getenv("SPARKR_SUBMIT_ARGS", "sparkr-shell"), sparkEnvirMap) - checkJavaVersion() + invisible(checkJavaVersion()) launchBackend( args = path, sparkHome = sparkHome, diff --git a/R/pkg/inst/tests/testthat/test_basic.R b/R/pkg/inst/tests/testthat/test_basic.R index 823d26f12feee..243f5f0298284 100644 --- a/R/pkg/inst/tests/testthat/test_basic.R +++ b/R/pkg/inst/tests/testthat/test_basic.R @@ -18,6 +18,10 @@ context("basic tests for CRAN") test_that("create DataFrame from list or data.frame", { + tryCatch( checkJavaVersion(), + error = function(e) { skip("error on Java check") }, + warning = function(e) { skip("warning on Java check") } ) + sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE, sparkConfig = sparkRTestConfig) @@ -50,6 +54,10 @@ test_that("create DataFrame from list or data.frame", { }) test_that("spark.glm and predict", { + tryCatch( checkJavaVersion(), + error = function(e) { skip("error on Java check") }, + warning = function(e) { skip("warning on Java check") } ) + sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE, sparkConfig = sparkRTestConfig) From a381bce7285ec30f58f28f523dfcfe0c13221bbf Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Fri, 6 Jul 2018 18:28:54 +0800 Subject: [PATCH 67/79] [SPARK-24673][SQL][PYTHON][FOLLOWUP] Support Column arguments in timezone of from_utc_timestamp/to_utc_timestamp ## What changes were proposed in this pull request? This pr supported column arguments in timezone of `from_utc_timestamp/to_utc_timestamp` (follow-up of #21693). ## How was this patch tested? Added tests. Author: Takeshi Yamamuro Closes #21723 from maropu/SPARK-24673-FOLLOWUP. --- python/pyspark/sql/functions.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 4d371976364d3..55e7d575b4681 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1285,11 +1285,21 @@ def from_utc_timestamp(timestamp, tz): that time as a timestamp in the given time zone. For example, 'GMT+1' would yield '2017-07-14 03:40:00.0'. - >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t']) - >>> df.select(from_utc_timestamp(df.t, "PST").alias('local_time')).collect() + :param timestamp: the column that contains timestamps + :param tz: a string that has the ID of timezone, e.g. "GMT", "America/Los_Angeles", etc + + .. versionchanged:: 2.4 + `tz` can take a :class:`Column` containing timezone ID strings. 
+ + >>> df = spark.createDataFrame([('1997-02-28 10:30:00', 'JST')], ['ts', 'tz']) + >>> df.select(from_utc_timestamp(df.ts, "PST").alias('local_time')).collect() [Row(local_time=datetime.datetime(1997, 2, 28, 2, 30))] + >>> df.select(from_utc_timestamp(df.ts, df.tz).alias('local_time')).collect() + [Row(local_time=datetime.datetime(1997, 2, 28, 19, 30))] """ sc = SparkContext._active_spark_context + if isinstance(tz, Column): + tz = _to_java_column(tz) return Column(sc._jvm.functions.from_utc_timestamp(_to_java_column(timestamp), tz)) @@ -1300,11 +1310,21 @@ def to_utc_timestamp(timestamp, tz): zone, and renders that time as a timestamp in UTC. For example, 'GMT+1' would yield '2017-07-14 01:40:00.0'. - >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['ts']) + :param timestamp: the column that contains timestamps + :param tz: a string that has the ID of timezone, e.g. "GMT", "America/Los_Angeles", etc + + .. versionchanged:: 2.4 + `tz` can take a :class:`Column` containing timezone ID strings. + + >>> df = spark.createDataFrame([('1997-02-28 10:30:00', 'JST')], ['ts', 'tz']) >>> df.select(to_utc_timestamp(df.ts, "PST").alias('utc_time')).collect() [Row(utc_time=datetime.datetime(1997, 2, 28, 18, 30))] + >>> df.select(to_utc_timestamp(df.ts, df.tz).alias('utc_time')).collect() + [Row(utc_time=datetime.datetime(1997, 2, 28, 1, 30))] """ sc = SparkContext._active_spark_context + if isinstance(tz, Column): + tz = _to_java_column(tz) return Column(sc._jvm.functions.to_utc_timestamp(_to_java_column(timestamp), tz)) From 4de0425df8d2545718a0583bc26592108aebc5ac Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 7 Jul 2018 10:54:14 +0800 Subject: [PATCH 68/79] [SPARK-24569][SQL] Aggregator with output type Option should produce consistent schema ## What changes were proposed in this pull request? SQL `Aggregator` with output type `Option[Boolean]` creates column of type `StructType`. It's not in consistency with a Dataset of similar java class. This changes the way `definedByConstructorParams` checks given type. For `Option[_]`, it goes to check its type argument. ## How was this patch tested? Added test. Author: Liang-Chi Hsieh Closes #21611 from viirya/SPARK-24569. --- .../spark/sql/catalyst/ScalaReflection.scala | 7 ++- .../spark/sql/DatasetAggregatorSuite.scala | 60 +++++++++++++++++++ .../org/apache/spark/sql/DatasetSuite.scala | 11 ++++ 3 files changed, 77 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index f9acc208b715e..4543bba8f6ed4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -798,7 +798,12 @@ object ScalaReflection extends ScalaReflection { * Whether the fields of the given type is defined entirely by its constructor parameters. */ def definedByConstructorParams(tpe: Type): Boolean = cleanUpReflectionObjects { - tpe.dealias <:< localTypeOf[Product] || tpe.dealias <:< localTypeOf[DefinedByConstructorParams] + tpe.dealias match { + // `Option` is a `Product`, but we don't wanna treat `Option[Int]` as a struct type. 
+ case t if t <:< localTypeOf[Option[_]] => definedByConstructorParams(t.typeArgs.head) + case _ => tpe.dealias <:< localTypeOf[Product] || + tpe.dealias <:< localTypeOf[DefinedByConstructorParams] + } } private val javaKeywords = Set("abstract", "assert", "boolean", "break", "byte", "case", "catch", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala index 0e7eaa9e88d57..538ea3c66c40e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala @@ -148,6 +148,41 @@ object VeryComplexResultAgg extends Aggregator[Row, String, ComplexAggData] { } +case class OptionBooleanData(name: String, isGood: Option[Boolean]) + +case class OptionBooleanAggregator(colName: String) + extends Aggregator[Row, Option[Boolean], Option[Boolean]] { + + override def zero: Option[Boolean] = None + + override def reduce(buffer: Option[Boolean], row: Row): Option[Boolean] = { + val index = row.fieldIndex(colName) + val value = if (row.isNullAt(index)) { + Option.empty[Boolean] + } else { + Some(row.getBoolean(index)) + } + merge(buffer, value) + } + + override def merge(b1: Option[Boolean], b2: Option[Boolean]): Option[Boolean] = { + if ((b1.isDefined && b1.get) || (b2.isDefined && b2.get)) { + Some(true) + } else if (b1.isDefined) { + b1 + } else { + b2 + } + } + + override def finish(reduction: Option[Boolean]): Option[Boolean] = reduction + + override def bufferEncoder: Encoder[Option[Boolean]] = OptionalBoolEncoder + override def outputEncoder: Encoder[Option[Boolean]] = OptionalBoolEncoder + + def OptionalBoolEncoder: Encoder[Option[Boolean]] = ExpressionEncoder() +} + class DatasetAggregatorSuite extends QueryTest with SharedSQLContext { import testImplicits._ @@ -333,4 +368,29 @@ class DatasetAggregatorSuite extends QueryTest with SharedSQLContext { df.groupBy($"i").agg(VeryComplexResultAgg.toColumn), Row(1, Row(Row(1, "a"), Row(1, "a"))) :: Row(2, Row(Row(2, "bc"), Row(2, "bc"))) :: Nil) } + + test("SPARK-24569: Aggregator with output type Option[Boolean] creates column of type Row") { + val df = Seq( + OptionBooleanData("bob", Some(true)), + OptionBooleanData("bob", Some(false)), + OptionBooleanData("bob", None)).toDF() + val group = df + .groupBy("name") + .agg(OptionBooleanAggregator("isGood").toColumn.alias("isGood")) + assert(df.schema == group.schema) + checkAnswer(group, Row("bob", true) :: Nil) + checkDataset(group.as[OptionBooleanData], OptionBooleanData("bob", Some(true))) + } + + test("SPARK-24569: groupByKey with Aggregator of output type Option[Boolean]") { + val df = Seq( + OptionBooleanData("bob", Some(true)), + OptionBooleanData("bob", Some(false)), + OptionBooleanData("bob", None)).toDF() + val grouped = df.groupByKey((r: Row) => r.getString(0)) + .agg(OptionBooleanAggregator("isGood").toColumn).toDF("name", "isGood") + + assert(grouped.schema == df.schema) + checkDataset(grouped.as[OptionBooleanData], OptionBooleanData("bob", Some(true))) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 2d20c50584c03..ce8db99d4e2f1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -1467,6 +1467,17 @@ class DatasetSuite extends QueryTest with SharedSQLContext { 
intercept[NullPointerException](ds.as[(Int, Int)].collect()) } + test("SPARK-24569: Option of primitive types are mistakenly mapped to struct type") { + withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "true") { + val a = Seq(Some(1)).toDS + val b = Seq(Some(1.2)).toDS + val expected = Seq((Some(1), Some(1.2))).toDS + val joined = a.joinWith(b, lit(true)) + assert(joined.schema == expected.schema) + checkDataset(joined, expected.collect: _*) + } + } + test("SPARK-24548: Dataset with tuple encoders should have correct schema") { val encoder = Encoders.tuple(newStringEncoder, Encoders.tuple(newStringEncoder, newStringEncoder)) From fc43690d36e7a17e45826a69ab86935fb0ee2be4 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 7 Jul 2018 11:34:30 +0800 Subject: [PATCH 69/79] [SPARK-24749][SQL] Use sameType to compare Array's element type in ArrayContains ## What changes were proposed in this pull request? We should use `DataType.sameType` to compare element type in `ArrayContains`, otherwise nullability affects comparison result. ## How was this patch tested? Added test. Author: Liang-Chi Hsieh Closes #21724 from viirya/SPARK-24749. --- .../sql/catalyst/expressions/collectionOperations.scala | 2 +- .../catalyst/expressions/CollectionExpressionsSuite.scala | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 8b278f067749e..fcac3a58e6a95 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -1085,7 +1085,7 @@ case class ArrayContains(left: Expression, right: Expression) if (right.dataType == NullType) { TypeCheckResult.TypeCheckFailure("Null typed values cannot be used as arguments") } else if (!left.dataType.isInstanceOf[ArrayType] - || left.dataType.asInstanceOf[ArrayType].elementType != right.dataType) { + || !left.dataType.asInstanceOf[ArrayType].elementType.sameType(right.dataType)) { TypeCheckResult.TypeCheckFailure( "Arguments must be an array followed by a value of same type as the array members") } else { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala index d7744eb4c7dc7..496ee1d496a36 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala @@ -213,6 +213,8 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper val a1 = Literal.create(Seq[String](null, ""), ArrayType(StringType)) val a2 = Literal.create(Seq(null), ArrayType(LongType)) val a3 = Literal.create(null, ArrayType(StringType)) + val a4 = Literal.create(Seq(create_row(1)), ArrayType(StructType(Seq( + StructField("a", IntegerType, true))))) checkEvaluation(ArrayContains(a0, Literal(1)), true) checkEvaluation(ArrayContains(a0, Literal(0)), false) @@ -228,6 +230,11 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(ArrayContains(a3, Literal("")), null) checkEvaluation(ArrayContains(a3, Literal.create(null, StringType)), null) + 
checkEvaluation(ArrayContains(a4, Literal.create(create_row(1), StructType(Seq( + StructField("a", IntegerType, false))))), true) + checkEvaluation(ArrayContains(a4, Literal.create(create_row(0), StructType(Seq( + StructField("a", IntegerType, false))))), false) + // binary val b0 = Literal.create(Seq[Array[Byte]](Array[Byte](5, 6), Array[Byte](1, 2)), ArrayType(BinaryType)) From 74f6a92fcea9196d62c2d531c11ec7efd580b760 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 7 Jul 2018 11:37:41 +0800 Subject: [PATCH 70/79] [SPARK-24739][PYTHON] Make PySpark compatible with Python 3.7 ## What changes were proposed in this pull request? This PR proposes to make PySpark compatible with Python 3.7. There are rather radical change in semantic of `StopIteration` within a generator. It now throws it as a `RuntimeError`. To make it compatible, we should fix it: ```python try: next(...) except StopIteration return ``` See [release note](https://docs.python.org/3/whatsnew/3.7.html#porting-to-python-3-7) and [PEP 479](https://www.python.org/dev/peps/pep-0479/). ## How was this patch tested? Manually tested: ``` $ ./run-tests --python-executables=python3.7 Running PySpark tests. Output is in /.../spark/python/unit-tests.log Will test against the following Python executables: ['python3.7'] Will test the following Python modules: ['pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-sql', 'pyspark-streaming'] Starting test(python3.7): pyspark.mllib.tests Starting test(python3.7): pyspark.sql.tests Starting test(python3.7): pyspark.streaming.tests Starting test(python3.7): pyspark.tests Finished test(python3.7): pyspark.streaming.tests (130s) Starting test(python3.7): pyspark.accumulators Finished test(python3.7): pyspark.accumulators (8s) Starting test(python3.7): pyspark.broadcast Finished test(python3.7): pyspark.broadcast (9s) Starting test(python3.7): pyspark.conf Finished test(python3.7): pyspark.conf (6s) Starting test(python3.7): pyspark.context Finished test(python3.7): pyspark.context (27s) Starting test(python3.7): pyspark.ml.classification Finished test(python3.7): pyspark.tests (200s) ... 3 tests were skipped Starting test(python3.7): pyspark.ml.clustering Finished test(python3.7): pyspark.mllib.tests (244s) Starting test(python3.7): pyspark.ml.evaluation Finished test(python3.7): pyspark.ml.classification (63s) Starting test(python3.7): pyspark.ml.feature Finished test(python3.7): pyspark.ml.clustering (48s) Starting test(python3.7): pyspark.ml.fpm Finished test(python3.7): pyspark.ml.fpm (0s) Starting test(python3.7): pyspark.ml.image Finished test(python3.7): pyspark.ml.evaluation (23s) Starting test(python3.7): pyspark.ml.linalg.__init__ Finished test(python3.7): pyspark.ml.linalg.__init__ (0s) Starting test(python3.7): pyspark.ml.recommendation Finished test(python3.7): pyspark.ml.image (20s) Starting test(python3.7): pyspark.ml.regression Finished test(python3.7): pyspark.ml.regression (58s) Starting test(python3.7): pyspark.ml.stat Finished test(python3.7): pyspark.ml.feature (90s) Starting test(python3.7): pyspark.ml.tests Finished test(python3.7): pyspark.ml.recommendation (82s) Starting test(python3.7): pyspark.ml.tuning Finished test(python3.7): pyspark.ml.stat (27s) Starting test(python3.7): pyspark.mllib.classification Finished test(python3.7): pyspark.sql.tests (362s) ... 
102 tests were skipped Starting test(python3.7): pyspark.mllib.clustering Finished test(python3.7): pyspark.ml.tuning (29s) Starting test(python3.7): pyspark.mllib.evaluation Finished test(python3.7): pyspark.mllib.classification (39s) Starting test(python3.7): pyspark.mllib.feature Finished test(python3.7): pyspark.mllib.evaluation (30s) Starting test(python3.7): pyspark.mllib.fpm Finished test(python3.7): pyspark.mllib.feature (44s) Starting test(python3.7): pyspark.mllib.linalg.__init__ Finished test(python3.7): pyspark.mllib.linalg.__init__ (0s) Starting test(python3.7): pyspark.mllib.linalg.distributed Finished test(python3.7): pyspark.mllib.clustering (78s) Starting test(python3.7): pyspark.mllib.random Finished test(python3.7): pyspark.mllib.fpm (33s) Starting test(python3.7): pyspark.mllib.recommendation Finished test(python3.7): pyspark.mllib.random (12s) Starting test(python3.7): pyspark.mllib.regression Finished test(python3.7): pyspark.mllib.linalg.distributed (45s) Starting test(python3.7): pyspark.mllib.stat.KernelDensity Finished test(python3.7): pyspark.mllib.stat.KernelDensity (0s) Starting test(python3.7): pyspark.mllib.stat._statistics Finished test(python3.7): pyspark.mllib.recommendation (41s) Starting test(python3.7): pyspark.mllib.tree Finished test(python3.7): pyspark.mllib.regression (44s) Starting test(python3.7): pyspark.mllib.util Finished test(python3.7): pyspark.mllib.stat._statistics (20s) Starting test(python3.7): pyspark.profiler Finished test(python3.7): pyspark.mllib.tree (26s) Starting test(python3.7): pyspark.rdd Finished test(python3.7): pyspark.profiler (11s) Starting test(python3.7): pyspark.serializers Finished test(python3.7): pyspark.mllib.util (24s) Starting test(python3.7): pyspark.shuffle Finished test(python3.7): pyspark.shuffle (0s) Starting test(python3.7): pyspark.sql.catalog Finished test(python3.7): pyspark.serializers (15s) Starting test(python3.7): pyspark.sql.column Finished test(python3.7): pyspark.rdd (27s) Starting test(python3.7): pyspark.sql.conf Finished test(python3.7): pyspark.sql.catalog (24s) Starting test(python3.7): pyspark.sql.context Finished test(python3.7): pyspark.sql.conf (8s) Starting test(python3.7): pyspark.sql.dataframe Finished test(python3.7): pyspark.sql.column (29s) Starting test(python3.7): pyspark.sql.functions Finished test(python3.7): pyspark.sql.context (26s) Starting test(python3.7): pyspark.sql.group Finished test(python3.7): pyspark.sql.dataframe (51s) Starting test(python3.7): pyspark.sql.readwriter Finished test(python3.7): pyspark.ml.tests (266s) Starting test(python3.7): pyspark.sql.session Finished test(python3.7): pyspark.sql.group (36s) Starting test(python3.7): pyspark.sql.streaming Finished test(python3.7): pyspark.sql.functions (57s) Starting test(python3.7): pyspark.sql.types Finished test(python3.7): pyspark.sql.session (25s) Starting test(python3.7): pyspark.sql.udf Finished test(python3.7): pyspark.sql.types (10s) Starting test(python3.7): pyspark.sql.window Finished test(python3.7): pyspark.sql.readwriter (31s) Starting test(python3.7): pyspark.streaming.util Finished test(python3.7): pyspark.sql.streaming (22s) Starting test(python3.7): pyspark.util Finished test(python3.7): pyspark.util (0s) Finished test(python3.7): pyspark.streaming.util (0s) Finished test(python3.7): pyspark.sql.udf (16s) Finished test(python3.7): pyspark.sql.window (12s) ``` In my local (I have two Macs but both have the same issues), I currently faced some issues for now to install both extra dependencies PyArrow 
and Pandas same as Jenkins's, against Python 3.7. Author: hyukjinkwon Closes #21714 from HyukjinKwon/SPARK-24739. --- python/pyspark/rdd.py | 5 ++++- python/setup.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 7e7e5822a6b20..951851804b1d8 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -1370,7 +1370,10 @@ def takeUpToNumLeft(iterator): iterator = iter(iterator) taken = 0 while taken < left: - yield next(iterator) + try: + yield next(iterator) + except StopIteration: + return taken += 1 p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts)) diff --git a/python/setup.py b/python/setup.py index d309e0564530a..45eb74eb87ce7 100644 --- a/python/setup.py +++ b/python/setup.py @@ -219,6 +219,7 @@ def _supports_symlinks(): 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: PyPy'] ) From 044b33b2ed2d423d798f2a632fab110c46f41567 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 7 Jul 2018 11:39:29 +0800 Subject: [PATCH 71/79] [SPARK-24740][PYTHON][ML] Make PySpark's tests compatible with NumPy 1.14+ ## What changes were proposed in this pull request? This PR proposes to make PySpark's tests compatible with NumPy 0.14+ NumPy 0.14.x introduced rather radical changes about its string representation. For example, the tests below are failed: ``` ********************************************************************** File "/.../spark/python/pyspark/ml/linalg/__init__.py", line 895, in __main__.DenseMatrix.__str__ Failed example: print(dm) Expected: DenseMatrix([[ 0., 2.], [ 1., 3.]]) Got: DenseMatrix([[0., 2.], [1., 3.]]) ********************************************************************** File "/.../spark/python/pyspark/ml/linalg/__init__.py", line 899, in __main__.DenseMatrix.__str__ Failed example: print(dm) Expected: DenseMatrix([[ 0., 1.], [ 2., 3.]]) Got: DenseMatrix([[0., 1.], [2., 3.]]) ********************************************************************** File "/.../spark/python/pyspark/ml/linalg/__init__.py", line 939, in __main__.DenseMatrix.toArray Failed example: m.toArray() Expected: array([[ 0., 2.], [ 1., 3.]]) Got: array([[0., 2.], [1., 3.]]) ********************************************************************** File "/.../spark/python/pyspark/ml/linalg/__init__.py", line 324, in __main__.DenseVector.dot Failed example: dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F')) Expected: array([ 5., 11.]) Got: array([ 5., 11.]) ********************************************************************** File "/.../spark/python/pyspark/ml/linalg/__init__.py", line 567, in __main__.SparseVector.dot Failed example: a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]])) Expected: array([ 22., 22.]) Got: array([22., 22.]) ``` See [release note](https://docs.scipy.org/doc/numpy-1.14.0/release.html#compatibility-notes). ## How was this patch tested? Manually tested: ``` $ ./run-tests --python-executables=python3.6,python2.7 --modules=pyspark-ml,pyspark-mllib Running PySpark tests. 
Output is in /.../spark/python/unit-tests.log Will test against the following Python executables: ['python3.6', 'python2.7'] Will test the following Python modules: ['pyspark-ml', 'pyspark-mllib'] Starting test(python2.7): pyspark.mllib.tests Starting test(python2.7): pyspark.ml.classification Starting test(python3.6): pyspark.mllib.tests Starting test(python2.7): pyspark.ml.clustering Finished test(python2.7): pyspark.ml.clustering (54s) Starting test(python2.7): pyspark.ml.evaluation Finished test(python2.7): pyspark.ml.classification (74s) Starting test(python2.7): pyspark.ml.feature Finished test(python2.7): pyspark.ml.evaluation (27s) Starting test(python2.7): pyspark.ml.fpm Finished test(python2.7): pyspark.ml.fpm (0s) Starting test(python2.7): pyspark.ml.image Finished test(python2.7): pyspark.ml.image (17s) Starting test(python2.7): pyspark.ml.linalg.__init__ Finished test(python2.7): pyspark.ml.linalg.__init__ (1s) Starting test(python2.7): pyspark.ml.recommendation Finished test(python2.7): pyspark.ml.feature (76s) Starting test(python2.7): pyspark.ml.regression Finished test(python2.7): pyspark.ml.recommendation (69s) Starting test(python2.7): pyspark.ml.stat Finished test(python2.7): pyspark.ml.regression (45s) Starting test(python2.7): pyspark.ml.tests Finished test(python2.7): pyspark.ml.stat (28s) Starting test(python2.7): pyspark.ml.tuning Finished test(python2.7): pyspark.ml.tuning (20s) Starting test(python2.7): pyspark.mllib.classification Finished test(python2.7): pyspark.mllib.classification (31s) Starting test(python2.7): pyspark.mllib.clustering Finished test(python2.7): pyspark.mllib.tests (260s) Starting test(python2.7): pyspark.mllib.evaluation Finished test(python3.6): pyspark.mllib.tests (266s) Starting test(python2.7): pyspark.mllib.feature Finished test(python2.7): pyspark.mllib.evaluation (21s) Starting test(python2.7): pyspark.mllib.fpm Finished test(python2.7): pyspark.mllib.feature (38s) Starting test(python2.7): pyspark.mllib.linalg.__init__ Finished test(python2.7): pyspark.mllib.linalg.__init__ (1s) Starting test(python2.7): pyspark.mllib.linalg.distributed Finished test(python2.7): pyspark.mllib.fpm (34s) Starting test(python2.7): pyspark.mllib.random Finished test(python2.7): pyspark.mllib.clustering (64s) Starting test(python2.7): pyspark.mllib.recommendation Finished test(python2.7): pyspark.mllib.random (15s) Starting test(python2.7): pyspark.mllib.regression Finished test(python2.7): pyspark.mllib.linalg.distributed (47s) Starting test(python2.7): pyspark.mllib.stat.KernelDensity Finished test(python2.7): pyspark.mllib.stat.KernelDensity (0s) Starting test(python2.7): pyspark.mllib.stat._statistics Finished test(python2.7): pyspark.mllib.recommendation (40s) Starting test(python2.7): pyspark.mllib.tree Finished test(python2.7): pyspark.mllib.regression (38s) Starting test(python2.7): pyspark.mllib.util Finished test(python2.7): pyspark.mllib.stat._statistics (19s) Starting test(python3.6): pyspark.ml.classification Finished test(python2.7): pyspark.mllib.tree (26s) Starting test(python3.6): pyspark.ml.clustering Finished test(python2.7): pyspark.mllib.util (27s) Starting test(python3.6): pyspark.ml.evaluation Finished test(python3.6): pyspark.ml.evaluation (30s) Starting test(python3.6): pyspark.ml.feature Finished test(python2.7): pyspark.ml.tests (234s) Starting test(python3.6): pyspark.ml.fpm Finished test(python3.6): pyspark.ml.fpm (1s) Starting test(python3.6): pyspark.ml.image Finished test(python3.6): pyspark.ml.clustering (55s) Starting 
test(python3.6): pyspark.ml.linalg.__init__ Finished test(python3.6): pyspark.ml.linalg.__init__ (0s) Starting test(python3.6): pyspark.ml.recommendation Finished test(python3.6): pyspark.ml.classification (71s) Starting test(python3.6): pyspark.ml.regression Finished test(python3.6): pyspark.ml.image (18s) Starting test(python3.6): pyspark.ml.stat Finished test(python3.6): pyspark.ml.stat (37s) Starting test(python3.6): pyspark.ml.tests Finished test(python3.6): pyspark.ml.regression (59s) Starting test(python3.6): pyspark.ml.tuning Finished test(python3.6): pyspark.ml.feature (93s) Starting test(python3.6): pyspark.mllib.classification Finished test(python3.6): pyspark.ml.recommendation (83s) Starting test(python3.6): pyspark.mllib.clustering Finished test(python3.6): pyspark.ml.tuning (29s) Starting test(python3.6): pyspark.mllib.evaluation Finished test(python3.6): pyspark.mllib.evaluation (26s) Starting test(python3.6): pyspark.mllib.feature Finished test(python3.6): pyspark.mllib.classification (43s) Starting test(python3.6): pyspark.mllib.fpm Finished test(python3.6): pyspark.mllib.clustering (81s) Starting test(python3.6): pyspark.mllib.linalg.__init__ Finished test(python3.6): pyspark.mllib.linalg.__init__ (2s) Starting test(python3.6): pyspark.mllib.linalg.distributed Finished test(python3.6): pyspark.mllib.fpm (48s) Starting test(python3.6): pyspark.mllib.random Finished test(python3.6): pyspark.mllib.feature (54s) Starting test(python3.6): pyspark.mllib.recommendation Finished test(python3.6): pyspark.mllib.random (18s) Starting test(python3.6): pyspark.mllib.regression Finished test(python3.6): pyspark.mllib.linalg.distributed (55s) Starting test(python3.6): pyspark.mllib.stat.KernelDensity Finished test(python3.6): pyspark.mllib.stat.KernelDensity (1s) Starting test(python3.6): pyspark.mllib.stat._statistics Finished test(python3.6): pyspark.mllib.recommendation (51s) Starting test(python3.6): pyspark.mllib.tree Finished test(python3.6): pyspark.mllib.regression (45s) Starting test(python3.6): pyspark.mllib.util Finished test(python3.6): pyspark.mllib.stat._statistics (21s) Finished test(python3.6): pyspark.mllib.tree (27s) Finished test(python3.6): pyspark.mllib.util (27s) Finished test(python3.6): pyspark.ml.tests (264s) ``` Author: hyukjinkwon Closes #21715 from HyukjinKwon/SPARK-24740. --- python/pyspark/ml/clustering.py | 6 ++++++ python/pyspark/ml/linalg/__init__.py | 5 +++++ python/pyspark/ml/stat.py | 6 ++++++ python/pyspark/mllib/clustering.py | 6 ++++++ python/pyspark/mllib/evaluation.py | 6 ++++++ python/pyspark/mllib/linalg/__init__.py | 6 ++++++ python/pyspark/mllib/linalg/distributed.py | 6 ++++++ python/pyspark/mllib/stat/_statistics.py | 6 ++++++ 8 files changed, 47 insertions(+) diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 6d77baf7349e4..2f0660040dc7c 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -1345,8 +1345,14 @@ def assignClusters(self, dataset): if __name__ == "__main__": import doctest + import numpy import pyspark.ml.clustering from pyspark.sql import SparkSession + try: + # Numpy 1.14+ changed it's string format. 
+ numpy.set_printoptions(legacy='1.13') + except TypeError: + pass globs = pyspark.ml.clustering.__dict__.copy() # The small batch size here ensures that we see multiple batches, # even in these small test examples: diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py index 6a611a2b5b59d..2548fd0f50b33 100644 --- a/python/pyspark/ml/linalg/__init__.py +++ b/python/pyspark/ml/linalg/__init__.py @@ -1156,6 +1156,11 @@ def sparse(numRows, numCols, colPtrs, rowIndices, values): def _test(): import doctest + try: + # Numpy 1.14+ changed it's string format. + np.set_printoptions(legacy='1.13') + except TypeError: + pass (failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS) if failure_count: sys.exit(-1) diff --git a/python/pyspark/ml/stat.py b/python/pyspark/ml/stat.py index a06ab31a7a56a..370154fc6d62a 100644 --- a/python/pyspark/ml/stat.py +++ b/python/pyspark/ml/stat.py @@ -388,8 +388,14 @@ def summary(self, featuresCol, weightCol=None): if __name__ == "__main__": import doctest + import numpy import pyspark.ml.stat from pyspark.sql import SparkSession + try: + # Numpy 1.14+ changed it's string format. + numpy.set_printoptions(legacy='1.13') + except TypeError: + pass globs = pyspark.ml.stat.__dict__.copy() # The small batch size here ensures that we see multiple batches, diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index 0cbabab13a896..b09469b9f5c2d 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -1042,7 +1042,13 @@ def train(cls, rdd, k=10, maxIterations=20, docConcentration=-1.0, def _test(): import doctest + import numpy import pyspark.mllib.clustering + try: + # Numpy 1.14+ changed it's string format. + numpy.set_printoptions(legacy='1.13') + except TypeError: + pass globs = pyspark.mllib.clustering.__dict__.copy() globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py index 36cb03369b8c0..6c65da58e4e2b 100644 --- a/python/pyspark/mllib/evaluation.py +++ b/python/pyspark/mllib/evaluation.py @@ -532,8 +532,14 @@ def accuracy(self): def _test(): import doctest + import numpy from pyspark.sql import SparkSession import pyspark.mllib.evaluation + try: + # Numpy 1.14+ changed it's string format. + numpy.set_printoptions(legacy='1.13') + except TypeError: + pass globs = pyspark.mllib.evaluation.__dict__.copy() spark = SparkSession.builder\ .master("local[4]")\ diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 60d96d8d5ceb8..4afd6666400b0 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -1368,6 +1368,12 @@ def R(self): def _test(): import doctest + import numpy + try: + # Numpy 1.14+ changed it's string format. 
+ numpy.set_printoptions(legacy='1.13') + except TypeError: + pass (failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS) if failure_count: sys.exit(-1) diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index bba88542167ad..7e8b15056cabe 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -1364,9 +1364,15 @@ def toCoordinateMatrix(self): def _test(): import doctest + import numpy from pyspark.sql import SparkSession from pyspark.mllib.linalg import Matrices import pyspark.mllib.linalg.distributed + try: + # Numpy 1.14+ changed it's string format. + numpy.set_printoptions(legacy='1.13') + except TypeError: + pass globs = pyspark.mllib.linalg.distributed.__dict__.copy() spark = SparkSession.builder\ .master("local[2]")\ diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py index 3c75b132ecad2..937bb154c2356 100644 --- a/python/pyspark/mllib/stat/_statistics.py +++ b/python/pyspark/mllib/stat/_statistics.py @@ -303,7 +303,13 @@ def kolmogorovSmirnovTest(data, distName="norm", *params): def _test(): import doctest + import numpy from pyspark.sql import SparkSession + try: + # Numpy 1.14+ changed it's string format. + numpy.set_printoptions(legacy='1.13') + except TypeError: + pass globs = globals().copy() spark = SparkSession.builder\ .master("local[4]")\ From 79c66894296840cc4a5bf6c8718ecfd2b08bcca8 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sat, 7 Jul 2018 22:16:48 +0200 Subject: [PATCH 72/79] [SPARK-24757][SQL] Improving the error message for broadcast timeouts ## What changes were proposed in this pull request? In the PR, I propose to provide a tip to user how to resolve the issue of timeout expiration for broadcast joins. In particular, they can increase the timeout via **spark.sql.broadcastTimeout** or disable the broadcast at all by setting **spark.sql.autoBroadcastJoinThreshold** to `-1`. ## How was this patch tested? It tested manually from `spark-shell`: ``` scala> spark.conf.set("spark.sql.broadcastTimeout", 1) scala> val df = spark.range(100).join(spark.range(15).as[Long].map { x => Thread.sleep(5000) x }).where("id = value") scala> df.count() ``` ``` org.apache.spark.SparkException: Could not execute broadcast in 1 secs. You can increase the timeout for broadcasts via spark.sql.broadcastTimeout or disable broadcast join by setting spark.sql.autoBroadcastJoinThreshold to -1 at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:150) ``` Author: Maxim Gekk Closes #21727 from MaxGekk/broadcast-timeout-error. 
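For readers who hit this error, the two remedies named in the new message can be applied from an existing session before retrying the join. A minimal sketch, using arbitrary illustrative values rather than recommended settings:

```
// Either give the broadcast more time (the value is in seconds)...
spark.conf.set("spark.sql.broadcastTimeout", 600)
// ...or disable automatic broadcast joins altogether, as the message suggests.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
```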
--- .../execution/exchange/BroadcastExchangeExec.scala | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala index c55f9b8f1a7fc..a80673c705f1a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.exchange +import java.util.concurrent.TimeoutException + import scala.concurrent.{ExecutionContext, Future} import scala.concurrent.duration._ import scala.util.control.NonFatal @@ -140,7 +142,16 @@ case class BroadcastExchangeExec( } override protected[sql] def doExecuteBroadcast[T](): broadcast.Broadcast[T] = { - ThreadUtils.awaitResult(relationFuture, timeout).asInstanceOf[broadcast.Broadcast[T]] + try { + ThreadUtils.awaitResult(relationFuture, timeout).asInstanceOf[broadcast.Broadcast[T]] + } catch { + case ex: TimeoutException => + logError(s"Could not execute broadcast in ${timeout.toSeconds} secs.", ex) + throw new SparkException(s"Could not execute broadcast in ${timeout.toSeconds} secs. " + + s"You can increase the timeout for broadcasts via ${SQLConf.BROADCAST_TIMEOUT.key} or " + + s"disable broadcast join by setting ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key} to -1", + ex) + } } } From e2c7e09f742a7e522efd74fe8e14c2620afdb522 Mon Sep 17 00:00:00 2001 From: jerryshao Date: Mon, 9 Jul 2018 10:21:40 +0800 Subject: [PATCH 73/79] [SPARK-24646][CORE] Minor change to spark.yarn.dist.forceDownloadSchemes to support wildcard '*' ## What changes were proposed in this pull request? In the case of getting tokens via customized `ServiceCredentialProvider`, it is required that `ServiceCredentialProvider` be available in local spark-submit process classpath. In this case, all the configured remote sources should be forced to download to local. For the ease of using this configuration, here propose to add wildcard '*' support to `spark.yarn.dist.forceDownloadSchemes`, also clarify the usage of this configuration. ## How was this patch tested? New UT added. Author: jerryshao Closes #21633 from jerryshao/SPARK-21917-followup. --- .../org/apache/spark/deploy/SparkSubmit.scala | 5 ++-- .../spark/internal/config/package.scala | 5 ++-- .../spark/deploy/SparkSubmitSuite.scala | 29 ++++++++++++------- docs/running-on-yarn.md | 5 ++-- 4 files changed, 28 insertions(+), 16 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 2da778a29779d..e7310ee886103 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -385,7 +385,7 @@ private[spark] class SparkSubmit extends Logging { val forceDownloadSchemes = sparkConf.get(FORCE_DOWNLOAD_SCHEMES) def shouldDownload(scheme: String): Boolean = { - forceDownloadSchemes.contains(scheme) || + forceDownloadSchemes.contains("*") || forceDownloadSchemes.contains(scheme) || Try { FileSystem.getFileSystemClass(scheme, hadoopConf) }.isFailure } @@ -578,7 +578,8 @@ private[spark] class SparkSubmit extends Logging { } // Add the main application jar and any added jars to classpath in case YARN client // requires these jars. 
- // This assumes both primaryResource and user jars are local jars, otherwise it will not be + // This assumes both primaryResource and user jars are local jars, or already downloaded + // to local by configuring "spark.yarn.dist.forceDownloadSchemes", otherwise it will not be // added to the classpath of YARN client. if (isYarnCluster) { if (isUserJar(args.primaryResource)) { diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index bda9795a0b925..ba892bf7f60d6 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -486,10 +486,11 @@ package object config { private[spark] val FORCE_DOWNLOAD_SCHEMES = ConfigBuilder("spark.yarn.dist.forceDownloadSchemes") - .doc("Comma-separated list of schemes for which files will be downloaded to the " + + .doc("Comma-separated list of schemes for which resources will be downloaded to the " + "local disk prior to being added to YARN's distributed cache. For use in cases " + "where the YARN service does not support schemes that are supported by Spark, like http, " + - "https and ftp.") + "https and ftp, or jars required to be in the local YARN client's classpath. Wildcard " + + "'*' is denoted to download resources for all the schemes.") .stringConf .toSequence .createWithDefault(Nil) diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 545c8d0423dc3..f829fecc30840 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -995,20 +995,24 @@ class SparkSubmitSuite } test("download remote resource if it is not supported by yarn service") { - testRemoteResources(enableHttpFs = false, blacklistHttpFs = false) + testRemoteResources(enableHttpFs = false) } test("avoid downloading remote resource if it is supported by yarn service") { - testRemoteResources(enableHttpFs = true, blacklistHttpFs = false) + testRemoteResources(enableHttpFs = true) } test("force download from blacklisted schemes") { - testRemoteResources(enableHttpFs = true, blacklistHttpFs = true) + testRemoteResources(enableHttpFs = true, blacklistSchemes = Seq("http")) + } + + test("force download for all the schemes") { + testRemoteResources(enableHttpFs = true, blacklistSchemes = Seq("*")) } private def testRemoteResources( enableHttpFs: Boolean, - blacklistHttpFs: Boolean): Unit = { + blacklistSchemes: Seq[String] = Nil): Unit = { val hadoopConf = new Configuration() updateConfWithFakeS3Fs(hadoopConf) if (enableHttpFs) { @@ -1025,8 +1029,8 @@ class SparkSubmitSuite val tmpHttpJar = TestUtils.createJarWithFiles(Map("test.resource" -> "USER"), tmpDir) val tmpHttpJarPath = s"http://${new File(tmpHttpJar.toURI).getAbsolutePath}" - val forceDownloadArgs = if (blacklistHttpFs) { - Seq("--conf", "spark.yarn.dist.forceDownloadSchemes=http") + val forceDownloadArgs = if (blacklistSchemes.nonEmpty) { + Seq("--conf", s"spark.yarn.dist.forceDownloadSchemes=${blacklistSchemes.mkString(",")}") } else { Nil } @@ -1044,14 +1048,19 @@ class SparkSubmitSuite val jars = conf.get("spark.yarn.dist.jars").split(",").toSet - // The URI of remote S3 resource should still be remote. 
- assert(jars.contains(tmpS3JarPath)) + def isSchemeBlacklisted(scheme: String) = { + blacklistSchemes.contains("*") || blacklistSchemes.contains(scheme) + } + + if (!isSchemeBlacklisted("s3")) { + assert(jars.contains(tmpS3JarPath)) + } - if (enableHttpFs && !blacklistHttpFs) { + if (enableHttpFs && blacklistSchemes.isEmpty) { // If Http FS is supported by yarn service, the URI of remote http resource should // still be remote. assert(jars.contains(tmpHttpJarPath)) - } else { + } else if (!enableHttpFs || isSchemeBlacklisted("http")) { // If Http FS is not supported by yarn service, or http scheme is configured to be force // downloading, the URI of remote http resource should be changed to a local one. val jarName = new File(tmpHttpJar.toURI).getName diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 575da7205b529..0b265b0cb1b31 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -218,9 +218,10 @@ To use a custom metrics.properties for the application master and executors, upd
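As an aside to this patch, a hedged sketch of how the new wildcard value could be supplied programmatically; the equivalent submit-time form would be `--conf spark.yarn.dist.forceDownloadSchemes=*`. Illustrative only, not part of the change itself:

```
import org.apache.spark.SparkConf

// Force every remote resource to be downloaded to local disk before it is added
// to YARN's distributed cache; "*" matches all schemes, while a list such as
// "http,https" still limits forced download to those schemes only.
val conf = new SparkConf()
  .set("spark.yarn.dist.forceDownloadSchemes", "*")
```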
    From 034913b62b579ae003431231c0272513de8f496c Mon Sep 17 00:00:00 2001 From: Bruce Robbins Date: Mon, 9 Jul 2018 21:21:38 +0900 Subject: [PATCH 74/79] [SPARK-23936][SQL] Implement map_concat ## What changes were proposed in this pull request? Implement map_concat high order function. This implementation does not pick a winner when the specified maps have overlapping keys. Therefore, this implementation preserves existing duplicate keys in the maps and potentially introduces new duplicates (After discussion with ueshin, we settled on option 1 from [here](https://issues.apache.org/jira/browse/SPARK-23936?focusedCommentId=16464245&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-16464245)). ## How was this patch tested? New tests Manual tests Run all sbt SQL tests Run all pyspark sql tests Author: Bruce Robbins Closes #21073 from bersprockets/SPARK-23936. --- python/pyspark/sql/functions.py | 22 ++ .../sql/catalyst/CatalystTypeConverters.scala | 6 + .../catalyst/analysis/FunctionRegistry.scala | 1 + .../sql/catalyst/analysis/TypeCoercion.scala | 8 + .../expressions/collectionOperations.scala | 231 ++++++++++++++++++ .../CollectionExpressionsSuite.scala | 126 ++++++++++ .../org/apache/spark/sql/functions.scala | 8 + .../inputs/typeCoercion/native/mapconcat.sql | 94 +++++++ .../typeCoercion/native/mapconcat.sql.out | 143 +++++++++++ .../spark/sql/DataFrameFunctionsSuite.scala | 78 ++++++ 10 files changed, 717 insertions(+) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/mapconcat.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/mapconcat.sql.out diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 55e7d575b4681..9f61e29f9cd42 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2510,6 +2510,28 @@ def arrays_zip(*cols): return Column(sc._jvm.functions.arrays_zip(_to_seq(sc, cols, _to_java_column))) +@since(2.4) +def map_concat(*cols): + """Returns the union of all the given maps. 
+ + :param cols: list of column names (string) or list of :class:`Column` expressions + + >>> from pyspark.sql.functions import map_concat + >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map(3, 'c', 1, 'd') as map2") + >>> df.select(map_concat("map1", "map2").alias("map3")).show(truncate=False) + +--------------------------------+ + |map3 | + +--------------------------------+ + |[1 -> a, 2 -> b, 3 -> c, 1 -> d]| + +--------------------------------+ + """ + sc = SparkContext._active_spark_context + if len(cols) == 1 and isinstance(cols[0], (list, set)): + cols = cols[0] + jc = sc._jvm.functions.map_concat(_to_seq(sc, cols, _to_java_column)) + return Column(jc) + + # ---------------------------- User Defined Function ---------------------------------- class PandasUDFType(object): diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala index 93df73ab1eaf6..6f5fbdd79e668 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala @@ -431,6 +431,12 @@ object CatalystTypeConverters { map, (key: Any) => convertToCatalyst(key), (value: Any) => convertToCatalyst(value)) + case (keys: Array[_], values: Array[_]) => + // case for mapdata with duplicate keys + new ArrayBasedMapData( + new GenericArrayData(keys.map(convertToCatalyst)), + new GenericArrayData(values.map(convertToCatalyst)) + ) case other => other } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 80a0af672bf74..e7517e8c676e3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -422,6 +422,7 @@ object FunctionRegistry { expression[MapValues]("map_values"), expression[MapEntries]("map_entries"), expression[MapFromEntries]("map_from_entries"), + expression[MapConcat]("map_concat"), expression[Size]("size"), expression[Slice]("slice"), expression[Size]("cardinality"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index b6ca30c7398f2..72908c1f433ee 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -548,6 +548,14 @@ object TypeCoercion { case None => s } + case m @ MapConcat(children) if children.forall(c => MapType.acceptsType(c.dataType)) && + !haveSameType(children) => + val types = children.map(_.dataType) + findWiderCommonType(types) match { + case Some(finalDataType) => MapConcat(children.map(Cast(_, finalDataType))) + case None => m + } + case m @ CreateMap(children) if m.keys.length == m.values.length && (!haveSameType(m.keys) || !haveSameType(m.values)) => val newKeys = if (haveSameType(m.keys)) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index fcac3a58e6a95..879603b66b314 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -503,6 +503,237 @@ case class MapEntries(child: Expression) extends UnaryExpression with ExpectsInp override def prettyName: String = "map_entries" } +/** + * Returns the union of all the given maps. + */ +@ExpressionDescription( + usage = "_FUNC_(map, ...) - Returns the union of all the given maps", + examples = """ + Examples: + > SELECT _FUNC_(map(1, 'a', 2, 'b'), map(2, 'c', 3, 'd')); + [[1 -> "a"], [2 -> "b"], [2 -> "c"], [3 -> "d"]] + """, since = "2.4.0") +case class MapConcat(children: Seq[Expression]) extends Expression { + + override def checkInputDataTypes(): TypeCheckResult = { + var funcName = s"function $prettyName" + if (children.exists(!_.dataType.isInstanceOf[MapType])) { + TypeCheckResult.TypeCheckFailure( + s"input to $funcName should all be of type map, but it's " + + children.map(_.dataType.simpleString).mkString("[", ", ", "]")) + } else { + TypeUtils.checkForSameTypeInputExpr(children.map(_.dataType), funcName) + } + } + + override def dataType: MapType = { + val dt = children.map(_.dataType.asInstanceOf[MapType]).headOption + .getOrElse(MapType(StringType, StringType)) + val valueContainsNull = children.map(_.dataType.asInstanceOf[MapType]) + .exists(_.valueContainsNull) + if (dt.valueContainsNull != valueContainsNull) { + dt.copy(valueContainsNull = valueContainsNull) + } else { + dt + } + } + + override def nullable: Boolean = children.exists(_.nullable) + + override def eval(input: InternalRow): Any = { + val maps = children.map(_.eval(input)) + if (maps.contains(null)) { + return null + } + val keyArrayDatas = maps.map(_.asInstanceOf[MapData].keyArray()) + val valueArrayDatas = maps.map(_.asInstanceOf[MapData].valueArray()) + + val numElements = keyArrayDatas.foldLeft(0L)((sum, ad) => sum + ad.numElements()) + if (numElements > ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH) { + throw new RuntimeException(s"Unsuccessful attempt to concat maps with $numElements " + + s"elements due to exceeding the map size limit " + + s"${ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH}.") + } + val finalKeyArray = new Array[AnyRef](numElements.toInt) + val finalValueArray = new Array[AnyRef](numElements.toInt) + var position = 0 + for (i <- keyArrayDatas.indices) { + val keyArray = keyArrayDatas(i).toObjectArray(dataType.keyType) + val valueArray = valueArrayDatas(i).toObjectArray(dataType.valueType) + Array.copy(keyArray, 0, finalKeyArray, position, keyArray.length) + Array.copy(valueArray, 0, finalValueArray, position, valueArray.length) + position += keyArray.length + } + + new ArrayBasedMapData(new GenericArrayData(finalKeyArray), + new GenericArrayData(finalValueArray)) + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val mapCodes = children.map(_.genCode(ctx)) + val keyType = dataType.keyType + val valueType = dataType.valueType + val argsName = ctx.freshName("args") + val hasNullName = ctx.freshName("hasNull") + val mapDataClass = classOf[MapData].getName + val arrayBasedMapDataClass = classOf[ArrayBasedMapData].getName + val arrayDataClass = classOf[ArrayData].getName + + val init = + s""" + |$mapDataClass[] $argsName = new $mapDataClass[${mapCodes.size}]; + |boolean ${ev.isNull}, $hasNullName = false; + |$mapDataClass ${ev.value} = null; + """.stripMargin + + val assignments = mapCodes.zipWithIndex.map { case (m, i) => + s""" + |if 
(!$hasNullName) { + | ${m.code} + | $argsName[$i] = ${m.value}; + | if (${m.isNull}) { + | $hasNullName = true; + | } + |} + """.stripMargin + } + + val codes = ctx.splitExpressionsWithCurrentInputs( + expressions = assignments, + funcName = "getMapConcatInputs", + extraArguments = (s"$mapDataClass[]", argsName) :: ("boolean", hasNullName) :: Nil, + returnType = "boolean", + makeSplitFunction = body => + s""" + |$body + |return $hasNullName; + """.stripMargin, + foldFunctions = _.map(funcCall => s"$hasNullName = $funcCall;").mkString("\n") + ) + + val idxName = ctx.freshName("idx") + val numElementsName = ctx.freshName("numElems") + val finKeysName = ctx.freshName("finalKeys") + val finValsName = ctx.freshName("finalValues") + + val keyConcatenator = if (CodeGenerator.isPrimitiveType(keyType)) { + genCodeForPrimitiveArrays(ctx, keyType, false) + } else { + genCodeForNonPrimitiveArrays(ctx, keyType) + } + + val valueConcatenator = if (CodeGenerator.isPrimitiveType(valueType)) { + genCodeForPrimitiveArrays(ctx, valueType, dataType.valueContainsNull) + } else { + genCodeForNonPrimitiveArrays(ctx, valueType) + } + + val keyArgsName = ctx.freshName("keyArgs") + val valArgsName = ctx.freshName("valArgs") + + val mapMerge = + s""" + |${ev.isNull} = $hasNullName; + |if (!${ev.isNull}) { + | $arrayDataClass[] $keyArgsName = new $arrayDataClass[${mapCodes.size}]; + | $arrayDataClass[] $valArgsName = new $arrayDataClass[${mapCodes.size}]; + | long $numElementsName = 0; + | for (int $idxName = 0; $idxName < $argsName.length; $idxName++) { + | $keyArgsName[$idxName] = $argsName[$idxName].keyArray(); + | $valArgsName[$idxName] = $argsName[$idxName].valueArray(); + | $numElementsName += $argsName[$idxName].numElements(); + | } + | if ($numElementsName > ${ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH}) { + | throw new RuntimeException("Unsuccessful attempt to concat maps with " + + | $numElementsName + " elements due to exceeding the map size limit " + + | "${ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH}."); + | } + | $arrayDataClass $finKeysName = $keyConcatenator.concat($keyArgsName, + | (int) $numElementsName); + | $arrayDataClass $finValsName = $valueConcatenator.concat($valArgsName, + | (int) $numElementsName); + | ${ev.value} = new $arrayBasedMapDataClass($finKeysName, $finValsName); + |} + """.stripMargin + + ev.copy( + code = code""" + |$init + |$codes + |$mapMerge + """.stripMargin) + } + + private def genCodeForPrimitiveArrays( + ctx: CodegenContext, + elementType: DataType, + checkForNull: Boolean): String = { + val counter = ctx.freshName("counter") + val arrayData = ctx.freshName("arrayData") + val argsName = ctx.freshName("args") + val numElemName = ctx.freshName("numElements") + val primitiveValueTypeName = CodeGenerator.primitiveTypeName(elementType) + + val setterCode1 = + s""" + |$arrayData.set$primitiveValueTypeName( + | $counter, + | ${CodeGenerator.getValue(s"$argsName[y]", elementType, "z")} + |);""".stripMargin + + val setterCode = if (checkForNull) { + s""" + |if ($argsName[y].isNullAt(z)) { + | $arrayData.setNullAt($counter); + |} else { + | $setterCode1 + |}""".stripMargin + } else { + setterCode1 + } + + s""" + |new Object() { + | public ArrayData concat(${classOf[ArrayData].getName}[] $argsName, int $numElemName) { + | ${ctx.createUnsafeArray(arrayData, numElemName, elementType, s" $prettyName failed.")} + | int $counter = 0; + | for (int y = 0; y < ${children.length}; y++) { + | for (int z = 0; z < $argsName[y].numElements(); z++) { + | $setterCode + | $counter++; + | } + | } + | 
return $arrayData; + | } + |}""".stripMargin.stripPrefix("\n") + } + + private def genCodeForNonPrimitiveArrays(ctx: CodegenContext, elementType: DataType): String = { + val genericArrayClass = classOf[GenericArrayData].getName + val arrayData = ctx.freshName("arrayObjects") + val counter = ctx.freshName("counter") + val argsName = ctx.freshName("args") + val numElemName = ctx.freshName("numElements") + + s""" + |new Object() { + | public ArrayData concat(${classOf[ArrayData].getName}[] $argsName, int $numElemName) {; + | Object[] $arrayData = new Object[$numElemName]; + | int $counter = 0; + | for (int y = 0; y < ${children.length}; y++) { + | for (int z = 0; z < $argsName[y].numElements(); z++) { + | $arrayData[$counter] = ${CodeGenerator.getValue(s"$argsName[y]", elementType, "z")}; + | $counter++; + | } + | } + | return new $genericArrayClass($arrayData); + | } + |}""".stripMargin.stripPrefix("\n") + } + + override def prettyName: String = "map_concat" +} + /** * Returns a map created from the given array of entries. */ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala index 496ee1d496a36..173c98af323b1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala @@ -98,6 +98,132 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(MapEntries(ms2), null) } + test("Map Concat") { + val m0 = Literal.create(Map("a" -> "1", "b" -> "2"), MapType(StringType, StringType, + valueContainsNull = false)) + val m1 = Literal.create(Map("c" -> "3", "a" -> "4"), MapType(StringType, StringType, + valueContainsNull = false)) + val m2 = Literal.create(Map("d" -> "4", "e" -> "5"), MapType(StringType, StringType)) + val m3 = Literal.create(Map("a" -> "1", "b" -> "2"), MapType(StringType, StringType)) + val m4 = Literal.create(Map("a" -> null, "c" -> "3"), MapType(StringType, StringType)) + val m5 = Literal.create(Map("a" -> 1, "b" -> 2), MapType(StringType, IntegerType)) + val m6 = Literal.create(Map("a" -> null, "c" -> 3), MapType(StringType, IntegerType)) + val m7 = Literal.create(Map(List(1, 2) -> 1, List(3, 4) -> 2), + MapType(ArrayType(IntegerType), IntegerType)) + val m8 = Literal.create(Map(List(5, 6) -> 3, List(1, 2) -> 4), + MapType(ArrayType(IntegerType), IntegerType)) + val m9 = Literal.create(Map(Map(1 -> 2, 3 -> 4) -> 1, Map(5 -> 6, 7 -> 8) -> 2), + MapType(MapType(IntegerType, IntegerType), IntegerType)) + val m10 = Literal.create(Map(Map(9 -> 10, 11 -> 12) -> 3, Map(1 -> 2, 3 -> 4) -> 4), + MapType(MapType(IntegerType, IntegerType), IntegerType)) + val m11 = Literal.create(Map(1 -> "1", 2 -> "2"), MapType(IntegerType, StringType, + valueContainsNull = false)) + val m12 = Literal.create(Map(3 -> "3", 4 -> "4"), MapType(IntegerType, StringType, + valueContainsNull = false)) + val mNull = Literal.create(null, MapType(StringType, StringType)) + + // overlapping maps + checkEvaluation(MapConcat(Seq(m0, m1)), + ( + Array("a", "b", "c", "a"), // keys + Array("1", "2", "3", "4") // values + ) + ) + + // maps with no overlap + checkEvaluation(MapConcat(Seq(m0, m2)), + Map("a" -> "1", "b" -> "2", "d" -> "4", "e" -> "5")) + + // 3 maps + checkEvaluation(MapConcat(Seq(m0, m1, m2)), + ( + Array("a", "b", "c", "a", "d", "e"), // 
keys + Array("1", "2", "3", "4", "4", "5") // values + ) + ) + + // null reference values + checkEvaluation(MapConcat(Seq(m3, m4)), + ( + Array("a", "b", "a", "c"), // keys + Array("1", "2", null, "3") // values + ) + ) + + // null primitive values + checkEvaluation(MapConcat(Seq(m5, m6)), + ( + Array("a", "b", "a", "c"), // keys + Array(1, 2, null, 3) // values + ) + ) + + // keys that are primitive + checkEvaluation(MapConcat(Seq(m11, m12)), + ( + Array(1, 2, 3, 4), // keys + Array("1", "2", "3", "4") // values + ) + ) + + // keys that are arrays, with overlap + checkEvaluation(MapConcat(Seq(m7, m8)), + ( + Array(List(1, 2), List(3, 4), List(5, 6), List(1, 2)), // keys + Array(1, 2, 3, 4) // values + ) + ) + + // keys that are maps, with overlap + checkEvaluation(MapConcat(Seq(m9, m10)), + ( + Array(Map(1 -> 2, 3 -> 4), Map(5 -> 6, 7 -> 8), Map(9 -> 10, 11 -> 12), + Map(1 -> 2, 3 -> 4)), // keys + Array(1, 2, 3, 4) // values + ) + ) + + // null map + checkEvaluation(MapConcat(Seq(m0, mNull)), null) + checkEvaluation(MapConcat(Seq(mNull, m0)), null) + checkEvaluation(MapConcat(Seq(mNull, mNull)), null) + checkEvaluation(MapConcat(Seq(mNull)), null) + + // single map + checkEvaluation(MapConcat(Seq(m0)), Map("a" -> "1", "b" -> "2")) + + // no map + checkEvaluation(MapConcat(Seq.empty), Map.empty) + + // force split expressions for input in generated code + val expectedKeys = Array.fill(65)(Seq("a", "b")).flatten ++ Array("d", "e") + val expectedValues = Array.fill(65)(Seq("1", "2")).flatten ++ Array("4", "5") + checkEvaluation(MapConcat( + Seq( + m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, + m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, + m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m0, m2 + )), + (expectedKeys, expectedValues)) + + // argument checking + assert(MapConcat(Seq(m0, m1)).checkInputDataTypes().isSuccess) + assert(MapConcat(Seq(m5, m6)).checkInputDataTypes().isSuccess) + assert(MapConcat(Seq(m0, m5)).checkInputDataTypes().isFailure) + assert(MapConcat(Seq(m0, Literal(12))).checkInputDataTypes().isFailure) + assert(MapConcat(Seq(m0, m1)).dataType.keyType == StringType) + assert(MapConcat(Seq(m0, m1)).dataType.valueType == StringType) + assert(!MapConcat(Seq(m0, m1)).dataType.valueContainsNull) + assert(MapConcat(Seq(m5, m6)).dataType.keyType == StringType) + assert(MapConcat(Seq(m5, m6)).dataType.valueType == IntegerType) + assert(MapConcat(Seq.empty).dataType.keyType == StringType) + assert(MapConcat(Seq.empty).dataType.valueType == StringType) + assert(MapConcat(Seq(m5, m6)).dataType.valueContainsNull) + assert(MapConcat(Seq(m6, m5)).dataType.valueContainsNull) + assert(!MapConcat(Seq(m1, m2)).nullable) + assert(MapConcat(Seq(m1, mNull)).nullable) + } + test("MapFromEntries") { def arrayType(keyType: DataType, valueType: DataType) : DataType = { ArrayType( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index f2627e69939cd..89dbba10a6bf1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -3627,6 +3627,14 @@ object functions { @scala.annotation.varargs def arrays_zip(e: Column*): Column = withExpr { ArraysZip(e.map(_.expr)) } + /** + * Returns the union of all the given maps. 
+ * @group collection_funcs + * @since 2.4.0 + */ + @scala.annotation.varargs + def map_concat(cols: Column*): Column = withExpr { MapConcat(cols.map(_.expr)) } + ////////////////////////////////////////////////////////////////////////////////////////////// // Mask functions ////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/mapconcat.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/mapconcat.sql new file mode 100644 index 0000000000000..fc26397b881b5 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/mapconcat.sql @@ -0,0 +1,94 @@ +CREATE TEMPORARY VIEW various_maps AS SELECT * FROM VALUES ( + map(true, false), map(false, true), + map(1Y, 2Y), map(3Y, 4Y), + map(1S, 2S), map(3S, 4S), + map(4, 6), map(7, 8), + map(6L, 7L), map(8L, 9L), + map(9223372036854775809, 9223372036854775808), map(9223372036854775808, 9223372036854775809), + map(1.0D, 2.0D), map(3.0D, 4.0D), + map(float(1.0D), float(2.0D)), map(float(3.0D), float(4.0D)), + map(date '2016-03-14', date '2016-03-13'), map(date '2016-03-12', date '2016-03-11'), + map(timestamp '2016-11-15 20:54:00.000', timestamp '2016-11-12 20:54:00.000'), + map(timestamp '2016-11-11 20:54:00.000', timestamp '2016-11-09 20:54:00.000'), + map('a', 'b'), map('c', 'd'), + map(array('a', 'b'), array('c', 'd')), map(array('e'), array('f')), + map(struct('a', 1), struct('b', 2)), map(struct('c', 3), struct('d', 4)), + map(map('a', 1), map('b', 2)), map(map('c', 3), map('d', 4)), + map('a', 1), map('c', 2), + map(1, 'a'), map(2, 'c') +) AS various_maps ( + boolean_map1, boolean_map2, + tinyint_map1, tinyint_map2, + smallint_map1, smallint_map2, + int_map1, int_map2, + bigint_map1, bigint_map2, + decimal_map1, decimal_map2, + double_map1, double_map2, + float_map1, float_map2, + date_map1, date_map2, + timestamp_map1, + timestamp_map2, + string_map1, string_map2, + array_map1, array_map2, + struct_map1, struct_map2, + map_map1, map_map2, + string_int_map1, string_int_map2, + int_string_map1, int_string_map2 +); + +-- Concatenate maps of the same type +SELECT + map_concat(boolean_map1, boolean_map2) boolean_map, + map_concat(tinyint_map1, tinyint_map2) tinyint_map, + map_concat(smallint_map1, smallint_map2) smallint_map, + map_concat(int_map1, int_map2) int_map, + map_concat(bigint_map1, bigint_map2) bigint_map, + map_concat(decimal_map1, decimal_map2) decimal_map, + map_concat(float_map1, float_map2) float_map, + map_concat(double_map1, double_map2) double_map, + map_concat(date_map1, date_map2) date_map, + map_concat(timestamp_map1, timestamp_map2) timestamp_map, + map_concat(string_map1, string_map2) string_map, + map_concat(array_map1, array_map2) array_map, + map_concat(struct_map1, struct_map2) struct_map, + map_concat(map_map1, map_map2) map_map, + map_concat(string_int_map1, string_int_map2) string_int_map, + map_concat(int_string_map1, int_string_map2) int_string_map +FROM various_maps; + +-- Concatenate maps of different types +SELECT + map_concat(tinyint_map1, smallint_map2) ts_map, + map_concat(smallint_map1, int_map2) si_map, + map_concat(int_map1, bigint_map2) ib_map, + map_concat(decimal_map1, float_map2) df_map, + map_concat(string_map1, date_map2) std_map, + map_concat(timestamp_map1, string_map2) tst_map, + map_concat(string_map1, int_map2) sti_map, + map_concat(int_string_map1, tinyint_map2) istt_map +FROM various_maps; + +-- Concatenate map of incompatible types 1 
+SELECT + map_concat(tinyint_map1, map_map2) tm_map +FROM various_maps; + +-- Concatenate map of incompatible types 2 +SELECT + map_concat(boolean_map1, int_map2) bi_map +FROM various_maps; + +-- Concatenate map of incompatible types 3 +SELECT + map_concat(int_map1, struct_map2) is_map +FROM various_maps; + +-- Concatenate map of incompatible types 4 +SELECT + map_concat(map_map1, array_map2) ma_map +FROM various_maps; + +-- Concatenate map of incompatible types 5 +SELECT + map_concat(map_map1, struct_map2) ms_map +FROM various_maps; diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/mapconcat.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/mapconcat.sql.out new file mode 100644 index 0000000000000..d352b7284ae87 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/mapconcat.sql.out @@ -0,0 +1,143 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 8 + + +-- !query 0 +CREATE TEMPORARY VIEW various_maps AS SELECT * FROM VALUES ( + map(true, false), map(false, true), + map(1Y, 2Y), map(3Y, 4Y), + map(1S, 2S), map(3S, 4S), + map(4, 6), map(7, 8), + map(6L, 7L), map(8L, 9L), + map(9223372036854775809, 9223372036854775808), map(9223372036854775808, 9223372036854775809), + map(1.0D, 2.0D), map(3.0D, 4.0D), + map(float(1.0D), float(2.0D)), map(float(3.0D), float(4.0D)), + map(date '2016-03-14', date '2016-03-13'), map(date '2016-03-12', date '2016-03-11'), + map(timestamp '2016-11-15 20:54:00.000', timestamp '2016-11-12 20:54:00.000'), + map(timestamp '2016-11-11 20:54:00.000', timestamp '2016-11-09 20:54:00.000'), + map('a', 'b'), map('c', 'd'), + map(array('a', 'b'), array('c', 'd')), map(array('e'), array('f')), + map(struct('a', 1), struct('b', 2)), map(struct('c', 3), struct('d', 4)), + map(map('a', 1), map('b', 2)), map(map('c', 3), map('d', 4)), + map('a', 1), map('c', 2), + map(1, 'a'), map(2, 'c') +) AS various_maps ( + boolean_map1, boolean_map2, + tinyint_map1, tinyint_map2, + smallint_map1, smallint_map2, + int_map1, int_map2, + bigint_map1, bigint_map2, + decimal_map1, decimal_map2, + double_map1, double_map2, + float_map1, float_map2, + date_map1, date_map2, + timestamp_map1, + timestamp_map2, + string_map1, string_map2, + array_map1, array_map2, + struct_map1, struct_map2, + map_map1, map_map2, + string_int_map1, string_int_map2, + int_string_map1, int_string_map2 +) +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +SELECT + map_concat(boolean_map1, boolean_map2) boolean_map, + map_concat(tinyint_map1, tinyint_map2) tinyint_map, + map_concat(smallint_map1, smallint_map2) smallint_map, + map_concat(int_map1, int_map2) int_map, + map_concat(bigint_map1, bigint_map2) bigint_map, + map_concat(decimal_map1, decimal_map2) decimal_map, + map_concat(float_map1, float_map2) float_map, + map_concat(double_map1, double_map2) double_map, + map_concat(date_map1, date_map2) date_map, + map_concat(timestamp_map1, timestamp_map2) timestamp_map, + map_concat(string_map1, string_map2) string_map, + map_concat(array_map1, array_map2) array_map, + map_concat(struct_map1, struct_map2) struct_map, + map_concat(map_map1, map_map2) map_map, + map_concat(string_int_map1, string_int_map2) string_int_map, + map_concat(int_string_map1, int_string_map2) int_string_map +FROM various_maps +-- !query 1 schema 
+struct,tinyint_map:map,smallint_map:map,int_map:map,bigint_map:map,decimal_map:map,float_map:map,double_map:map,date_map:map,timestamp_map:map,string_map:map,array_map:map,array>,struct_map:map,struct>,map_map:map,map>,string_int_map:map,int_string_map:map> +-- !query 1 output +{false:true,true:false} {1:2,3:4} {1:2,3:4} {4:6,7:8} {6:7,8:9} {9223372036854775808:9223372036854775809,9223372036854775809:9223372036854775808} {1.0:2.0,3.0:4.0} {1.0:2.0,3.0:4.0} {2016-03-12:2016-03-11,2016-03-14:2016-03-13} {2016-11-11 20:54:00.0:2016-11-09 20:54:00.0,2016-11-15 20:54:00.0:2016-11-12 20:54:00.0} {"a":"b","c":"d"} {["a","b"]:["c","d"],["e"]:["f"]} {{"col1":"a","col2":1}:{"col1":"b","col2":2},{"col1":"c","col2":3}:{"col1":"d","col2":4}} {{"a":1}:{"b":2},{"c":3}:{"d":4}} {"a":1,"c":2} {1:"a",2:"c"} + + +-- !query 2 +SELECT + map_concat(tinyint_map1, smallint_map2) ts_map, + map_concat(smallint_map1, int_map2) si_map, + map_concat(int_map1, bigint_map2) ib_map, + map_concat(decimal_map1, float_map2) df_map, + map_concat(string_map1, date_map2) std_map, + map_concat(timestamp_map1, string_map2) tst_map, + map_concat(string_map1, int_map2) sti_map, + map_concat(int_string_map1, tinyint_map2) istt_map +FROM various_maps +-- !query 2 schema +struct,si_map:map,ib_map:map,df_map:map,std_map:map,tst_map:map,sti_map:map,istt_map:map> +-- !query 2 output +{1:2,3:4} {1:2,7:8} {4:6,8:9} {3.0:4.0,9.223372036854776E18:9.223372036854776E18} {"2016-03-12":"2016-03-11","a":"b"} {"2016-11-15 20:54:00":"2016-11-12 20:54:00","c":"d"} {"7":"8","a":"b"} {1:"a",3:"4"} + + +-- !query 3 +SELECT + map_concat(tinyint_map1, map_map2) tm_map +FROM various_maps +-- !query 3 schema +struct<> +-- !query 3 output +org.apache.spark.sql.AnalysisException +cannot resolve 'map_concat(various_maps.`tinyint_map1`, various_maps.`map_map2`)' due to data type mismatch: input to function map_concat should all be the same type, but it's [map, map,map>]; line 2 pos 4 + + +-- !query 4 +SELECT + map_concat(boolean_map1, int_map2) bi_map +FROM various_maps +-- !query 4 schema +struct<> +-- !query 4 output +org.apache.spark.sql.AnalysisException +cannot resolve 'map_concat(various_maps.`boolean_map1`, various_maps.`int_map2`)' due to data type mismatch: input to function map_concat should all be the same type, but it's [map, map]; line 2 pos 4 + + +-- !query 5 +SELECT + map_concat(int_map1, struct_map2) is_map +FROM various_maps +-- !query 5 schema +struct<> +-- !query 5 output +org.apache.spark.sql.AnalysisException +cannot resolve 'map_concat(various_maps.`int_map1`, various_maps.`struct_map2`)' due to data type mismatch: input to function map_concat should all be the same type, but it's [map, map,struct>]; line 2 pos 4 + + +-- !query 6 +SELECT + map_concat(map_map1, array_map2) ma_map +FROM various_maps +-- !query 6 schema +struct<> +-- !query 6 output +org.apache.spark.sql.AnalysisException +cannot resolve 'map_concat(various_maps.`map_map1`, various_maps.`array_map2`)' due to data type mismatch: input to function map_concat should all be the same type, but it's [map,map>, map,array>]; line 2 pos 4 + + +-- !query 7 +SELECT + map_concat(map_map1, struct_map2) ms_map +FROM various_maps +-- !query 7 schema +struct<> +-- !query 7 output +org.apache.spark.sql.AnalysisException +cannot resolve 'map_concat(various_maps.`map_map1`, various_maps.`struct_map2`)' due to data type mismatch: input to function map_concat should all be the same type, but it's [map,map>, map,struct>]; line 2 pos 4 diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 4c28e2f1cd909..d60ed7a5ef0d9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -657,6 +657,84 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { checkAnswer(sdf.filter(dummyFilter('m)).select(map_entries('m)), sExpected) } + test("map_concat function") { + val df1 = Seq( + (Map[Int, Int](1 -> 100, 2 -> 200), Map[Int, Int](3 -> 300, 4 -> 400)), + (Map[Int, Int](1 -> 100, 2 -> 200), Map[Int, Int](3 -> 300, 1 -> 400)), + (null, Map[Int, Int](3 -> 300, 4 -> 400)) + ).toDF("map1", "map2") + + val expected1a = Seq( + Row(Map(1 -> 100, 2 -> 200, 3 -> 300, 4 -> 400)), + Row(Map(1 -> 400, 2 -> 200, 3 -> 300)), + Row(null) + ) + + checkAnswer(df1.selectExpr("map_concat(map1, map2)"), expected1a) + checkAnswer(df1.select(map_concat('map1, 'map2)), expected1a) + + val expected1b = Seq( + Row(Map(1 -> 100, 2 -> 200)), + Row(Map(1 -> 100, 2 -> 200)), + Row(null) + ) + + checkAnswer(df1.selectExpr("map_concat(map1)"), expected1b) + checkAnswer(df1.select(map_concat('map1)), expected1b) + + val df2 = Seq( + ( + Map[Array[Int], Int](Array(1) -> 100, Array(2) -> 200), + Map[String, Int]("3" -> 300, "4" -> 400) + ) + ).toDF("map1", "map2") + + val expected2 = Seq(Row(Map())) + + checkAnswer(df2.selectExpr("map_concat()"), expected2) + checkAnswer(df2.select(map_concat()), expected2) + + val df3 = { + val schema = StructType( + StructField("map1", MapType(StringType, IntegerType, true), false) :: + StructField("map2", MapType(StringType, IntegerType, false), false) :: Nil + ) + val data = Seq( + Row(Map[String, Any]("a" -> 1, "b" -> null), Map[String, Any]("c" -> 3, "d" -> 4)), + Row(Map[String, Any]("a" -> 1, "b" -> 2), Map[String, Any]("c" -> 3, "d" -> 4)) + ) + spark.createDataFrame(spark.sparkContext.parallelize(data), schema) + } + + val expected3 = Seq( + Row(Map[String, Any]("a" -> 1, "b" -> null, "c" -> 3, "d" -> 4)), + Row(Map[String, Any]("a" -> 1, "b" -> 2, "c" -> 3, "d" -> 4)) + ) + + checkAnswer(df3.selectExpr("map_concat(map1, map2)"), expected3) + checkAnswer(df3.select(map_concat('map1, 'map2)), expected3) + + val expectedMessage1 = "input to function map_concat should all be the same type" + + assert(intercept[AnalysisException] { + df2.selectExpr("map_concat(map1, map2)").collect() + }.getMessage().contains(expectedMessage1)) + + assert(intercept[AnalysisException] { + df2.select(map_concat('map1, 'map2)).collect() + }.getMessage().contains(expectedMessage1)) + + val expectedMessage2 = "input to function map_concat should all be of type map" + + assert(intercept[AnalysisException] { + df2.selectExpr("map_concat(map1, 12)").collect() + }.getMessage().contains(expectedMessage2)) + + assert(intercept[AnalysisException] { + df2.select(map_concat('map1, lit(12))).collect() + }.getMessage().contains(expectedMessage2)) + } + test("map_from_entries function") { def dummyFilter(c: Column): Column = c.isNull || c.isNotNull val oneRowDF = Seq(3215).toDF("i") From 1bd3d61f4191767a94b71b42f4d00706b703e84f Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Mon, 9 Jul 2018 22:59:05 +0800 Subject: [PATCH 75/79] [SPARK-24268][SQL] Use datatype.simpleString in error messages ## What changes were proposed in this pull request? SPARK-22893 tried to unify error messages about dataTypes. 
Unfortunately, still many places were missing the `simpleString` method in other to have the same representation everywhere. The PR unified the messages using alway the simpleString representation of the dataTypes in the messages. ## How was this patch tested? existing/modified UTs Author: Marco Gaido Closes #21321 from mgaido91/SPARK-24268. --- .../spark/sql/kafka010/KafkaWriteTask.scala | 6 +++--- .../apache/spark/sql/kafka010/KafkaWriter.scala | 6 +++--- .../sql/kafka010/KafkaContinuousSinkSuite.scala | 4 ++-- .../spark/sql/kafka010/KafkaSinkSuite.scala | 4 ++-- .../scala/org/apache/spark/ml/feature/DCT.scala | 3 ++- .../apache/spark/ml/feature/FeatureHasher.scala | 5 +++-- .../org/apache/spark/ml/feature/HashingTF.scala | 2 +- .../apache/spark/ml/feature/Interaction.scala | 3 ++- .../org/apache/spark/ml/feature/NGram.scala | 2 +- .../apache/spark/ml/feature/OneHotEncoder.scala | 3 ++- .../org/apache/spark/ml/feature/RFormula.scala | 2 +- .../spark/ml/feature/StopWordsRemover.scala | 4 ++-- .../org/apache/spark/ml/feature/Tokenizer.scala | 3 ++- .../spark/ml/feature/VectorAssembler.scala | 2 +- .../scala/org/apache/spark/ml/fpm/FPGrowth.scala | 2 +- .../org/apache/spark/ml/util/SchemaUtils.scala | 11 +++++++---- .../BinaryClassificationEvaluatorSuite.scala | 4 ++-- .../apache/spark/ml/feature/RFormulaSuite.scala | 2 +- .../spark/ml/feature/VectorAssemblerSuite.scala | 6 +++--- .../spark/ml/recommendation/ALSSuite.scala | 2 +- .../regression/AFTSurvivalRegressionSuite.scala | 2 +- .../apache/spark/ml/util/MLTestingUtils.scala | 6 +++--- .../expressions/complexTypeCreator.scala | 4 ++-- .../catalyst/expressions/jsonExpressions.scala | 2 +- .../catalyst/expressions/stringExpressions.scala | 5 +++-- .../sql/catalyst/json/JacksonGenerator.scala | 4 ++-- .../spark/sql/catalyst/json/JacksonParser.scala | 6 ++++-- .../sql/catalyst/json/JsonInferSchema.scala | 6 ++++-- .../spark/sql/catalyst/util/TypeUtils.scala | 5 +++-- .../spark/sql/types/AbstractDataType.scala | 9 +++++---- .../org/apache/spark/sql/types/ArrayType.scala | 5 +++-- .../org/apache/spark/sql/types/DecimalType.scala | 3 ++- .../org/apache/spark/sql/types/ObjectType.scala | 3 ++- .../org/apache/spark/sql/types/StructType.scala | 5 +++-- .../catalyst/analysis/AnalysisErrorSuite.scala | 2 +- .../analysis/ExpressionTypeCheckingSuite.scala | 16 ++++++++-------- .../catalyst/parser/ExpressionParserSuite.scala | 2 +- .../apache/spark/sql/types/DataTypeSuite.scala | 2 +- .../parquet/VectorizedColumnReader.java | 2 +- .../spark/sql/RelationalGroupedDataset.scala | 2 +- .../spark/sql/execution/arrow/ArrowUtils.scala | 3 ++- .../execution/datasources/orc/OrcFilters.scala | 2 +- .../parquet/ParquetSchemaConverter.scala | 2 +- .../spark/sql/execution/stat/StatFunctions.scala | 2 +- .../sql-tests/results/json-functions.sql.out | 4 ++-- .../resources/sql-tests/results/literals.sql.out | 6 +++--- .../datasources/parquet/ParquetSchemaSuite.scala | 4 ++-- .../sql/hive/execution/HiveTableScanExec.scala | 6 +++--- 48 files changed, 108 insertions(+), 88 deletions(-) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala index d90630a8adc93..59a84706d4f55 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala @@ -110,7 +110,7 @@ private[kafka010] abstract class 
KafkaRowWriter( case t => throw new IllegalStateException(s"${KafkaWriter.TOPIC_ATTRIBUTE_NAME} " + s"attribute unsupported type $t. ${KafkaWriter.TOPIC_ATTRIBUTE_NAME} " + - "must be a StringType") + s"must be a ${StringType.simpleString}") } val keyExpression = inputSchema.find(_.name == KafkaWriter.KEY_ATTRIBUTE_NAME) .getOrElse(Literal(null, BinaryType)) @@ -118,7 +118,7 @@ private[kafka010] abstract class KafkaRowWriter( case StringType | BinaryType => // good case t => throw new IllegalStateException(s"${KafkaWriter.KEY_ATTRIBUTE_NAME} " + - s"attribute unsupported type $t") + s"attribute unsupported type ${t.simpleString}") } val valueExpression = inputSchema .find(_.name == KafkaWriter.VALUE_ATTRIBUTE_NAME).getOrElse( @@ -129,7 +129,7 @@ private[kafka010] abstract class KafkaRowWriter( case StringType | BinaryType => // good case t => throw new IllegalStateException(s"${KafkaWriter.VALUE_ATTRIBUTE_NAME} " + - s"attribute unsupported type $t") + s"attribute unsupported type ${t.simpleString}") } UnsafeProjection.create( Seq(topicExpression, Cast(keyExpression, BinaryType), diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala index 15cd44812cb0c..3ec26e9edd353 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala @@ -57,7 +57,7 @@ private[kafka010] object KafkaWriter extends Logging { ).dataType match { case StringType => // good case _ => - throw new AnalysisException(s"Topic type must be a String") + throw new AnalysisException(s"Topic type must be a ${StringType.simpleString}") } schema.find(_.name == KEY_ATTRIBUTE_NAME).getOrElse( Literal(null, StringType) @@ -65,7 +65,7 @@ private[kafka010] object KafkaWriter extends Logging { case StringType | BinaryType => // good case _ => throw new AnalysisException(s"$KEY_ATTRIBUTE_NAME attribute type " + - s"must be a String or BinaryType") + s"must be a ${StringType.simpleString} or ${BinaryType.simpleString}") } schema.find(_.name == VALUE_ATTRIBUTE_NAME).getOrElse( throw new AnalysisException(s"Required attribute '$VALUE_ATTRIBUTE_NAME' not found") @@ -73,7 +73,7 @@ private[kafka010] object KafkaWriter extends Logging { case StringType | BinaryType => // good case _ => throw new AnalysisException(s"$VALUE_ATTRIBUTE_NAME attribute type " + - s"must be a String or BinaryType") + s"must be a ${StringType.simpleString} or ${BinaryType.simpleString}") } } diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala index ddfc0c1a4be2d..0e1492ac27449 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala @@ -314,7 +314,7 @@ class KafkaContinuousSinkSuite extends KafkaContinuousTest { writer.stop() } assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "value attribute type must be a string or binarytype")) + "value attribute type must be a string or binary")) try { /* key field wrong type */ @@ -330,7 +330,7 @@ class KafkaContinuousSinkSuite extends KafkaContinuousTest { writer.stop() } assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "key 
attribute type must be a string or binarytype")) + "key attribute type must be a string or binary")) } test("streaming - write to non-existing topic") { diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala index 7079ac6453ffc..70ffd7dee89d7 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala @@ -303,7 +303,7 @@ class KafkaSinkSuite extends StreamTest with SharedSQLContext { writer.stop() } assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "value attribute type must be a string or binarytype")) + "value attribute type must be a string or binary")) try { ex = intercept[StreamingQueryException] { @@ -318,7 +318,7 @@ class KafkaSinkSuite extends StreamTest with SharedSQLContext { writer.stop() } assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "key attribute type must be a string or binarytype")) + "key attribute type must be a string or binary")) } test("streaming - write to non-existing topic") { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala index 682787a830113..1eac1d1613d2a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala @@ -69,7 +69,8 @@ class DCT @Since("1.5.0") (@Since("1.5.0") override val uid: String) } override protected def validateInputType(inputType: DataType): Unit = { - require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.") + require(inputType.isInstanceOf[VectorUDT], + s"Input type must be ${(new VectorUDT).simpleString} but got ${inputType.simpleString}.") } override protected def outputDataType: DataType = new VectorUDT diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala index d67e4819b161a..405ea467cb02a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala @@ -208,8 +208,9 @@ class FeatureHasher(@Since("2.3.0") override val uid: String) extends Transforme require(dataType.isInstanceOf[NumericType] || dataType.isInstanceOf[StringType] || dataType.isInstanceOf[BooleanType], - s"FeatureHasher requires columns to be of NumericType, BooleanType or StringType. " + - s"Column $fieldName was $dataType") + s"FeatureHasher requires columns to be of ${NumericType.simpleString}, " + + s"${BooleanType.simpleString} or ${StringType.simpleString}. 
" + + s"Column $fieldName was ${dataType.simpleString}") } val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala index db432b6fefaff..403b0a813aedd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala @@ -104,7 +104,7 @@ class HashingTF @Since("1.4.0") (@Since("1.4.0") override val uid: String) override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], - s"The input column must be ArrayType, but got $inputType.") + s"The input column must be ${ArrayType.simpleString}, but got ${inputType.simpleString}.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala index 4ff1d0ef356f3..5e01ec30bb2eb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala @@ -261,7 +261,8 @@ private[ml] class FeatureEncoder(numFeatures: Array[Int]) extends Serializable { */ def foreachNonzeroOutput(value: Any, f: (Int, Double) => Unit): Unit = value match { case d: Double => - assert(numFeatures.length == 1, "DoubleType columns should only contain one feature.") + assert(numFeatures.length == 1, + s"${DoubleType.simpleString} columns should only contain one feature.") val numOutputCols = numFeatures.head if (numOutputCols > 1) { assert( diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala index c8760f9dc178f..6445360f7fd90 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala @@ -65,7 +65,7 @@ class NGram @Since("1.5.0") (@Since("1.5.0") override val uid: String) override protected def validateInputType(inputType: DataType): Unit = { require(inputType.sameType(ArrayType(StringType)), - s"Input type must be ArrayType(StringType) but got $inputType.") + s"Input type must be ${ArrayType(StringType).simpleString} but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, false) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala index 5ab6c2dde667a..24045f0448c81 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala @@ -85,7 +85,8 @@ class OneHotEncoder @Since("1.4.0") (@Since("1.4.0") override val uid: String) e val inputFields = schema.fields require(schema(inputColName).dataType.isInstanceOf[NumericType], - s"Input column must be of type NumericType but got ${schema(inputColName).dataType}") + s"Input column must be of type ${NumericType.simpleString} but got " + + schema(inputColName).dataType.simpleString) require(!inputFields.exists(_.name == outputColName), s"Output column $outputColName already exists.") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 55e595eee6ffb..346e1823f00b8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -394,7 +394,7 @@ class RFormulaModel private[feature]( require(!columnNames.contains($(featuresCol)), "Features column already exists.") require( !columnNames.contains($(labelCol)) || schema($(labelCol)).dataType.isInstanceOf[NumericType], - "Label column already exists and is not of type NumericType.") + s"Label column already exists and is not of type ${NumericType.simpleString}.") } @Since("2.0.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 0f946dd2e015b..ead75d5b8def3 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -131,8 +131,8 @@ class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String @Since("1.5.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType - require(inputType.sameType(ArrayType(StringType)), - s"Input type must be ArrayType(StringType) but got $inputType.") + require(inputType.sameType(ArrayType(StringType)), "Input type must be " + + s"${ArrayType(StringType).simpleString} but got ${inputType.simpleString}.") SchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index cfaf6c0e610b3..5132f63af1796 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -40,7 +40,8 @@ class Tokenizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) } override protected def validateInputType(inputType: DataType): Unit = { - require(inputType == StringType, s"Input type must be string type but got $inputType.") + require(inputType == StringType, + s"Input type must be ${StringType.simpleString} type but got ${inputType.simpleString}.") } override protected def outputDataType: DataType = new ArrayType(StringType, true) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 4061154b39c14..ed3b36ee5ab2f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -162,7 +162,7 @@ class VectorAssembler @Since("1.4.0") (@Since("1.4.0") override val uid: String) schema(name).dataType match { case _: NumericType | BooleanType => None case t if t.isInstanceOf[VectorUDT] => None - case other => Some(s"Data type $other of column $name is not supported.") + case other => Some(s"Data type ${other.simpleString} of column $name is not supported.") } } if (incorrectColumns.nonEmpty) { diff --git a/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala index d7fbe28ae7a64..51b88b3117f4e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala @@ -106,7 +106,7 @@ private[fpm] trait FPGrowthParams extends Params 
with HasPredictionCol { protected def validateAndTransformSchema(schema: StructType): StructType = { val inputType = schema($(itemsCol)).dataType require(inputType.isInstanceOf[ArrayType], - s"The input column must be ArrayType, but got $inputType.") + s"The input column must be ${ArrayType.simpleString}, but got ${inputType.simpleString}.") SchemaUtils.appendColumn(schema, $(predictionCol), schema($(itemsCol)).dataType) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala index d9a3f85ef9a24..b500582074398 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala @@ -41,7 +41,8 @@ private[spark] object SchemaUtils { val actualDataType = schema(colName).dataType val message = if (msg != null && msg.trim.length > 0) " " + msg else "" require(actualDataType.equals(dataType), - s"Column $colName must be of type $dataType but was actually $actualDataType.$message") + s"Column $colName must be of type ${dataType.simpleString} but was actually " + + s"${actualDataType.simpleString}.$message") } /** @@ -58,7 +59,8 @@ private[spark] object SchemaUtils { val message = if (msg != null && msg.trim.length > 0) " " + msg else "" require(dataTypes.exists(actualDataType.equals), s"Column $colName must be of type equal to one of the following types: " + - s"${dataTypes.mkString("[", ", ", "]")} but was actually of type $actualDataType.$message") + s"${dataTypes.map(_.simpleString).mkString("[", ", ", "]")} but was actually of type " + + s"${actualDataType.simpleString}.$message") } /** @@ -71,8 +73,9 @@ private[spark] object SchemaUtils { msg: String = ""): Unit = { val actualDataType = schema(colName).dataType val message = if (msg != null && msg.trim.length > 0) " " + msg else "" - require(actualDataType.isInstanceOf[NumericType], s"Column $colName must be of type " + - s"NumericType but was actually of type $actualDataType.$message") + require(actualDataType.isInstanceOf[NumericType], + s"Column $colName must be of type ${NumericType.simpleString} but was actually of type " + + s"${actualDataType.simpleString}.$message") } /** diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluatorSuite.scala index ede284712b1c0..2b0909acf69c3 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluatorSuite.scala @@ -67,8 +67,8 @@ class BinaryClassificationEvaluatorSuite evaluator.evaluate(stringDF) } assert(thrown.getMessage.replace("\n", "") contains "Column rawPrediction must be of type " + - "equal to one of the following types: [DoubleType, ") - assert(thrown.getMessage.replace("\n", "") contains "but was actually of type StringType.") + "equal to one of the following types: [double, ") + assert(thrown.getMessage.replace("\n", "") contains "but was actually of type string.") } test("should support all NumericType labels and not support other types") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala index a250331efeb1d..0de6528c4cf22 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala +++ 
b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala @@ -105,7 +105,7 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { testTransformerByInterceptingException[(Int, Boolean)]( original, model, - "Label column already exists and is not of type NumericType.", + "Label column already exists and is not of type numeric.", "x") } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala index 91fb24a268b8c..ed15a1d88a269 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala @@ -99,9 +99,9 @@ class VectorAssemblerSuite assembler.transform(df) } assert(thrown.getMessage contains - "Data type StringType of column a is not supported.\n" + - "Data type StringType of column b is not supported.\n" + - "Data type StringType of column c is not supported.") + "Data type string of column a is not supported.\n" + + "Data type string of column b is not supported.\n" + + "Data type string of column c is not supported.") } test("ML attributes") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala index e3dfe2faf5698..65bee4edc4965 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala @@ -612,7 +612,7 @@ class ALSSuite extends MLTest with DefaultReadWriteTest with Logging { estimator.fit(strDF) } assert(thrown.getMessage.contains( - s"$column must be of type NumericType but was actually of type StringType")) + s"$column must be of type numeric but was actually of type string")) } private class NumericTypeWithEncoder[A](val numericType: NumericType) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala index 4e4ff71c9de90..6cc73e040e82c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala @@ -385,7 +385,7 @@ class AFTSurvivalRegressionSuite extends MLTest with DefaultReadWriteTest { aft.fit(dfWithStringCensors) } assert(thrown.getMessage.contains( - "Column censor must be of type NumericType but was actually of type StringType")) + "Column censor must be of type numeric but was actually of type string")) } test("numerical stability of standardization") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala b/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala index 5e72b4d864c1d..91a8b14625a86 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala @@ -74,7 +74,7 @@ object MLTestingUtils extends SparkFunSuite { estimator.fit(dfWithStringLabels) } assert(thrown.getMessage.contains( - "Column label must be of type NumericType but was actually of type StringType")) + "Column label must be of type numeric but was actually of type string")) estimator match { case weighted: Estimator[M] with HasWeightCol => @@ -86,7 +86,7 @@ object MLTestingUtils extends SparkFunSuite { weighted.fit(dfWithStringWeights) } assert(thrown.getMessage.contains( - "Column weight 
must be of type NumericType but was actually of type StringType")) + "Column weight must be of type numeric but was actually of type string")) case _ => } } @@ -104,7 +104,7 @@ object MLTestingUtils extends SparkFunSuite { evaluator.evaluate(dfWithStringLabels) } assert(thrown.getMessage.contains( - "Column label must be of type NumericType but was actually of type StringType")) + "Column label must be of type numeric but was actually of type string")) } def genClassifDFWithNumericLabelCol( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 0a5f8a907b50a..cf0e3765de80f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -385,8 +385,8 @@ trait CreateNamedStructLike extends Expression { val invalidNames = nameExprs.filterNot(e => e.foldable && e.dataType == StringType) if (invalidNames.nonEmpty) { TypeCheckResult.TypeCheckFailure( - "Only foldable StringType expressions are allowed to appear at odd position, got:" + - s" ${invalidNames.mkString(",")}") + s"Only foldable ${StringType.simpleString} expressions are allowed to appear at odd" + + s" position, got: ${invalidNames.mkString(",")}") } else if (!names.contains(null)) { TypeCheckResult.TypeCheckSuccess } else { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 8cd86053a01c7..1bcf11d7ee737 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -796,7 +796,7 @@ object JsonExprUtils { } case m: CreateMap => throw new AnalysisException( - s"A type of keys and values in map() must be string, but got ${m.dataType}") + s"A type of keys and values in map() must be string, but got ${m.dataType.simpleString}") case _ => throw new AnalysisException("Must use a map() function for options") } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index bedad7da334ae..70dd4df9df511 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -222,11 +222,12 @@ case class Elt(children: Seq[Expression]) extends Expression { val (indexType, inputTypes) = (indexExpr.dataType, inputExprs.map(_.dataType)) if (indexType != IntegerType) { return TypeCheckResult.TypeCheckFailure(s"first input to function $prettyName should " + - s"have IntegerType, but it's $indexType") + s"have ${IntegerType.simpleString}, but it's ${indexType.simpleString}") } if (inputTypes.exists(tpe => !Seq(StringType, BinaryType).contains(tpe))) { return TypeCheckResult.TypeCheckFailure( - s"input to function $prettyName should have StringType or BinaryType, but it's " + + s"input to function $prettyName should have ${StringType.simpleString} or " + + s"${BinaryType.simpleString}, but it's " + inputTypes.map(_.simpleString).mkString("[", ", ", "]")) } 
TypeUtils.checkForSameTypeInputExpr(inputTypes, s"function $prettyName") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala index 9c413de752a8c..00086abbefd08 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala @@ -45,8 +45,8 @@ private[sql] class JacksonGenerator( // `JackGenerator` can only be initialized with a `StructType` or a `MapType`. require(dataType.isInstanceOf[StructType] || dataType.isInstanceOf[MapType], - "JacksonGenerator only supports to be initialized with a StructType " + - s"or MapType but got ${dataType.simpleString}") + s"JacksonGenerator only supports to be initialized with a ${StructType.simpleString} " + + s"or ${MapType.simpleString} but got ${dataType.simpleString}") // `ValueWriter`s for all fields of the schema private lazy val rootFieldWriters: Array[ValueWriter] = dataType match { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index c3a4ca8f64bf6..aa1691bb40d93 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -143,7 +143,8 @@ class JacksonParser( case "NaN" => Float.NaN case "Infinity" => Float.PositiveInfinity case "-Infinity" => Float.NegativeInfinity - case other => throw new RuntimeException(s"Cannot parse $other as FloatType.") + case other => throw new RuntimeException( + s"Cannot parse $other as ${FloatType.simpleString}.") } } @@ -158,7 +159,8 @@ class JacksonParser( case "NaN" => Double.NaN case "Infinity" => Double.PositiveInfinity case "-Infinity" => Double.NegativeInfinity - case other => throw new RuntimeException(s"Cannot parse $other as DoubleType.") + case other => + throw new RuntimeException(s"Cannot parse $other as ${DoubleType.simpleString}.") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala index 491ca005877f8..5f70e062d46c8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala @@ -294,8 +294,10 @@ private[sql] object JsonInferSchema { // Both fields1 and fields2 should be sorted by name, since inferField performs sorting. // Therefore, we can take advantage of the fact that we're merging sorted lists and skip // building a hash map or performing additional sorting. 
- assert(isSorted(fields1), s"StructType's fields were not sorted: ${fields1.toSeq}") - assert(isSorted(fields2), s"StructType's fields were not sorted: ${fields2.toSeq}") + assert(isSorted(fields1), + s"${StructType.simpleString}'s fields were not sorted: ${fields1.toSeq}") + assert(isSorted(fields2), + s"${StructType.simpleString}'s fields were not sorted: ${fields2.toSeq}") val newFields = new java.util.ArrayList[StructField]() diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala index 1dcda49a3af6a..a9aaf617f7837 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala @@ -29,7 +29,7 @@ object TypeUtils { if (dt.isInstanceOf[NumericType] || dt == NullType) { TypeCheckResult.TypeCheckSuccess } else { - TypeCheckResult.TypeCheckFailure(s"$caller requires numeric types, not $dt") + TypeCheckResult.TypeCheckFailure(s"$caller requires numeric types, not ${dt.simpleString}") } } @@ -37,7 +37,8 @@ object TypeUtils { if (RowOrdering.isOrderable(dt)) { TypeCheckResult.TypeCheckSuccess } else { - TypeCheckResult.TypeCheckFailure(s"$caller does not support ordering on type $dt") + TypeCheckResult.TypeCheckFailure( + s"$caller does not support ordering on type ${dt.simpleString}") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala index 3041f44b116ea..c43cc748655e8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala @@ -145,7 +145,7 @@ abstract class NumericType extends AtomicType { } -private[sql] object NumericType extends AbstractDataType { +private[spark] object NumericType extends AbstractDataType { /** * Enables matching against NumericType for expressions: * {{{ @@ -155,11 +155,12 @@ private[sql] object NumericType extends AbstractDataType { */ def unapply(e: Expression): Boolean = e.dataType.isInstanceOf[NumericType] - override private[sql] def defaultConcreteType: DataType = DoubleType + override private[spark] def defaultConcreteType: DataType = DoubleType - override private[sql] def simpleString: String = "numeric" + override private[spark] def simpleString: String = "numeric" - override private[sql] def acceptsType(other: DataType): Boolean = other.isInstanceOf[NumericType] + override private[spark] def acceptsType(other: DataType): Boolean = + other.isInstanceOf[NumericType] } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala index 38c40482fa4d9..8f118624f6d2f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala @@ -42,7 +42,7 @@ object ArrayType extends AbstractDataType { other.isInstanceOf[ArrayType] } - override private[sql] def simpleString: String = "array" + override private[spark] def simpleString: String = "array" } /** @@ -103,7 +103,8 @@ case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataT case a : ArrayType => a.interpretedOrdering.asInstanceOf[Ordering[Any]] case s: StructType => s.interpretedOrdering.asInstanceOf[Ordering[Any]] case other => - throw new 
IllegalArgumentException(s"Type $other does not support ordered operations") + throw new IllegalArgumentException( + s"Type ${other.simpleString} does not support ordered operations") } def compare(x: ArrayData, y: ArrayData): Int = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala index dbf51c398fa47..f780ffd46a876 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala @@ -48,7 +48,8 @@ case class DecimalType(precision: Int, scale: Int) extends FractionalType { } if (precision > DecimalType.MAX_PRECISION) { - throw new AnalysisException(s"DecimalType can only support precision up to 38") + throw new AnalysisException( + s"${DecimalType.simpleString} can only support precision up to ${DecimalType.MAX_PRECISION}") } // default constructor for Java diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala index 2d49fe076786a..203e85e1c99bd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala @@ -24,7 +24,8 @@ import org.apache.spark.annotation.InterfaceStability @InterfaceStability.Evolving object ObjectType extends AbstractDataType { override private[sql] def defaultConcreteType: DataType = - throw new UnsupportedOperationException("null literals can't be casted to ObjectType") + throw new UnsupportedOperationException( + s"null literals can't be casted to ${ObjectType.simpleString}") override private[sql] def acceptsType(other: DataType): Boolean = other match { case ObjectType(_) => true diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala index 362676b252126..0e69ef8ba73e8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala @@ -426,7 +426,7 @@ object StructType extends AbstractDataType { private[sql] def fromString(raw: String): StructType = { Try(DataType.fromJson(raw)).getOrElse(LegacyTypeStringParser.parse(raw)) match { case t: StructType => t - case _ => throw new RuntimeException(s"Failed parsing StructType: $raw") + case _ => throw new RuntimeException(s"Failed parsing ${StructType.simpleString}: $raw") } } @@ -528,7 +528,8 @@ object StructType extends AbstractDataType { leftType case _ => - throw new SparkException(s"Failed to merge incompatible data types $left and $right") + throw new SparkException(s"Failed to merge incompatible data types ${left.simpleString} " + + s"and ${right.simpleString}") } private[sql] def fieldsMap(fields: Array[StructField]): Map[String, StructField] = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index 5d2f8e735e3d4..5e503be416a1f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -514,7 +514,7 @@ class AnalysisErrorSuite extends AnalysisTest { right, joinType = Cross, condition = Some('b === 'd)) - 
assertAnalysisError(plan2, "EqualTo does not support ordering on type MapType" :: Nil) + assertAnalysisError(plan2, "EqualTo does not support ordering on type map" :: Nil) } test("PredicateSubQuery is used outside of a filter") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala index 36714bd631b0e..8eec14842c7e7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala @@ -109,17 +109,17 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite { assertErrorForDifferingTypes(GreaterThan('intField, 'booleanField)) assertErrorForDifferingTypes(GreaterThanOrEqual('intField, 'booleanField)) - assertError(EqualTo('mapField, 'mapField), "EqualTo does not support ordering on type MapType") + assertError(EqualTo('mapField, 'mapField), "EqualTo does not support ordering on type map") assertError(EqualNullSafe('mapField, 'mapField), - "EqualNullSafe does not support ordering on type MapType") + "EqualNullSafe does not support ordering on type map") assertError(LessThan('mapField, 'mapField), - "LessThan does not support ordering on type MapType") + "LessThan does not support ordering on type map") assertError(LessThanOrEqual('mapField, 'mapField), - "LessThanOrEqual does not support ordering on type MapType") + "LessThanOrEqual does not support ordering on type map") assertError(GreaterThan('mapField, 'mapField), - "GreaterThan does not support ordering on type MapType") + "GreaterThan does not support ordering on type map") assertError(GreaterThanOrEqual('mapField, 'mapField), - "GreaterThanOrEqual does not support ordering on type MapType") + "GreaterThanOrEqual does not support ordering on type map") assertError(If('intField, 'stringField, 'stringField), "type of predicate expression in If should be boolean") @@ -169,10 +169,10 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite { CreateNamedStruct(Seq("a", "b", 2.0)), "even number of arguments") assertError( CreateNamedStruct(Seq(1, "a", "b", 2.0)), - "Only foldable StringType expressions are allowed to appear at odd position") + "Only foldable string expressions are allowed to appear at odd position") assertError( CreateNamedStruct(Seq('a.string.at(0), "a", "b", 2.0)), - "Only foldable StringType expressions are allowed to appear at odd position") + "Only foldable string expressions are allowed to appear at odd position") assertError( CreateNamedStruct(Seq(Literal.create(null, StringType), "a")), "Field name should not be null") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala index cb8a1fecb80a7..b4d422d8506fc 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala @@ -469,7 +469,7 @@ class ExpressionParserSuite extends PlanTest { Literal(BigDecimal("90912830918230182310293801923652346786").underlying())) assertEqual("123.0E-28BD", Literal(BigDecimal("123.0E-28").underlying())) assertEqual("123.08BD", Literal(BigDecimal("123.08").underlying())) - intercept("1.20E-38BD", "DecimalType can only support precision up to 
38") + intercept("1.20E-38BD", "decimal can only support precision up to 38") } test("strings") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala index 5a86f4055dce7..fccd057e577d4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala @@ -154,7 +154,7 @@ class DataTypeSuite extends SparkFunSuite { left.merge(right) }.getMessage assert(message.equals("Failed to merge fields 'b' and 'b'. " + - "Failed to merge incompatible data types FloatType and LongType")) + "Failed to merge incompatible data types float and bigint")) } test("existsRecursively") { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index d5969b55eef96..060e2ec068053 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -244,7 +244,7 @@ private SchemaColumnConvertNotSupportedException constructConvertNotSupportedExc return new SchemaColumnConvertNotSupportedException( Arrays.toString(descriptor.getPath()), descriptor.getType().toString(), - column.dataType().toString()); + column.dataType().simpleString()); } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index c6449cd5a16b0..b068493f2dd17 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -452,7 +452,7 @@ class RelationalGroupedDataset protected[sql]( require(expr.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, "Must pass a grouped map udf") require(expr.dataType.isInstanceOf[StructType], - "The returnType of the udf must be a StructType") + s"The returnType of the udf must be a ${StructType.simpleString}") val groupingNamedExpressions = groupingExprs.map { case ne: NamedExpression => ne diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowUtils.scala index 93c8127681b3e..1274abffaa116 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowUtils.scala @@ -47,7 +47,8 @@ object ArrowUtils { case DateType => new ArrowType.Date(DateUnit.DAY) case TimestampType => if (timeZoneId == null) { - throw new UnsupportedOperationException("TimestampType must supply timeZoneId parameter") + throw new UnsupportedOperationException( + s"${TimestampType.simpleString} must supply timeZoneId parameter") } else { new ArrowType.Timestamp(TimeUnit.MICROSECOND, timeZoneId) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala index 4f44ae4fa1d71..c90328f7ad43f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala @@ -98,7 +98,7 @@ private[orc] object OrcFilters { case DateType => PredicateLeaf.Type.DATE case TimestampType => PredicateLeaf.Type.TIMESTAMP case _: DecimalType => PredicateLeaf.Type.DECIMAL - case _ => throw new UnsupportedOperationException(s"DataType: $dataType") + case _ => throw new UnsupportedOperationException(s"DataType: ${dataType.simpleString}") } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala index c61be077d309f..18decad3f62f0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala @@ -555,7 +555,7 @@ class SparkToParquetSchemaConverter( convertField(field.copy(dataType = udt.sqlType)) case _ => - throw new AnalysisException(s"Unsupported data type $field.dataType") + throw new AnalysisException(s"Unsupported data type ${field.dataType.simpleString}") } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala index 685d5841ab551..f772a3336d6af 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala @@ -157,7 +157,7 @@ object StatFunctions extends Logging { cols.map(name => (name, df.schema.fields.find(_.name == name))).foreach { case (name, data) => require(data.nonEmpty, s"Couldn't find column with name $name") require(data.get.dataType.isInstanceOf[NumericType], s"Currently $functionName calculation " + - s"for columns with dataType ${data.get.dataType} not supported.") + s"for columns with dataType ${data.get.dataType.simpleString} not supported.") } val columns = cols.map(n => Column(Cast(Column(n).expr, DoubleType))) df.select(columns: _*).queryExecution.toRdd.treeAggregate(new CovarianceCounter)( diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out index 3d49323751a10..827931d74138d 100644 --- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out @@ -120,7 +120,7 @@ select to_json(named_struct('a', 1, 'b', 2), map('mode', 1)) struct<> -- !query 11 output org.apache.spark.sql.AnalysisException -A type of keys and values in map() must be string, but got MapType(StringType,IntegerType,false);; line 1 pos 7 +A type of keys and values in map() must be string, but got map;; line 1 pos 7 -- !query 12 @@ -216,7 +216,7 @@ select from_json('{"a":1}', 'a INT', map('mode', 1)) struct<> -- !query 20 output org.apache.spark.sql.AnalysisException -A type of keys and values in map() must be string, but got MapType(StringType,IntegerType,false);; line 1 pos 7 +A type of keys and values in map() must be string, but got map;; line 1 pos 7 -- !query 21 diff --git a/sql/core/src/test/resources/sql-tests/results/literals.sql.out b/sql/core/src/test/resources/sql-tests/results/literals.sql.out index b8c91dc8b59a4..7f301614523b2 100644 --- a/sql/core/src/test/resources/sql-tests/results/literals.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/literals.sql.out @@ -147,7 +147,7 @@ struct<> -- !query 15 output org.apache.spark.sql.catalyst.parser.ParseException -DecimalType can only support precision up to 38 +decimal can only support precision up to 38 == SQL == select 1234567890123456789012345678901234567890 @@ -159,7 +159,7 @@ struct<> -- !query 16 output org.apache.spark.sql.catalyst.parser.ParseException -DecimalType can only support precision up to 38 +decimal can only support precision up to 38 == SQL == select 1234567890123456789012345678901234567890.0 @@ -379,7 +379,7 @@ struct<> -- !query 39 output org.apache.spark.sql.catalyst.parser.ParseException -DecimalType can only support precision up to 38(line 1, pos 7) +decimal can only support precision up to 38(line 1, pos 7) == SQL == select 1.20E-38BD diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala index 9d3dfae348beb..368e52cfbda9c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala @@ -430,9 +430,9 @@ class ParquetSchemaSuite extends ParquetSchemaTest { val col = spark.read.parquet(file).schema.fields.filter(_.name.equals("a")) assert(col.length == 1) if (col(0).dataType == StringType) { - assert(errMsg.contains("Column: [a], Expected: IntegerType, Found: BINARY")) + assert(errMsg.contains("Column: [a], Expected: int, Found: BINARY")) } else { - assert(errMsg.endsWith("Column: [a], Expected: StringType, Found: INT32")) + assert(errMsg.endsWith("Column: [a], Expected: string, Found: INT32")) } } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala index 7dcaf170f9693..40be4e8c1f5be 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala @@ -78,9 +78,9 @@ case class HiveTableScanExec( // Bind all partition key attribute references in the partition pruning predicate for later // evaluation. private lazy val boundPruningPred = partitionPruningPred.reduceLeftOption(And).map { pred => - require( - pred.dataType == BooleanType, - s"Data type of predicate $pred must be BooleanType rather than ${pred.dataType}.") + require(pred.dataType == BooleanType, + s"Data type of predicate $pred must be ${BooleanType.simpleString} rather than " + + s"${pred.dataType.simpleString}.") BindReferences.bindReference(pred, relation.partitionCols) } From aec966b05e8df9d459dae88d091de1923e50e2dc Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Mon, 9 Jul 2018 14:24:23 -0700 Subject: [PATCH 76/79] Revert "[SPARK-24268][SQL] Use datatype.simpleString in error messages" This reverts commit 1bd3d61f4191767a94b71b42f4d00706b703e84f. 
--- .../spark/sql/kafka010/KafkaWriteTask.scala | 6 +++--- .../apache/spark/sql/kafka010/KafkaWriter.scala | 6 +++--- .../sql/kafka010/KafkaContinuousSinkSuite.scala | 4 ++-- .../spark/sql/kafka010/KafkaSinkSuite.scala | 4 ++-- .../scala/org/apache/spark/ml/feature/DCT.scala | 3 +-- .../apache/spark/ml/feature/FeatureHasher.scala | 5 ++--- .../org/apache/spark/ml/feature/HashingTF.scala | 2 +- .../apache/spark/ml/feature/Interaction.scala | 3 +-- .../org/apache/spark/ml/feature/NGram.scala | 2 +- .../apache/spark/ml/feature/OneHotEncoder.scala | 3 +-- .../org/apache/spark/ml/feature/RFormula.scala | 2 +- .../spark/ml/feature/StopWordsRemover.scala | 4 ++-- .../org/apache/spark/ml/feature/Tokenizer.scala | 3 +-- .../spark/ml/feature/VectorAssembler.scala | 2 +- .../scala/org/apache/spark/ml/fpm/FPGrowth.scala | 2 +- .../org/apache/spark/ml/util/SchemaUtils.scala | 11 ++++------- .../BinaryClassificationEvaluatorSuite.scala | 4 ++-- .../apache/spark/ml/feature/RFormulaSuite.scala | 2 +- .../spark/ml/feature/VectorAssemblerSuite.scala | 6 +++--- .../spark/ml/recommendation/ALSSuite.scala | 2 +- .../regression/AFTSurvivalRegressionSuite.scala | 2 +- .../apache/spark/ml/util/MLTestingUtils.scala | 6 +++--- .../expressions/complexTypeCreator.scala | 4 ++-- .../catalyst/expressions/jsonExpressions.scala | 2 +- .../catalyst/expressions/stringExpressions.scala | 5 ++--- .../sql/catalyst/json/JacksonGenerator.scala | 4 ++-- .../spark/sql/catalyst/json/JacksonParser.scala | 6 ++---- .../sql/catalyst/json/JsonInferSchema.scala | 6 ++---- .../spark/sql/catalyst/util/TypeUtils.scala | 5 ++--- .../spark/sql/types/AbstractDataType.scala | 9 ++++----- .../org/apache/spark/sql/types/ArrayType.scala | 5 ++--- .../org/apache/spark/sql/types/DecimalType.scala | 3 +-- .../org/apache/spark/sql/types/ObjectType.scala | 3 +-- .../org/apache/spark/sql/types/StructType.scala | 5 ++--- .../catalyst/analysis/AnalysisErrorSuite.scala | 2 +- .../analysis/ExpressionTypeCheckingSuite.scala | 16 ++++++++-------- .../catalyst/parser/ExpressionParserSuite.scala | 2 +- .../apache/spark/sql/types/DataTypeSuite.scala | 2 +- .../parquet/VectorizedColumnReader.java | 2 +- .../spark/sql/RelationalGroupedDataset.scala | 2 +- .../spark/sql/execution/arrow/ArrowUtils.scala | 3 +-- .../execution/datasources/orc/OrcFilters.scala | 2 +- .../parquet/ParquetSchemaConverter.scala | 2 +- .../spark/sql/execution/stat/StatFunctions.scala | 2 +- .../sql-tests/results/json-functions.sql.out | 4 ++-- .../resources/sql-tests/results/literals.sql.out | 6 +++--- .../datasources/parquet/ParquetSchemaSuite.scala | 4 ++-- .../sql/hive/execution/HiveTableScanExec.scala | 6 +++--- 48 files changed, 88 insertions(+), 108 deletions(-) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala index 59a84706d4f55..d90630a8adc93 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala @@ -110,7 +110,7 @@ private[kafka010] abstract class KafkaRowWriter( case t => throw new IllegalStateException(s"${KafkaWriter.TOPIC_ATTRIBUTE_NAME} " + s"attribute unsupported type $t. 
${KafkaWriter.TOPIC_ATTRIBUTE_NAME} " + - s"must be a ${StringType.simpleString}") + "must be a StringType") } val keyExpression = inputSchema.find(_.name == KafkaWriter.KEY_ATTRIBUTE_NAME) .getOrElse(Literal(null, BinaryType)) @@ -118,7 +118,7 @@ private[kafka010] abstract class KafkaRowWriter( case StringType | BinaryType => // good case t => throw new IllegalStateException(s"${KafkaWriter.KEY_ATTRIBUTE_NAME} " + - s"attribute unsupported type ${t.simpleString}") + s"attribute unsupported type $t") } val valueExpression = inputSchema .find(_.name == KafkaWriter.VALUE_ATTRIBUTE_NAME).getOrElse( @@ -129,7 +129,7 @@ private[kafka010] abstract class KafkaRowWriter( case StringType | BinaryType => // good case t => throw new IllegalStateException(s"${KafkaWriter.VALUE_ATTRIBUTE_NAME} " + - s"attribute unsupported type ${t.simpleString}") + s"attribute unsupported type $t") } UnsafeProjection.create( Seq(topicExpression, Cast(keyExpression, BinaryType), diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala index 3ec26e9edd353..15cd44812cb0c 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala @@ -57,7 +57,7 @@ private[kafka010] object KafkaWriter extends Logging { ).dataType match { case StringType => // good case _ => - throw new AnalysisException(s"Topic type must be a ${StringType.simpleString}") + throw new AnalysisException(s"Topic type must be a String") } schema.find(_.name == KEY_ATTRIBUTE_NAME).getOrElse( Literal(null, StringType) @@ -65,7 +65,7 @@ private[kafka010] object KafkaWriter extends Logging { case StringType | BinaryType => // good case _ => throw new AnalysisException(s"$KEY_ATTRIBUTE_NAME attribute type " + - s"must be a ${StringType.simpleString} or ${BinaryType.simpleString}") + s"must be a String or BinaryType") } schema.find(_.name == VALUE_ATTRIBUTE_NAME).getOrElse( throw new AnalysisException(s"Required attribute '$VALUE_ATTRIBUTE_NAME' not found") @@ -73,7 +73,7 @@ private[kafka010] object KafkaWriter extends Logging { case StringType | BinaryType => // good case _ => throw new AnalysisException(s"$VALUE_ATTRIBUTE_NAME attribute type " + - s"must be a ${StringType.simpleString} or ${BinaryType.simpleString}") + s"must be a String or BinaryType") } } diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala index 0e1492ac27449..ddfc0c1a4be2d 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala @@ -314,7 +314,7 @@ class KafkaContinuousSinkSuite extends KafkaContinuousTest { writer.stop() } assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "value attribute type must be a string or binary")) + "value attribute type must be a string or binarytype")) try { /* key field wrong type */ @@ -330,7 +330,7 @@ class KafkaContinuousSinkSuite extends KafkaContinuousTest { writer.stop() } assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "key attribute type must be a string or binary")) + "key attribute type must be a string or binarytype")) } test("streaming - write to 
non-existing topic") { diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala index 70ffd7dee89d7..7079ac6453ffc 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala @@ -303,7 +303,7 @@ class KafkaSinkSuite extends StreamTest with SharedSQLContext { writer.stop() } assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "value attribute type must be a string or binary")) + "value attribute type must be a string or binarytype")) try { ex = intercept[StreamingQueryException] { @@ -318,7 +318,7 @@ class KafkaSinkSuite extends StreamTest with SharedSQLContext { writer.stop() } assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "key attribute type must be a string or binary")) + "key attribute type must be a string or binarytype")) } test("streaming - write to non-existing topic") { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala index 1eac1d1613d2a..682787a830113 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala @@ -69,8 +69,7 @@ class DCT @Since("1.5.0") (@Since("1.5.0") override val uid: String) } override protected def validateInputType(inputType: DataType): Unit = { - require(inputType.isInstanceOf[VectorUDT], - s"Input type must be ${(new VectorUDT).simpleString} but got ${inputType.simpleString}.") + require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.") } override protected def outputDataType: DataType = new VectorUDT diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala index 405ea467cb02a..d67e4819b161a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala @@ -208,9 +208,8 @@ class FeatureHasher(@Since("2.3.0") override val uid: String) extends Transforme require(dataType.isInstanceOf[NumericType] || dataType.isInstanceOf[StringType] || dataType.isInstanceOf[BooleanType], - s"FeatureHasher requires columns to be of ${NumericType.simpleString}, " + - s"${BooleanType.simpleString} or ${StringType.simpleString}. " + - s"Column $fieldName was ${dataType.simpleString}") + s"FeatureHasher requires columns to be of NumericType, BooleanType or StringType. 
" + + s"Column $fieldName was $dataType") } val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala index 403b0a813aedd..db432b6fefaff 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala @@ -104,7 +104,7 @@ class HashingTF @Since("1.4.0") (@Since("1.4.0") override val uid: String) override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], - s"The input column must be ${ArrayType.simpleString}, but got ${inputType.simpleString}.") + s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala index 5e01ec30bb2eb..4ff1d0ef356f3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala @@ -261,8 +261,7 @@ private[ml] class FeatureEncoder(numFeatures: Array[Int]) extends Serializable { */ def foreachNonzeroOutput(value: Any, f: (Int, Double) => Unit): Unit = value match { case d: Double => - assert(numFeatures.length == 1, - s"${DoubleType.simpleString} columns should only contain one feature.") + assert(numFeatures.length == 1, "DoubleType columns should only contain one feature.") val numOutputCols = numFeatures.head if (numOutputCols > 1) { assert( diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala index 6445360f7fd90..c8760f9dc178f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala @@ -65,7 +65,7 @@ class NGram @Since("1.5.0") (@Since("1.5.0") override val uid: String) override protected def validateInputType(inputType: DataType): Unit = { require(inputType.sameType(ArrayType(StringType)), - s"Input type must be ${ArrayType(StringType).simpleString} but got $inputType.") + s"Input type must be ArrayType(StringType) but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, false) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala index 24045f0448c81..5ab6c2dde667a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala @@ -85,8 +85,7 @@ class OneHotEncoder @Since("1.4.0") (@Since("1.4.0") override val uid: String) e val inputFields = schema.fields require(schema(inputColName).dataType.isInstanceOf[NumericType], - s"Input column must be of type ${NumericType.simpleString} but got " + - schema(inputColName).dataType.simpleString) + s"Input column must be of type NumericType but got ${schema(inputColName).dataType}") require(!inputFields.exists(_.name == outputColName), s"Output column $outputColName already exists.") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 346e1823f00b8..55e595eee6ffb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -394,7 +394,7 @@ class RFormulaModel private[feature]( require(!columnNames.contains($(featuresCol)), "Features column already exists.") require( !columnNames.contains($(labelCol)) || schema($(labelCol)).dataType.isInstanceOf[NumericType], - s"Label column already exists and is not of type ${NumericType.simpleString}.") + "Label column already exists and is not of type NumericType.") } @Since("2.0.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index ead75d5b8def3..0f946dd2e015b 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -131,8 +131,8 @@ class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String @Since("1.5.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType - require(inputType.sameType(ArrayType(StringType)), "Input type must be " + - s"${ArrayType(StringType).simpleString} but got ${inputType.simpleString}.") + require(inputType.sameType(ArrayType(StringType)), + s"Input type must be ArrayType(StringType) but got $inputType.") SchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index 5132f63af1796..cfaf6c0e610b3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -40,8 +40,7 @@ class Tokenizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) } override protected def validateInputType(inputType: DataType): Unit = { - require(inputType == StringType, - s"Input type must be ${StringType.simpleString} type but got ${inputType.simpleString}.") + require(inputType == StringType, s"Input type must be string type but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, true) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index ed3b36ee5ab2f..4061154b39c14 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -162,7 +162,7 @@ class VectorAssembler @Since("1.4.0") (@Since("1.4.0") override val uid: String) schema(name).dataType match { case _: NumericType | BooleanType => None case t if t.isInstanceOf[VectorUDT] => None - case other => Some(s"Data type ${other.simpleString} of column $name is not supported.") + case other => Some(s"Data type $other of column $name is not supported.") } } if (incorrectColumns.nonEmpty) { diff --git a/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala index 51b88b3117f4e..d7fbe28ae7a64 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala @@ -106,7 +106,7 @@ private[fpm] trait FPGrowthParams extends Params 
with HasPredictionCol { protected def validateAndTransformSchema(schema: StructType): StructType = { val inputType = schema($(itemsCol)).dataType require(inputType.isInstanceOf[ArrayType], - s"The input column must be ${ArrayType.simpleString}, but got ${inputType.simpleString}.") + s"The input column must be ArrayType, but got $inputType.") SchemaUtils.appendColumn(schema, $(predictionCol), schema($(itemsCol)).dataType) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala index b500582074398..d9a3f85ef9a24 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala @@ -41,8 +41,7 @@ private[spark] object SchemaUtils { val actualDataType = schema(colName).dataType val message = if (msg != null && msg.trim.length > 0) " " + msg else "" require(actualDataType.equals(dataType), - s"Column $colName must be of type ${dataType.simpleString} but was actually " + - s"${actualDataType.simpleString}.$message") + s"Column $colName must be of type $dataType but was actually $actualDataType.$message") } /** @@ -59,8 +58,7 @@ private[spark] object SchemaUtils { val message = if (msg != null && msg.trim.length > 0) " " + msg else "" require(dataTypes.exists(actualDataType.equals), s"Column $colName must be of type equal to one of the following types: " + - s"${dataTypes.map(_.simpleString).mkString("[", ", ", "]")} but was actually of type " + - s"${actualDataType.simpleString}.$message") + s"${dataTypes.mkString("[", ", ", "]")} but was actually of type $actualDataType.$message") } /** @@ -73,9 +71,8 @@ private[spark] object SchemaUtils { msg: String = ""): Unit = { val actualDataType = schema(colName).dataType val message = if (msg != null && msg.trim.length > 0) " " + msg else "" - require(actualDataType.isInstanceOf[NumericType], - s"Column $colName must be of type ${NumericType.simpleString} but was actually of type " + - s"${actualDataType.simpleString}.$message") + require(actualDataType.isInstanceOf[NumericType], s"Column $colName must be of type " + + s"NumericType but was actually of type $actualDataType.$message") } /** diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluatorSuite.scala index 2b0909acf69c3..ede284712b1c0 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluatorSuite.scala @@ -67,8 +67,8 @@ class BinaryClassificationEvaluatorSuite evaluator.evaluate(stringDF) } assert(thrown.getMessage.replace("\n", "") contains "Column rawPrediction must be of type " + - "equal to one of the following types: [double, ") - assert(thrown.getMessage.replace("\n", "") contains "but was actually of type string.") + "equal to one of the following types: [DoubleType, ") + assert(thrown.getMessage.replace("\n", "") contains "but was actually of type StringType.") } test("should support all NumericType labels and not support other types") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala index 0de6528c4cf22..a250331efeb1d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala +++ 
b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala @@ -105,7 +105,7 @@ class RFormulaSuite extends MLTest with DefaultReadWriteTest { testTransformerByInterceptingException[(Int, Boolean)]( original, model, - "Label column already exists and is not of type numeric.", + "Label column already exists and is not of type NumericType.", "x") } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala index ed15a1d88a269..91fb24a268b8c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala @@ -99,9 +99,9 @@ class VectorAssemblerSuite assembler.transform(df) } assert(thrown.getMessage contains - "Data type string of column a is not supported.\n" + - "Data type string of column b is not supported.\n" + - "Data type string of column c is not supported.") + "Data type StringType of column a is not supported.\n" + + "Data type StringType of column b is not supported.\n" + + "Data type StringType of column c is not supported.") } test("ML attributes") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala index 65bee4edc4965..e3dfe2faf5698 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala @@ -612,7 +612,7 @@ class ALSSuite extends MLTest with DefaultReadWriteTest with Logging { estimator.fit(strDF) } assert(thrown.getMessage.contains( - s"$column must be of type numeric but was actually of type string")) + s"$column must be of type NumericType but was actually of type StringType")) } private class NumericTypeWithEncoder[A](val numericType: NumericType) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala index 6cc73e040e82c..4e4ff71c9de90 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala @@ -385,7 +385,7 @@ class AFTSurvivalRegressionSuite extends MLTest with DefaultReadWriteTest { aft.fit(dfWithStringCensors) } assert(thrown.getMessage.contains( - "Column censor must be of type numeric but was actually of type string")) + "Column censor must be of type NumericType but was actually of type StringType")) } test("numerical stability of standardization") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala b/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala index 91a8b14625a86..5e72b4d864c1d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala @@ -74,7 +74,7 @@ object MLTestingUtils extends SparkFunSuite { estimator.fit(dfWithStringLabels) } assert(thrown.getMessage.contains( - "Column label must be of type numeric but was actually of type string")) + "Column label must be of type NumericType but was actually of type StringType")) estimator match { case weighted: Estimator[M] with HasWeightCol => @@ -86,7 +86,7 @@ object MLTestingUtils extends SparkFunSuite { weighted.fit(dfWithStringWeights) } assert(thrown.getMessage.contains( - "Column weight 
must be of type numeric but was actually of type string")) + "Column weight must be of type NumericType but was actually of type StringType")) case _ => } } @@ -104,7 +104,7 @@ object MLTestingUtils extends SparkFunSuite { evaluator.evaluate(dfWithStringLabels) } assert(thrown.getMessage.contains( - "Column label must be of type numeric but was actually of type string")) + "Column label must be of type NumericType but was actually of type StringType")) } def genClassifDFWithNumericLabelCol( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index cf0e3765de80f..0a5f8a907b50a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -385,8 +385,8 @@ trait CreateNamedStructLike extends Expression { val invalidNames = nameExprs.filterNot(e => e.foldable && e.dataType == StringType) if (invalidNames.nonEmpty) { TypeCheckResult.TypeCheckFailure( - s"Only foldable ${StringType.simpleString} expressions are allowed to appear at odd" + - s" position, got: ${invalidNames.mkString(",")}") + "Only foldable StringType expressions are allowed to appear at odd position, got:" + + s" ${invalidNames.mkString(",")}") } else if (!names.contains(null)) { TypeCheckResult.TypeCheckSuccess } else { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 1bcf11d7ee737..8cd86053a01c7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -796,7 +796,7 @@ object JsonExprUtils { } case m: CreateMap => throw new AnalysisException( - s"A type of keys and values in map() must be string, but got ${m.dataType.simpleString}") + s"A type of keys and values in map() must be string, but got ${m.dataType}") case _ => throw new AnalysisException("Must use a map() function for options") } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 70dd4df9df511..bedad7da334ae 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -222,12 +222,11 @@ case class Elt(children: Seq[Expression]) extends Expression { val (indexType, inputTypes) = (indexExpr.dataType, inputExprs.map(_.dataType)) if (indexType != IntegerType) { return TypeCheckResult.TypeCheckFailure(s"first input to function $prettyName should " + - s"have ${IntegerType.simpleString}, but it's ${indexType.simpleString}") + s"have IntegerType, but it's $indexType") } if (inputTypes.exists(tpe => !Seq(StringType, BinaryType).contains(tpe))) { return TypeCheckResult.TypeCheckFailure( - s"input to function $prettyName should have ${StringType.simpleString} or " + - s"${BinaryType.simpleString}, but it's " + + s"input to function $prettyName should have StringType or BinaryType, but it's " + inputTypes.map(_.simpleString).mkString("[", ", ", "]")) } 
TypeUtils.checkForSameTypeInputExpr(inputTypes, s"function $prettyName") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala index 00086abbefd08..9c413de752a8c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala @@ -45,8 +45,8 @@ private[sql] class JacksonGenerator( // `JackGenerator` can only be initialized with a `StructType` or a `MapType`. require(dataType.isInstanceOf[StructType] || dataType.isInstanceOf[MapType], - s"JacksonGenerator only supports to be initialized with a ${StructType.simpleString} " + - s"or ${MapType.simpleString} but got ${dataType.simpleString}") + "JacksonGenerator only supports to be initialized with a StructType " + + s"or MapType but got ${dataType.simpleString}") // `ValueWriter`s for all fields of the schema private lazy val rootFieldWriters: Array[ValueWriter] = dataType match { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index aa1691bb40d93..c3a4ca8f64bf6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -143,8 +143,7 @@ class JacksonParser( case "NaN" => Float.NaN case "Infinity" => Float.PositiveInfinity case "-Infinity" => Float.NegativeInfinity - case other => throw new RuntimeException( - s"Cannot parse $other as ${FloatType.simpleString}.") + case other => throw new RuntimeException(s"Cannot parse $other as FloatType.") } } @@ -159,8 +158,7 @@ class JacksonParser( case "NaN" => Double.NaN case "Infinity" => Double.PositiveInfinity case "-Infinity" => Double.NegativeInfinity - case other => - throw new RuntimeException(s"Cannot parse $other as ${DoubleType.simpleString}.") + case other => throw new RuntimeException(s"Cannot parse $other as DoubleType.") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala index 5f70e062d46c8..491ca005877f8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala @@ -294,10 +294,8 @@ private[sql] object JsonInferSchema { // Both fields1 and fields2 should be sorted by name, since inferField performs sorting. // Therefore, we can take advantage of the fact that we're merging sorted lists and skip // building a hash map or performing additional sorting. 
- assert(isSorted(fields1), - s"${StructType.simpleString}'s fields were not sorted: ${fields1.toSeq}") - assert(isSorted(fields2), - s"${StructType.simpleString}'s fields were not sorted: ${fields2.toSeq}") + assert(isSorted(fields1), s"StructType's fields were not sorted: ${fields1.toSeq}") + assert(isSorted(fields2), s"StructType's fields were not sorted: ${fields2.toSeq}") val newFields = new java.util.ArrayList[StructField]() diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala index a9aaf617f7837..1dcda49a3af6a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala @@ -29,7 +29,7 @@ object TypeUtils { if (dt.isInstanceOf[NumericType] || dt == NullType) { TypeCheckResult.TypeCheckSuccess } else { - TypeCheckResult.TypeCheckFailure(s"$caller requires numeric types, not ${dt.simpleString}") + TypeCheckResult.TypeCheckFailure(s"$caller requires numeric types, not $dt") } } @@ -37,8 +37,7 @@ object TypeUtils { if (RowOrdering.isOrderable(dt)) { TypeCheckResult.TypeCheckSuccess } else { - TypeCheckResult.TypeCheckFailure( - s"$caller does not support ordering on type ${dt.simpleString}") + TypeCheckResult.TypeCheckFailure(s"$caller does not support ordering on type $dt") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala index c43cc748655e8..3041f44b116ea 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala @@ -145,7 +145,7 @@ abstract class NumericType extends AtomicType { } -private[spark] object NumericType extends AbstractDataType { +private[sql] object NumericType extends AbstractDataType { /** * Enables matching against NumericType for expressions: * {{{ @@ -155,12 +155,11 @@ private[spark] object NumericType extends AbstractDataType { */ def unapply(e: Expression): Boolean = e.dataType.isInstanceOf[NumericType] - override private[spark] def defaultConcreteType: DataType = DoubleType + override private[sql] def defaultConcreteType: DataType = DoubleType - override private[spark] def simpleString: String = "numeric" + override private[sql] def simpleString: String = "numeric" - override private[spark] def acceptsType(other: DataType): Boolean = - other.isInstanceOf[NumericType] + override private[sql] def acceptsType(other: DataType): Boolean = other.isInstanceOf[NumericType] } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala index 8f118624f6d2f..38c40482fa4d9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala @@ -42,7 +42,7 @@ object ArrayType extends AbstractDataType { other.isInstanceOf[ArrayType] } - override private[spark] def simpleString: String = "array" + override private[sql] def simpleString: String = "array" } /** @@ -103,8 +103,7 @@ case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataT case a : ArrayType => a.interpretedOrdering.asInstanceOf[Ordering[Any]] case s: StructType => s.interpretedOrdering.asInstanceOf[Ordering[Any]] case other => - throw new 
IllegalArgumentException( - s"Type ${other.simpleString} does not support ordered operations") + throw new IllegalArgumentException(s"Type $other does not support ordered operations") } def compare(x: ArrayData, y: ArrayData): Int = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala index f780ffd46a876..dbf51c398fa47 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala @@ -48,8 +48,7 @@ case class DecimalType(precision: Int, scale: Int) extends FractionalType { } if (precision > DecimalType.MAX_PRECISION) { - throw new AnalysisException( - s"${DecimalType.simpleString} can only support precision up to ${DecimalType.MAX_PRECISION}") + throw new AnalysisException(s"DecimalType can only support precision up to 38") } // default constructor for Java diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala index 203e85e1c99bd..2d49fe076786a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala @@ -24,8 +24,7 @@ import org.apache.spark.annotation.InterfaceStability @InterfaceStability.Evolving object ObjectType extends AbstractDataType { override private[sql] def defaultConcreteType: DataType = - throw new UnsupportedOperationException( - s"null literals can't be casted to ${ObjectType.simpleString}") + throw new UnsupportedOperationException("null literals can't be casted to ObjectType") override private[sql] def acceptsType(other: DataType): Boolean = other match { case ObjectType(_) => true diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala index 0e69ef8ba73e8..362676b252126 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala @@ -426,7 +426,7 @@ object StructType extends AbstractDataType { private[sql] def fromString(raw: String): StructType = { Try(DataType.fromJson(raw)).getOrElse(LegacyTypeStringParser.parse(raw)) match { case t: StructType => t - case _ => throw new RuntimeException(s"Failed parsing ${StructType.simpleString}: $raw") + case _ => throw new RuntimeException(s"Failed parsing StructType: $raw") } } @@ -528,8 +528,7 @@ object StructType extends AbstractDataType { leftType case _ => - throw new SparkException(s"Failed to merge incompatible data types ${left.simpleString} " + - s"and ${right.simpleString}") + throw new SparkException(s"Failed to merge incompatible data types $left and $right") } private[sql] def fieldsMap(fields: Array[StructField]): Map[String, StructField] = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index 5e503be416a1f..5d2f8e735e3d4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -514,7 +514,7 @@ class AnalysisErrorSuite extends AnalysisTest { right, joinType = Cross, condition = Some('b === 'd)) - 
assertAnalysisError(plan2, "EqualTo does not support ordering on type map" :: Nil) + assertAnalysisError(plan2, "EqualTo does not support ordering on type MapType" :: Nil) } test("PredicateSubQuery is used outside of a filter") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala index 8eec14842c7e7..36714bd631b0e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala @@ -109,17 +109,17 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite { assertErrorForDifferingTypes(GreaterThan('intField, 'booleanField)) assertErrorForDifferingTypes(GreaterThanOrEqual('intField, 'booleanField)) - assertError(EqualTo('mapField, 'mapField), "EqualTo does not support ordering on type map") + assertError(EqualTo('mapField, 'mapField), "EqualTo does not support ordering on type MapType") assertError(EqualNullSafe('mapField, 'mapField), - "EqualNullSafe does not support ordering on type map") + "EqualNullSafe does not support ordering on type MapType") assertError(LessThan('mapField, 'mapField), - "LessThan does not support ordering on type map") + "LessThan does not support ordering on type MapType") assertError(LessThanOrEqual('mapField, 'mapField), - "LessThanOrEqual does not support ordering on type map") + "LessThanOrEqual does not support ordering on type MapType") assertError(GreaterThan('mapField, 'mapField), - "GreaterThan does not support ordering on type map") + "GreaterThan does not support ordering on type MapType") assertError(GreaterThanOrEqual('mapField, 'mapField), - "GreaterThanOrEqual does not support ordering on type map") + "GreaterThanOrEqual does not support ordering on type MapType") assertError(If('intField, 'stringField, 'stringField), "type of predicate expression in If should be boolean") @@ -169,10 +169,10 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite { CreateNamedStruct(Seq("a", "b", 2.0)), "even number of arguments") assertError( CreateNamedStruct(Seq(1, "a", "b", 2.0)), - "Only foldable string expressions are allowed to appear at odd position") + "Only foldable StringType expressions are allowed to appear at odd position") assertError( CreateNamedStruct(Seq('a.string.at(0), "a", "b", 2.0)), - "Only foldable string expressions are allowed to appear at odd position") + "Only foldable StringType expressions are allowed to appear at odd position") assertError( CreateNamedStruct(Seq(Literal.create(null, StringType), "a")), "Field name should not be null") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala index b4d422d8506fc..cb8a1fecb80a7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala @@ -469,7 +469,7 @@ class ExpressionParserSuite extends PlanTest { Literal(BigDecimal("90912830918230182310293801923652346786").underlying())) assertEqual("123.0E-28BD", Literal(BigDecimal("123.0E-28").underlying())) assertEqual("123.08BD", Literal(BigDecimal("123.08").underlying())) - intercept("1.20E-38BD", "decimal can only support precision up to 38") 
+ intercept("1.20E-38BD", "DecimalType can only support precision up to 38") } test("strings") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala index fccd057e577d4..5a86f4055dce7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala @@ -154,7 +154,7 @@ class DataTypeSuite extends SparkFunSuite { left.merge(right) }.getMessage assert(message.equals("Failed to merge fields 'b' and 'b'. " + - "Failed to merge incompatible data types float and bigint")) + "Failed to merge incompatible data types FloatType and LongType")) } test("existsRecursively") { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index 060e2ec068053..d5969b55eef96 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -244,7 +244,7 @@ private SchemaColumnConvertNotSupportedException constructConvertNotSupportedExc return new SchemaColumnConvertNotSupportedException( Arrays.toString(descriptor.getPath()), descriptor.getType().toString(), - column.dataType().simpleString()); + column.dataType().toString()); } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index b068493f2dd17..c6449cd5a16b0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -452,7 +452,7 @@ class RelationalGroupedDataset protected[sql]( require(expr.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, "Must pass a grouped map udf") require(expr.dataType.isInstanceOf[StructType], - s"The returnType of the udf must be a ${StructType.simpleString}") + "The returnType of the udf must be a StructType") val groupingNamedExpressions = groupingExprs.map { case ne: NamedExpression => ne diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowUtils.scala index 1274abffaa116..93c8127681b3e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowUtils.scala @@ -47,8 +47,7 @@ object ArrowUtils { case DateType => new ArrowType.Date(DateUnit.DAY) case TimestampType => if (timeZoneId == null) { - throw new UnsupportedOperationException( - s"${TimestampType.simpleString} must supply timeZoneId parameter") + throw new UnsupportedOperationException("TimestampType must supply timeZoneId parameter") } else { new ArrowType.Timestamp(TimeUnit.MICROSECOND, timeZoneId) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala index c90328f7ad43f..4f44ae4fa1d71 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala @@ -98,7 +98,7 @@ private[orc] object OrcFilters { case DateType => PredicateLeaf.Type.DATE case TimestampType => PredicateLeaf.Type.TIMESTAMP case _: DecimalType => PredicateLeaf.Type.DECIMAL - case _ => throw new UnsupportedOperationException(s"DataType: ${dataType.simpleString}") + case _ => throw new UnsupportedOperationException(s"DataType: $dataType") } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala index 18decad3f62f0..c61be077d309f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala @@ -555,7 +555,7 @@ class SparkToParquetSchemaConverter( convertField(field.copy(dataType = udt.sqlType)) case _ => - throw new AnalysisException(s"Unsupported data type ${field.dataType.simpleString}") + throw new AnalysisException(s"Unsupported data type $field.dataType") } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala index f772a3336d6af..685d5841ab551 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala @@ -157,7 +157,7 @@ object StatFunctions extends Logging { cols.map(name => (name, df.schema.fields.find(_.name == name))).foreach { case (name, data) => require(data.nonEmpty, s"Couldn't find column with name $name") require(data.get.dataType.isInstanceOf[NumericType], s"Currently $functionName calculation " + - s"for columns with dataType ${data.get.dataType.simpleString} not supported.") + s"for columns with dataType ${data.get.dataType} not supported.") } val columns = cols.map(n => Column(Cast(Column(n).expr, DoubleType))) df.select(columns: _*).queryExecution.toRdd.treeAggregate(new CovarianceCounter)( diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out index 827931d74138d..3d49323751a10 100644 --- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out @@ -120,7 +120,7 @@ select to_json(named_struct('a', 1, 'b', 2), map('mode', 1)) struct<> -- !query 11 output org.apache.spark.sql.AnalysisException -A type of keys and values in map() must be string, but got map;; line 1 pos 7 +A type of keys and values in map() must be string, but got MapType(StringType,IntegerType,false);; line 1 pos 7 -- !query 12 @@ -216,7 +216,7 @@ select from_json('{"a":1}', 'a INT', map('mode', 1)) struct<> -- !query 20 output org.apache.spark.sql.AnalysisException -A type of keys and values in map() must be string, but got map;; line 1 pos 7 +A type of keys and values in map() must be string, but got MapType(StringType,IntegerType,false);; line 1 pos 7 -- !query 21 diff --git a/sql/core/src/test/resources/sql-tests/results/literals.sql.out b/sql/core/src/test/resources/sql-tests/results/literals.sql.out index 7f301614523b2..b8c91dc8b59a4 100644 --- a/sql/core/src/test/resources/sql-tests/results/literals.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/literals.sql.out @@ -147,7 +147,7 @@ struct<> -- !query 15 output org.apache.spark.sql.catalyst.parser.ParseException -decimal can only support precision up to 38 +DecimalType can only support precision up to 38 == SQL == select 1234567890123456789012345678901234567890 @@ -159,7 +159,7 @@ struct<> -- !query 16 output org.apache.spark.sql.catalyst.parser.ParseException -decimal can only support precision up to 38 +DecimalType can only support precision up to 38 == SQL == select 1234567890123456789012345678901234567890.0 @@ -379,7 +379,7 @@ struct<> -- !query 39 output org.apache.spark.sql.catalyst.parser.ParseException -decimal can only support precision up to 38(line 1, pos 7) +DecimalType can only support precision up to 38(line 1, pos 7) == SQL == select 1.20E-38BD diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala index 368e52cfbda9c..9d3dfae348beb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala @@ -430,9 +430,9 @@ class ParquetSchemaSuite extends ParquetSchemaTest { val col = spark.read.parquet(file).schema.fields.filter(_.name.equals("a")) assert(col.length == 1) if (col(0).dataType == StringType) { - assert(errMsg.contains("Column: [a], Expected: int, Found: BINARY")) + assert(errMsg.contains("Column: [a], Expected: IntegerType, Found: BINARY")) } else { - assert(errMsg.endsWith("Column: [a], Expected: string, Found: INT32")) + assert(errMsg.endsWith("Column: [a], Expected: StringType, Found: INT32")) } } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala index 40be4e8c1f5be..7dcaf170f9693 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala @@ -78,9 +78,9 @@ case class HiveTableScanExec( // Bind all partition key attribute references in the partition pruning predicate for later // evaluation. private lazy val boundPruningPred = partitionPruningPred.reduceLeftOption(And).map { pred => - require(pred.dataType == BooleanType, - s"Data type of predicate $pred must be ${BooleanType.simpleString} rather than " + - s"${pred.dataType.simpleString}.") + require( + pred.dataType == BooleanType, + s"Data type of predicate $pred must be BooleanType rather than ${pred.dataType}.") BindReferences.bindReference(pred, relation.partitionCols) } From eb6e9880397dbac8b0b9ebc0796150b6924fc566 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Mon, 9 Jul 2018 14:53:14 -0700 Subject: [PATCH 77/79] [SPARK-24759][SQL] No reordering keys for broadcast hash join ## What changes were proposed in this pull request? As the implementation of the broadcast hash join is independent of the input hash partitioning, reordering keys is not necessary. Thus, we solve this issue by simply removing the broadcast hash join from the reordering rule in EnsureRequirements. ## How was this patch tested? N/A Author: Xiao Li Closes #21728 from gatorsmile/cleanER. 
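For intuition, here is an illustrative spark-shell sketch (not part of this patch; the DataFrames and column names are made up) showing why key order cannot matter for a broadcast hash join:

```scala
import org.apache.spark.sql.functions.broadcast

// A large probe-side DataFrame and a small build-side DataFrame.
val large = spark.range(0L, 1000000L).selectExpr("id AS k1", "id % 10 AS k2", "id AS v")
val small = spark.range(0L, 100L).selectExpr("id AS k1", "id % 10 AS k2")

// Both queries plan a BroadcastHashJoinExec. The small side is broadcast to every task
// as a hashed relation, so the join imposes no distribution requirement on its children
// and its result is the same regardless of how the equi-join keys are ordered, which is
// why EnsureRequirements no longer reorders them for this join.
large.join(broadcast(small), Seq("k1", "k2")).explain()
large.join(broadcast(small), Seq("k2", "k1")).explain()
```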
--- .../spark/sql/execution/exchange/EnsureRequirements.scala | 7 ------- 1 file changed, 7 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index ad95879d86f42..d96ecbaa48029 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -279,13 +279,6 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { */ private def reorderJoinPredicates(plan: SparkPlan): SparkPlan = { plan match { - case BroadcastHashJoinExec(leftKeys, rightKeys, joinType, buildSide, condition, left, - right) => - val (reorderedLeftKeys, reorderedRightKeys) = - reorderJoinKeys(leftKeys, rightKeys, left.outputPartitioning, right.outputPartitioning) - BroadcastHashJoinExec(reorderedLeftKeys, reorderedRightKeys, joinType, buildSide, condition, - left, right) - case ShuffledHashJoinExec(leftKeys, rightKeys, joinType, buildSide, condition, left, right) => val (reorderedLeftKeys, reorderedRightKeys) = reorderJoinKeys(leftKeys, rightKeys, left.outputPartitioning, right.outputPartitioning) From 4984f1af7e48dab1ae08021a3b17c5ad6d47a87e Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 10 Jul 2018 13:54:04 +0800 Subject: [PATCH 78/79] [MINOR] Add Sphinx into dev/requirements.txt ## What changes were proposed in this pull request? Not a big deal but this PR adds `sphinx` into `dev/requirements.txt` since we found it needed - https://github.com/apache/spark-website/pull/122#discussion_r200896018 ## How was this patch tested? manually: ``` pip install -r requirements.txt ``` Author: hyukjinkwon Closes #21735 from HyukjinKwon/minor-dev. --- dev/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/requirements.txt b/dev/requirements.txt index 79782279f8fbd..fa833ab96b8e7 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -2,3 +2,4 @@ jira==1.0.3 PyGithub==1.26.0 Unidecode==0.04.19 pypandoc==1.3.3 +sphinx From a289009567c1566a1df4bcdfdf0111e82ae3d81d Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Tue, 10 Jul 2018 15:58:14 +0800 Subject: [PATCH 79/79] [SPARK-24706][SQL] ByteType and ShortType support pushdown to parquet ## What changes were proposed in this pull request? `ByteType` and `ShortType` support pushdown to parquet data source. [Benchmark result](https://issues.apache.org/jira/browse/SPARK-24706?focusedCommentId=16528878&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-16528878). ## How was this patch tested? unit tests Author: Yuming Wang Closes #21682 from wangyum/SPARK-24706. 
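As a usage illustration (a hedged sketch, not part of this patch; the path and column name are invented), the new conversions can be observed from a spark-shell session:

```scala
import spark.implicits._

// Write a single tinyint column and read it back with a comparison filter.
val path = "/tmp/spark-24706-tinyint"  // hypothetical location
(1 to 100).map(_.toByte).toDF("b").write.mode("overwrite").parquet(path)

val q = spark.read.parquet(path).filter($"b" > 10.toByte)
// With ByteType handled by ParquetFilters, the predicate is expected to show up under
// PushedFilters in the physical plan (e.g. GreaterThan(b,10)) rather than being applied
// only after the rows have been materialized.
q.explain()
q.show(5)
```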
--- .../FilterPushdownBenchmark-results.txt | 32 +++++------ .../datasources/parquet/ParquetFilters.scala | 34 +++++++---- .../parquet/ParquetFilterSuite.scala | 56 +++++++++++++++++++ 3 files changed, 94 insertions(+), 28 deletions(-) diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt index 29fe4345d69da..110669b69a00d 100644 --- a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt +++ b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt @@ -542,39 +542,39 @@ Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz Select 1 tinyint row (value = CAST(63 AS tinyint)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 3726 / 3775 4.2 236.9 1.0X -Parquet Vectorized (Pushdown) 3741 / 3789 4.2 237.9 1.0X -Native ORC Vectorized 2793 / 2909 5.6 177.6 1.3X -Native ORC Vectorized (Pushdown) 530 / 561 29.7 33.7 7.0X +Parquet Vectorized 3461 / 3997 4.5 220.1 1.0X +Parquet Vectorized (Pushdown) 270 / 315 58.4 17.1 12.8X +Native ORC Vectorized 4107 / 5372 3.8 261.1 0.8X +Native ORC Vectorized (Pushdown) 778 / 1553 20.2 49.5 4.4X Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz Select 10% tinyint rows (value < CAST(12 AS tinyint)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 4385 / 4406 3.6 278.8 1.0X -Parquet Vectorized (Pushdown) 4398 / 4454 3.6 279.6 1.0X -Native ORC Vectorized 3420 / 3501 4.6 217.4 1.3X -Native ORC Vectorized (Pushdown) 1395 / 1432 11.3 88.7 3.1X +Parquet Vectorized 4771 / 6655 3.3 303.3 1.0X +Parquet Vectorized (Pushdown) 1322 / 1606 11.9 84.0 3.6X +Native ORC Vectorized 4437 / 4572 3.5 282.1 1.1X +Native ORC Vectorized (Pushdown) 1781 / 1976 8.8 113.2 2.7X Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz Select 50% tinyint rows (value < CAST(63 AS tinyint)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7307 / 7394 2.2 464.6 1.0X -Parquet Vectorized (Pushdown) 7411 / 7461 2.1 471.2 1.0X -Native ORC Vectorized 6501 / 7814 2.4 413.4 1.1X -Native ORC Vectorized (Pushdown) 7341 / 8637 2.1 466.7 1.0X +Parquet Vectorized 7433 / 7752 2.1 472.6 1.0X +Parquet Vectorized (Pushdown) 5863 / 5913 2.7 372.8 1.3X +Native ORC Vectorized 7986 / 8084 2.0 507.7 0.9X +Native ORC Vectorized (Pushdown) 6522 / 6608 2.4 414.6 1.1X Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz Select 90% tinyint rows (value < CAST(114 AS tinyint)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 11886 / 13122 1.3 755.7 1.0X -Parquet Vectorized (Pushdown) 12557 / 14173 1.3 798.4 0.9X -Native ORC Vectorized 10758 / 11971 1.5 684.0 1.1X -Native ORC Vectorized (Pushdown) 10564 / 10713 1.5 671.6 1.1X +Parquet Vectorized 11190 / 11519 1.4 711.4 1.0X +Parquet Vectorized (Pushdown) 10861 / 11206 1.4 690.5 1.0X +Native ORC Vectorized 11622 / 12196 1.4 738.9 1.0X +Native ORC Vectorized (Pushdown) 11377 / 11654 1.4 723.3 1.0X diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index 4827f706e6016..4c9b940db2b30 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -45,6 +45,8 @@ private[parquet] class ParquetFilters(pushDownDate: Boolean, pushDownStartWith: decimalMetadata: DecimalMetadata) private val ParquetBooleanType = ParquetSchemaType(null, BOOLEAN, null) + private val ParquetByteType = ParquetSchemaType(INT_8, INT32, null) + private val ParquetShortType = ParquetSchemaType(INT_16, INT32, null) private val ParquetIntegerType = ParquetSchemaType(null, INT32, null) private val ParquetLongType = ParquetSchemaType(null, INT64, null) private val ParquetFloatType = ParquetSchemaType(null, FLOAT, null) @@ -60,8 +62,10 @@ private[parquet] class ParquetFilters(pushDownDate: Boolean, pushDownStartWith: private val makeEq: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { case ParquetBooleanType => (n: String, v: Any) => FilterApi.eq(booleanColumn(n), v.asInstanceOf[java.lang.Boolean]) - case ParquetIntegerType => - (n: String, v: Any) => FilterApi.eq(intColumn(n), v.asInstanceOf[Integer]) + case ParquetByteType | ParquetShortType | ParquetIntegerType => + (n: String, v: Any) => FilterApi.eq( + intColumn(n), + Option(v).map(_.asInstanceOf[Number].intValue.asInstanceOf[Integer]).orNull) case ParquetLongType => (n: String, v: Any) => FilterApi.eq(longColumn(n), v.asInstanceOf[java.lang.Long]) case ParquetFloatType => @@ -87,8 +91,10 @@ private[parquet] class ParquetFilters(pushDownDate: Boolean, pushDownStartWith: private val makeNotEq: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { case ParquetBooleanType => (n: String, v: Any) => FilterApi.notEq(booleanColumn(n), v.asInstanceOf[java.lang.Boolean]) - case ParquetIntegerType => - (n: String, v: Any) => FilterApi.notEq(intColumn(n), v.asInstanceOf[Integer]) + case ParquetByteType | ParquetShortType | ParquetIntegerType => + (n: String, v: Any) => FilterApi.notEq( + intColumn(n), + Option(v).map(_.asInstanceOf[Number].intValue.asInstanceOf[Integer]).orNull) case ParquetLongType => (n: String, v: Any) => FilterApi.notEq(longColumn(n), v.asInstanceOf[java.lang.Long]) case ParquetFloatType => @@ -111,8 +117,9 @@ private[parquet] class ParquetFilters(pushDownDate: Boolean, pushDownStartWith: } private val makeLt: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { - case ParquetIntegerType => - (n: String, v: Any) => FilterApi.lt(intColumn(n), v.asInstanceOf[Integer]) + case ParquetByteType | ParquetShortType | ParquetIntegerType => + (n: String, v: Any) => + FilterApi.lt(intColumn(n), v.asInstanceOf[Number].intValue.asInstanceOf[Integer]) case ParquetLongType => (n: String, v: Any) => FilterApi.lt(longColumn(n), v.asInstanceOf[java.lang.Long]) case ParquetFloatType => @@ -132,8 +139,9 @@ private[parquet] class ParquetFilters(pushDownDate: Boolean, pushDownStartWith: } private val makeLtEq: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { - case ParquetIntegerType => - (n: String, v: Any) => FilterApi.ltEq(intColumn(n), v.asInstanceOf[Integer]) + case ParquetByteType | ParquetShortType | ParquetIntegerType => + (n: String, v: Any) => + FilterApi.ltEq(intColumn(n), 
v.asInstanceOf[Number].intValue.asInstanceOf[Integer]) case ParquetLongType => (n: String, v: Any) => FilterApi.ltEq(longColumn(n), v.asInstanceOf[java.lang.Long]) case ParquetFloatType => @@ -153,8 +161,9 @@ private[parquet] class ParquetFilters(pushDownDate: Boolean, pushDownStartWith: } private val makeGt: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { - case ParquetIntegerType => - (n: String, v: Any) => FilterApi.gt(intColumn(n), v.asInstanceOf[Integer]) + case ParquetByteType | ParquetShortType | ParquetIntegerType => + (n: String, v: Any) => + FilterApi.gt(intColumn(n), v.asInstanceOf[Number].intValue.asInstanceOf[Integer]) case ParquetLongType => (n: String, v: Any) => FilterApi.gt(longColumn(n), v.asInstanceOf[java.lang.Long]) case ParquetFloatType => @@ -174,8 +183,9 @@ private[parquet] class ParquetFilters(pushDownDate: Boolean, pushDownStartWith: } private val makeGtEq: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { - case ParquetIntegerType => - (n: String, v: Any) => FilterApi.gtEq(intColumn(n), v.asInstanceOf[Integer]) + case ParquetByteType | ParquetShortType | ParquetIntegerType => + (n: String, v: Any) => + FilterApi.gtEq(intColumn(n), v.asInstanceOf[Number].intValue.asInstanceOf[Integer]) case ParquetLongType => (n: String, v: Any) => FilterApi.gtEq(longColumn(n), v.asInstanceOf[java.lang.Long]) case ParquetFloatType => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index f2c0bda256239..067d2fea14fd7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -179,6 +179,62 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex } } + test("filter pushdown - tinyint") { + withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i.toByte)))) { implicit df => + assert(df.schema.head.dataType === ByteType) + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) + + checkFilterPredicate('_1 === 1.toByte, classOf[Eq[_]], 1) + checkFilterPredicate('_1 <=> 1.toByte, classOf[Eq[_]], 1) + checkFilterPredicate('_1 =!= 1.toByte, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) + + checkFilterPredicate('_1 < 2.toByte, classOf[Lt[_]], 1) + checkFilterPredicate('_1 > 3.toByte, classOf[Gt[_]], 4) + checkFilterPredicate('_1 <= 1.toByte, classOf[LtEq[_]], 1) + checkFilterPredicate('_1 >= 4.toByte, classOf[GtEq[_]], 4) + + checkFilterPredicate(Literal(1.toByte) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(1.toByte) <=> '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(2.toByte) > '_1, classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3.toByte) < '_1, classOf[Gt[_]], 4) + checkFilterPredicate(Literal(1.toByte) >= '_1, classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4.toByte) <= '_1, classOf[GtEq[_]], 4) + + checkFilterPredicate(!('_1 < 4.toByte), classOf[GtEq[_]], 4) + checkFilterPredicate('_1 < 2.toByte || '_1 > 3.toByte, + classOf[Operators.Or], Seq(Row(1), Row(4))) + } + } + + test("filter pushdown - smallint") { + withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i.toShort)))) { implicit df => + assert(df.schema.head.dataType === ShortType) + 
checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) + + checkFilterPredicate('_1 === 1.toShort, classOf[Eq[_]], 1) + checkFilterPredicate('_1 <=> 1.toShort, classOf[Eq[_]], 1) + checkFilterPredicate('_1 =!= 1.toShort, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) + + checkFilterPredicate('_1 < 2.toShort, classOf[Lt[_]], 1) + checkFilterPredicate('_1 > 3.toShort, classOf[Gt[_]], 4) + checkFilterPredicate('_1 <= 1.toShort, classOf[LtEq[_]], 1) + checkFilterPredicate('_1 >= 4.toShort, classOf[GtEq[_]], 4) + + checkFilterPredicate(Literal(1.toShort) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(1.toShort) <=> '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(2.toShort) > '_1, classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3.toShort) < '_1, classOf[Gt[_]], 4) + checkFilterPredicate(Literal(1.toShort) >= '_1, classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4.toShort) <= '_1, classOf[GtEq[_]], 4) + + checkFilterPredicate(!('_1 < 4.toShort), classOf[GtEq[_]], 4) + checkFilterPredicate('_1 < 2.toShort || '_1 > 3.toShort, + classOf[Operators.Or], Seq(Row(1), Row(4))) + } + } + test("filter pushdown - integer") { withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i)))) { implicit df => checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row])
    spark.yarn.dist.forceDownloadSchemes
    (none)
-    Comma-separated list of schemes for which files will be downloaded to the local disk prior to
+    Comma-separated list of schemes for which resources will be downloaded to the local disk prior to
     being added to YARN's distributed cache. For use in cases where the YARN service does not
-    support schemes that are supported by Spark, like http, https and ftp.
+    support schemes that are supported by Spark, like http, https and ftp, or jars required to be in the
+    local YARN client's classpath. Wildcard '*' is denoted to download resources for all the schemes.
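A hedged usage sketch for this property (not taken from any patch in this series; the URLs and jar names are illustrative):

```
# Download http/https resources on the client before adding them to YARN's distributed
# cache, for clusters whose YARN service cannot fetch those schemes itself.
spark-submit \
  --master yarn \
  --deploy-mode cluster \
  --conf spark.yarn.dist.forceDownloadSchemes=http,https \
  --jars https://repo.example.com/libs/extra-udfs.jar \
  my-app.jar

# With the wildcard, resources are downloaded locally first for every scheme.
spark-submit \
  --master yarn \
  --conf spark.yarn.dist.forceDownloadSchemes='*' \
  --jars hdfs:///shared/libs/extra-udfs.jar \
  my-app.jar
```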